# Working in Impala
## Step 1 is to connect to the correct host
Using impyla seems to mimic the impala-shell (CLI) pretty closely.

Some important points and equivalences to keep in mind:

* `impala-shell -i host:port` is equivalent to `connect(host=hostname,port=portnumber)`
* `cursor = connect(...).cursor()` yields access to the CLI via:
    - `cursor.execute(query)` followed by
    - `cursor.fetchall()` to return the results
* note that the queries don't have to end in a semicolon.    


In [None]:
# connect to the correct host
#myhostname = 'ac00h1pdata02.opr.statefarm.org' #hostname of G-building
#myhostname = '
#portnum = 21050 # default port number ??

In [22]:
# connect to the correct host
myhostname = 'da74wbdn02.opr.statefarm.org' #hostname of a PHD datanode
#myhostname = '' 
#you may use any of the datanodes from da74wbdn01 to da74wbdn27
portnum = 21050 # default port number ??

In [23]:
from impala.dbapi import connect
#conn = connect(host=myhostname, port=portnum,use_kerberos=True)
conn = connect(host=myhostname, port=portnum,auth_mechanism="GSSAPI")
cur = conn.cursor()

In [24]:
cur.execute('SHOW databases')
cur.fetchall()

[('a224_db',),
 ('claiment',),
 ('claimsff',),
 ('d0xy_db',),
 ('default',),
 ('du3z_db',),
 ('fdwcfpoc',),
 ('fm61_db',),
 ('hrsepara',),
 ('kesj_db',),
 ('lapsecan',),
 ('qepz_db',),
 ('sqqq_db',),
 ('totaloss',),
 ('vehrepat',)]

## Some utility functions 
1. list databases
    * type `cursor.execute("use dbasename")` to change to database
2. list tables
3. count lines in a table

In [4]:
# list databases
def list_databases(cursor):
    """
    List the databases visible through the given cursor.

    input: cursor -- an open cursor, e.g. connect(...).cursor()
    output: list of database names, one string per database
    """
    cursor.execute('SHOW databases')
    # each fetched row is a tuple; the database name is its first element
    return [row[0] for row in cursor.fetchall()]

# list tables
def list_tables(cursor):
    """
    List the tables of the currently active database.

    input: cursor -- an open cursor, e.g. connect(...).cursor()
    output: list of table names in the active database
    """
    cursor.execute('SHOW tables')
    # each fetched row is a tuple; the table name is its first element
    return [row[0] for row in cursor.fetchall()]

# count the number of lines in a given table
def count_lines(cursor,tablename):
    """
    Count the rows of a table.

    input: cursor -- an open cursor; tablename -- name of the table to count
    output: first column of the first row returned by the count query
    """
    count_query = "".join(("select count (*) from ", tablename))
    cursor.execute(count_query)
    first_row = cursor.fetchall()[0]
    return first_row[0]

In [6]:
# list the databases
list_databases(cur)

['a224_db',
 'claiment',
 'claimsff',
 'd0xy_db',
 'default',
 'du3z_db',
 'fm61_db',
 'hrsepara',
 'kesj_db',
 'lapsecan',
 'qepz_db',
 'sqqq_db',
 'vehrepat']

In [7]:
# change to desired database
cur.execute("use vehrepat")

In [8]:
count_lines(cur,'auto_est_sect')
#cur.execute("select count (*) from auto_est_table")
#cur.fetchall()

542

In [9]:
# fetch the table names of the active database, report how many there are, then display them
my_tables = list_tables(cur)
# parenthesised so the line runs both as a Python 2 print statement and a Python 3 call
print(len(my_tables))
my_tables

19


['auto_est_sect',
 'auto_est_sum',
 'auto_est_sum_old',
 'auto_est_team',
 'auto_est_user',
 'auto_rpr_fac',
 'detl',
 'est_party',
 'est_party_detl_rltn',
 'lbr_note',
 'los_est',
 'msg',
 'non_oem',
 'opt',
 'p3533eeb_detl',
 'rate',
 'tax',
 'ttl',
 'y1753caaestparty']

# open Auto_EST_SECT

### how to describe a table

In [10]:
cur.execute('Describe auto_est_sect')
cur.fetchall()

[('auto_est_sect_dim_id',
  'bigint',
  'inferred from: optional int64 AUTO_EST_SECT_DIM_ID'),
 ('src_sys_dlte_ind',
  'string',
  'inferred from: optional binary SRC_SYS_DLTE_IND (UTF8)'),
 ('est_rgn_cd', 'string', 'inferred from: optional binary EST_RGN_CD (UTF8)'),
 ('est_sect_cd',
  'string',
  'inferred from: optional binary EST_SECT_CD (UTF8)'),
 ('est_sect_nm',
  'string',
  'inferred from: optional binary EST_SECT_NM (UTF8)'),
 ('sect_mgr_sgnon_id',
  'string',
  'inferred from: optional binary SECT_MGR_SGNON_ID (UTF8)'),
 ('mod_tstmp', 'bigint', 'inferred from: optional int64 MOD_TSTMP'),
 ('dlte_ind', 'string', 'inferred from: optional binary DLTE_IND (UTF8)'),
 ('data_cntxt_cd',
  'string',
  'inferred from: optional binary DATA_CNTXT_CD (UTF8)'),
 ('fdw_rplc_ind',
  'string',
  'inferred from: optional binary FDW_RPLC_IND (UTF8)'),
 ('fdw_insrt_tstmp',
  'bigint',
  'inferred from: optional int64 FDW_INSRT_TSTMP'),
 ('fdw_rplc_tstmp', 'bigint', 'inferred from: optional int6

### run a sql query on a table

In [11]:
cur.execute('SELECT * FROM auto_est_sect LIMIT 10')
cur.fetchall()

[(1,
  'N',
  '01',
  '01',
  'Poulard IL',
  'CELP',
  1279417510399,
  '',
  'AEORG',
  'N',
  1290612612046,
  253402322399999,
  1279417510399,
  1419318338248),
 (2,
  'N',
  '01',
  '02',
  'DEVOSS',
  'BJOR',
  1089137851898,
  '',
  'AEORG',
  'N',
  1290612612046,
  253402322399999,
  1089137851898,
  253402322399999),
 (3,
  'N',
  '01',
  '03',
  'TUTTLE',
  'BOHV',
  1265214111619,
  'D',
  'AEORG',
  'N',
  1290612612046,
  253402322399999,
  1265214111619,
  253402322399999),
 (4,
  'N',
  '01',
  '04',
  'KONSTANTINOPOULOS',
  'CH7H',
  966369413226,
  'D',
  'AEORG',
  'N',
  1290612612046,
  253402322399999,
  966369413226,
  253402322399999),
 (5,
  'N',
  '01',
  '05',
  'CLIPPER',
  'BOQ2',
  908832198210,
  'D',
  'AEORG',
  'N',
  1290612612046,
  253402322399999,
  908832198210,
  253402322399999),
 (6,
  'N',
  '01',
  '06',
  'JOHN LOUIE',
  'C5FA',
  920436885499,
  'D',
  'AEORG',
  'N',
  1290612612046,
  253402322399999,
  920436885499,
  253402322399999),


## GETTING data as pandas DF

In [12]:
from impala.util import as_pandas
cur.execute('SELECT * FROM auto_est_sect LIMIT 2000')
df = as_pandas(cur)
type(df)

pandas.core.frame.DataFrame

In [13]:
df.head()

Unnamed: 0,auto_est_sect_dim_id,src_sys_dlte_ind,est_rgn_cd,est_sect_cd,est_sect_nm,sect_mgr_sgnon_id,mod_tstmp,dlte_ind,data_cntxt_cd,fdw_rplc_ind,fdw_insrt_tstmp,fdw_rplc_tstmp,src_insrt_tstmp,src_rplc_tstmp
0,1,N,1,1,Poulard IL,CELP,1279417510399,,AEORG,N,1290612612046,253402322399999,1279417510399,1419318338248
1,2,N,1,2,DEVOSS,BJOR,1089137851898,,AEORG,N,1290612612046,253402322399999,1089137851898,253402322399999
2,3,N,1,3,TUTTLE,BOHV,1265214111619,D,AEORG,N,1290612612046,253402322399999,1265214111619,253402322399999
3,4,N,1,4,KONSTANTINOPOULOS,CH7H,966369413226,D,AEORG,N,1290612612046,253402322399999,966369413226,253402322399999
4,5,N,1,5,CLIPPER,BOQ2,908832198210,D,AEORG,N,1290612612046,253402322399999,908832198210,253402322399999


In [14]:
import pandas as pd
# import pandas
%matplotlib inline
import matplotlib.pyplot as plt

In [15]:
# Example of converting epoch times to timestamps: every value of the column is
# multiplied by epoch_factor before being handed to pd.to_datetime.
# NOTE(review): the factor presumably scales milliseconds up to the nanoseconds that
# pd.to_datetime assumes for bare integers -- confirm against the column's units.
epoch_factor = 1000000 #example of how to convert Epoch time to timestamps 
# note this doesn't work for end of time values -- probably need to write a function (see below)
df.src_insrt_tstmp.apply(lambda x: pd.to_datetime(x*epoch_factor)) 

0     2010-07-18 01:45:10.399
1     2004-07-06 18:17:31.898
2     2010-02-03 16:21:51.619
3     2000-08-15 19:56:53.226
4     1998-10-19 21:23:18.210
5     1999-03-03 04:54:45.499
6     2003-03-03 20:56:32.107
7     2002-06-15 01:31:20.333
8     2004-12-01 22:34:38.763
9     2010-01-29 17:04:58.812
10    2009-10-12 18:33:52.352
11    2010-03-18 19:11:28.127
12    2010-02-04 19:10:31.894
13    2010-03-18 19:16:05.775
14    2010-03-18 19:19:52.115
15    2010-03-18 19:01:57.985
16    2005-11-17 14:11:26.743
17    2009-12-21 17:57:49.760
18    2010-01-29 15:04:54.537
19    2009-07-15 15:38:45.718
20    2010-02-01 14:56:27.597
21    2010-02-04 16:14:28.874
22    2010-11-09 03:05:17.724
23    2003-08-18 19:07:37.036
24    2000-04-05 19:22:05.939
25    2003-03-04 22:02:04.072
26    2003-03-04 22:02:36.241
27    2003-08-18 19:00:40.451
28    2010-01-26 17:26:55.923
29    2010-02-04 19:14:52.915
                ...          
512   2015-08-01 06:00:59.797
513   2015-09-11 06:06:01.001
514   2015

In [None]:
def convert_epoch_to_datetime(time_in, epoch_factor = 1000000):
    """
    Convert an epoch value to a pandas Timestamp.

    input: time_in -- epoch time as a number or a numeric string
           epoch_factor -- multiplier applied to time_in before conversion
                           (pd.to_datetime reads bare integers as nanoseconds)
    output: pandas Timestamp, or pd.NaT when time_in cannot be parsed or the
            scaled value lies outside the range a Timestamp can represent
            (e.g. "end of time" placeholder values)
    """
    # deal with possibility of a string
    try:
        scaled = int(time_in) * epoch_factor
    except (TypeError, ValueError, OverflowError):
        return pd.NaT
    # values beyond the Timestamp bounds cannot be stored as 64-bit nanoseconds
    if not pd.Timestamp.min.value <= scaled <= pd.Timestamp.max.value:
        return pd.NaT
    return pd.to_datetime(scaled)

In [16]:
count_lines(cur,'auto_est_sum')

515341595

## Sample Query

In [None]:
cur.execute('select est_rgn_cd, count(*) from auto_est_team group by est_rgn_cd')
cur.fetchall()

In [17]:
def return_feature_value_counts(cursor, table_name, column_name):
    """
    Count how often each distinct value of a column occurs in a table.

    input: cursor -- an open cursor
           table_name -- table to query
           column_name -- column to group by
    output: the rows fetched for the group-by query, as returned by fetchall()
    """
    query_parts = ["select ", column_name, ", count (*) from ", table_name,
                   " group by ", column_name]
    cursor.execute("".join(query_parts))
    return cursor.fetchall()

In [18]:
return_feature_value_counts(cur,'auto_est_team','est_rgn_cd')

[('20', 142),
 ('11', 78),
 ('02', 51),
 ('05', 133),
 ('28', 60),
 ('16', 68),
 ('13', 61),
 ('19', 153),
 ('08', 297),
 ('27', 160),
 ('22', 153),
 ('21', 78),
 ('10', 29),
 ('24', 98),
 ('03', 23),
 ('15', 60),
 ('06', 244),
 ('18', 70),
 ('01', 166),
 ('12', 53),
 ('04', 46),
 ('09', 59),
 ('17', 72),
 ('25', 224),
 ('23', 92),
 ('26', 195),
 ('14', 118),
 ('07', 152)]

## Close out a connection

In [None]:
# release the cursor first, then the connection it was created from
cur.close()
conn.close()

#### following https://github.com/cloudera/impyla/tree/master/examples/logr
## 2. create some fake data for classification

In [None]:
import numpy as np
import sklearn.preprocessing
rows=10000
cols=2

In [None]:
# draw one Gaussian cluster per class; floor division keeps the sample counts
# integral on Python 3 as well as Python 2
class0 = np.random.multivariate_normal([2,2], np.diag([1,1]),rows//2)
class1 = np.random.multivariate_normal([-2,-2], np.diag([1,1]),rows - rows//2)

In [None]:
# append a label column (0 for class0, 1 for class1) and stack both classes;
# floor division keeps the shapes integral on Python 3 as well as Python 2
data = np.vstack((np.hstack((class0, np.zeros((rows // 2, 1)))),
                  np.hstack((class1, np.ones((rows - rows // 2, 1))))))
# shuffle the rows so the two classes are interleaved
data = data[np.random.permutation(rows)]
# standardise the two feature columns, then re-attach the untouched label column
scaled_obs = sklearn.preprocessing.StandardScaler().fit_transform(data[:, :2])
data = np.hstack((scaled_obs, data[:, 2].reshape(rows, 1)))

Perform in-memory logistic regression with scikit-learn.

In [None]:
import sklearn.linear_model
inmemory_estimator = sklearn.linear_model.LogisticRegression(fit_intercept=False)
inmemory_estimator.fit(data[:, :cols], data[:, cols])

## 3. Push the data into Impala


In [None]:
# create the table to hold the data
cur.execute("CREATE TABLE test_logr (%s, label BOOLEAN)" % ', '.join(['feat%i DOUBLE' % i for i in range(cols)]))
# push the data to Impala with INSERT statements in batches of 1000 rows
batch_size = 1000
data_strings = []
for i in range(rows):
    # one "(feat0, feat1, ..., true|false)" tuple per row; the last column is the label
    row_string = '(' + ', '.join([str(val) for val in data[i, :-1]]) + ', %s' % ('true' if data[i, -1] > 0 else 'false') + ')'
    data_strings.append(row_string)
    # flush on a full batch, and on the final row so a partial last batch is not dropped
    if len(data_strings) == batch_size or i == rows - 1:
        data_query = 'INSERT INTO test_logr VALUES %s' % ', '.join(data_strings)
        # use the notebook's cursor `cur`; the name `cursor` is not defined at this level
        cur.execute(data_query)
        data_strings = []

## 3b. Let me try to create the table from a dataset already in HDFS

In [None]:
!hdfs dfs -ls /user/kesj/sample_data

In [None]:
my_infile = '/user/kesj/sample_data/impyla_census_raw.csv'

In [None]:
# DDL for an external, tab-delimited text table over data that already sits in HDFS.
# NOTE(review): LOCATION names a single .csv file; external-table locations are
# normally directories -- confirm this path is accepted.
# NOTE(review): the file is named .csv but fields are declared tab-terminated, and the
# '\t' in this non-raw string reaches the server as a literal tab character -- verify.
create_table_query = """
    CREATE EXTERNAL TABLE IF NOT EXISTS census_text (age INT, workclass STRING,
            final_weight INT, education STRING, education_num INT,
            marital_status STRING, occupation STRING, relationship STRING,
            race STRING, sex STRING, hours_per_week INT, native_country STRING,
            income STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    LOCATION '/user/kesj/sample_data/impyla_census_raw.csv'
"""

In [None]:
cur.execute(create_table_query)

In [None]:
from impala.util import as_pandas
#cur.execute('SELECT * FROM y1753caale')
#cur.execute('select * FROM p3533eeb_detl')
# pull the whole table into a DataFrame and report its row count
cur.execute('select * FROM y1753caaestparty')
le = as_pandas(cur)
# parenthesised so the line runs both as a Python 2 print statement and a Python 3 call
print(len(le))


In [None]:
le.head()

In [None]:
#sq1 = "'select * from "+myTables[1][0]+"'"
#cur.execute(sq1)
cur.execute('select * from y1753caadetl')
detl = as_pandas(cur)

In [None]:
# report (rows, columns); the attribute avoids relying on a bare `shape` name being in scope
print(detl.shape)

In [None]:
len(le.groupby('los_est_dim_id'))

In [None]:
le.los_est_dim_id.head()

In [None]:
ledi0 = 85051447
sum(detl['los_est_dim_id']==ledi0)
#detl[['los_est_dim_id','est_ver_num','prt_clas_cd']]

In [None]:
detl[detl['los_est_dim_id']==ledi0][['est_ver_num','oem_prt_num','line_desc_txt','prt_clas_cd','prt_qty_cnt','price_amt','ver_num',]]

In [None]:
# number of `le` rows for the selected los_est_dim_id, then a few of their columns
# (print is parenthesised so it runs on both Python 2 and Python 3)
print(len(le[le['los_est_dim_id']==ledi0]))
le[le['los_est_dim_id']==ledi0][['vndr_veh_cd','los_tstmp','prmry_poi_cd','scdy_poi_cd','fdw_rplc_tstmp']]

In [None]:
le.columns.values

In [None]:
len(detl.prt_clas_cd.unique())

In [None]:
y17533caaPRTcounts = detl.prt_clas_cd.value_counts()
y17533caaPRTcounts.tail(-1).hist(color='steelblue',bins=50)

In [None]:
detl[detl.prt_clas_cd == '88JHR']['line_desc_txt']

In [None]:
le[le.los_est_dim_id == ledi0]['vndr_cd']

In [None]:
# Create an external, partitioned and bucketed page_view table.
# NOTE(review): the statement ends in ';' and uses CLUSTERED BY ... INTO 32 BUCKETS, which
# reads like Hive DDL -- confirm Impala accepts it through this cursor.
# NOTE(review): the terminators are the plain characters '1', '2' and '3'; if control
# characters were intended, the escapes have been lost -- verify.
cur.execute("CREATE external TABLE page_view(viewTime INT, userid BIGINT,\
                page_url STRING, referrer_url STRING,\
                ip STRING COMMENT 'IP Address of the User')\
COMMENT 'This is the page view table'\
PARTITIONED BY(dt STRING, country STRING)\
CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS\
ROW FORMAT DELIMITED\
        FIELDS TERMINATED BY '1'\
        COLLECTION ITEMS TERMINATED BY '2'\
        MAP KEYS TERMINATED BY '3'\
STORED AS SEQUENCEFILE;")

In [None]:
from pandas import *
from StringIO import StringIO

s = "sshpass -f myfilewithpassword ssh myusername@myhostname \"hive -S -e \\\"" \
"set hive.cli.print.header=true;" \
"SELECT * from mytable;\\\"\""

t = !$s
df = read_csv(StringIO(t.n), sep='\t')

In [None]:
from pandas import *
from StringIO import StringIO

s = """
sshpass -f myfilewithpassword ssh myusername@myhostname \"
hive -S -e \\\"
set hive.cli.print.header=true;
SELECT * from mytable;
\\\"\"
"""

t = !$s
df = read_csv(StringIO(t.n), sep='\t'

In [None]:
hdfsfiles = !hdfs dfs -ls 

In [None]:
hdfsfiles