# Working in Impala
## Step 1 is to connect to the correct host
Using impyla seems to mimic the impala-shell (CLI) pretty closely.

Some important points and equivalencies to keep in mind:

* `impala-shell -i host:port` is equivalent to `connect(host=hostname,port=portnumber)`
* `cursor = connect(...).cursor()` yields access to the CLI via:
    - `cursor.execute(query)` followed by
    - `cursor.fetchall()` to return the results
* Note that the queries don't have to end in a semicolon.


In [1]:
# Connection settings used by the connect() call further down.
myhostname = 'ac00h1pdata02.opr.statefarm.org'  # hostname of G-building
# myhostname = ''  # slot for an alternate host; left disabled so the cell parses
portnum = 21050  # default port number ?? (TODO confirm)

In [1]:
# Impala connection target.
# The host is a PHD datanode; any of the datanodes from da74wbdn01 to
# da74wbdn27 may be used.  (The G-building host is
# 'ac00h1pdata02.opr.statefarm.org'.)
myhostname = "da74wbdn02.opr.statefarm.org"
# Default port number ?? (not confirmed in this notebook)
portnum = 21050

In [3]:
from impala.dbapi import connect

# Open the connection with GSSAPI authentication and keep one cursor
# for the queries in the rest of the notebook.
# (Earlier variant, kept for reference: connect(..., use_kerberos=True).)
conn = connect(
    host=myhostname,
    port=portnum,
    auth_mechanism="GSSAPI",
)
cur = conn.cursor()

In [4]:
# Ask the server for its databases; the fetched rows are the cell output.
cur.execute('SHOW databases')
cur.fetchall()

[('a224_db',),
 ('atlasid',),
 ('claiment',),
 ('claimsff',),
 ('d0xy_db',),
 ('dc7n_db',),
 ('dddh_db',),
 ('default',),
 ('du3z_db',),
 ('fdwcfpoc',),
 ('fm61_db',),
 ('hrsepara',),
 ('ke63_db',),
 ('kesj_db',),
 ('kesq_db',),
 ('kjmw_db',),
 ('kk1k_db',),
 ('knqr_db',),
 ('krsw_db',),
 ('lapsecan',),
 ('mkir_db',),
 ('qepz_db',),
 ('rm6k_db',),
 ('sqqq_db',),
 ('totaloss',),
 ('umwd_db',),
 ('vehrepat',)]

In [4]:
#cur.fetchall()

[('default',), ('kesj_db',), ('lapsecan',), ('vehrepat',)]

## Some utility functions 
1. list databases
    * run `cursor.execute("use dbasename")` to switch to a database
2. list tables
3. count lines in a table

In [5]:
# list databases
def listDatabases(cursor):
    """
    Return the databases reported by 'SHOW databases'.

    input: an open cursor (any object with execute() and fetchall())
    output: the rows fetched after running 'SHOW databases'
    """
    cursor.execute('SHOW databases')
    return cursor.fetchall()

# list tables
def listTables(cursor):
    """
    Run 'SHOW tables' on the cursor and hand back the fetched rows.

    input: an open cursor (needs execute() and fetchall())
    output: rows for the tables in the active database
    """
    cursor.execute('SHOW tables')
    return cursor.fetchall()
# count the number of lines in a given table
def countLines(cursor,tablename):
    """
    Count the rows of a table.

    input: an open cursor (needs execute() and fetchone()) and the
           name of the table to count
    output: the value returned by 'select count(*)', or None when the
            query yields no row

    The table name is concatenated into the SQL text, so only pass
    trusted identifiers.
    """
    # Send the statement bare: wrapping it in single quotes would turn
    # the whole query into a string literal.
    cursor.execute("select count(*) from " + tablename)
    row = cursor.fetchone()
    return row[0] if row else None

In [9]:
# Close the cursor.
# NOTE(review): the cells after this one still call `cur`, and their execution
# counts are out of order — presumably this cell was not run in sequence; confirm.
cur.close()

In [15]:
# list the databases with the listDatabases helper; the returned rows are the cell output
listDatabases(cur)

[('default',), ('lapsecan',), ('vehrepat',)]

In [6]:
# change to desired database: totaloss is the active choice,
# vehrepat is kept as a commented-out alternative
#cur.execute("use vehrepat")
cur.execute("use totaloss")

In [8]:
# Fetch the table rows via listTables, print how many there are, then display them.
myTables=listTables(cur)
print(len(myTables))
myTables

26


[('cclaims',),
 ('clm',),
 ('clm05',),
 ('clm_test',),
 ('clm_veh',),
 ('clm_veh1',),
 ('clmveh1',),
 ('cmplt_task',),
 ('cmplt_task2',),
 ('completed_task_long',),
 ('completed_task_small',),
 ('completed_task_txt',),
 ('dclaims',),
 ('dtl_taskdtl',),
 ('ecs_orgzn',),
 ('ecs_orgzn_str_flat_cchd',),
 ('rpr_stts',),
 ('task',),
 ('task2',),
 ('taskclaimkey2',),
 ('taskdtl',),
 ('taskdtl05',),
 ('taskkey_task05',),
 ('tl_taskdtl',),
 ('tl_taskdtl05',),
 ('ttl_los_adj6b',)]

In [9]:
# Try to switch to sr_grids; the recorded output for this cell is an
# AuthorizationException (no privileges on sr_grids.* for this user).
cur.execute("use sr_grids")

HiveServer2Error: AuthorizationException: User 'kesj@OPR.STATEFARM.ORG' does not have privileges to access: sr_grids.*


## See if I can create a new table from within notebook

In [18]:
# Switch to the default database before attempting to create tables.
cur.execute("use default")

#### following https://github.com/cloudera/impyla/tree/master/examples/logr
## 2. create some fake data for classification

In [19]:
import numpy as np
import sklearn.preprocessing
# Size of the synthetic data set: number of samples and number of feature columns.
rows=10000
cols=2

In [20]:
# Draw one Gaussian cluster per class, both with identity covariance.
# Floor division keeps the sample counts integral (the size argument must be
# an int); `rows - rows // 2` gives class1 the extra sample when rows is odd.
class0 = np.random.multivariate_normal([2,2], np.diag([1,1]), rows // 2)
class1 = np.random.multivariate_normal([-2,-2], np.diag([1,1]), rows - rows // 2)

In [21]:
# Append the label column (0 for class0, 1 for class1) and stack both classes.
# Floor division keeps the shape tuples integral and matches the sample
# counts used when class0/class1 were drawn.
data = np.vstack((np.hstack((class0, np.zeros((rows // 2, 1)))),
                  np.hstack((class1, np.ones((rows - rows // 2, 1))))))
# Shuffle the rows so the classes are interleaved.
data = data[np.random.permutation(rows)]
# Standardise the two feature columns, then re-attach the untouched label column.
scaled_obs = sklearn.preprocessing.StandardScaler().fit_transform(data[:, :2])
data = np.hstack((scaled_obs, data[:, 2].reshape(rows, 1)))

Perform in-memory logistic regression with scikit-learn.

In [22]:
import sklearn.linear_model
# Fit a logistic regression without an intercept term: the first `cols`
# columns of `data` are the features and column `cols` holds the label.
inmemory_estimator = sklearn.linear_model.LogisticRegression(fit_intercept=False)
inmemory_estimator.fit(data[:, :cols], data[:, cols])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

## 3. Push the data into Impala


In [24]:
# create the table to hold the data: one DOUBLE column per feature plus a BOOLEAN label
cur.execute("CREATE TABLE test_logr (%s, label BOOLEAN)" % ', '.join(['feat%i DOUBLE' % i for i in range(cols)]))
# push the data to Impala with INSERT statements in batches of 1000 rows
data_strings = []
for i in range(rows):
    # all columns but the last are features; the last one is the 0/1 label
    row_string = '(' + ', '.join([str(val) for val in data[i, :-1]]) + ', %s' % ('true' if data[i, -1] > 0 else 'false') + ')'
    data_strings.append(row_string)
    # flush on every full batch, and on the last row so that a partial
    # final batch is not silently dropped
    if (i + 1) % 1000 == 0 or i + 1 == rows:
        data_query = 'INSERT INTO test_logr VALUES %s' % ', '.join(data_strings)
        # the notebook's cursor is named `cur`; no top-level `cursor` is defined in the visible cells
        cur.execute(data_query)
        data_strings = []

HiveServer2Error: AuthorizationException: User 'kesj@OPR.STATEFARM.ORG' does not have privileges to execute 'CREATE' on: default.test_logr

## 3b. Let me try to create the table from a dataset already in HDFS

In [25]:
# List the contents of the sample_data directory in HDFS (shell escape).
!hdfs dfs -ls /user/kesj/sample_data

Found 9 items
-rw-rwx---+  3 kesj kesj    1236995 2015-01-14 12:10 /user/kesj/sample_data/2701.txt
-rw-rwx---+  3 kesj kesj       9130 2015-01-14 12:10 /user/kesj/sample_data/DETL.avsc
-rw-rw----+  3 kesj kesj    3370708 2015-03-18 14:03 /user/kesj/sample_data/impyla_census_raw.csv
-rw-rwx---+  3 kesj kesj 1698410496 2015-01-14 12:13 /user/kesj/sample_data/part-m-00000.avro
drwxrwx---+  - kesj kesj          0 2014-12-12 15:10 /user/kesj/sample_data/pytanic
drwxrwxr-x+  - kesj kesj          0 2015-01-14 12:13 /user/kesj/sample_data/tab1
drwxrwxr-x+  - kesj kesj          0 2015-01-14 12:13 /user/kesj/sample_data/tab2
-rw-rwx---+  3 kesj kesj        112 2015-01-14 12:13 /user/kesj/sample_data/tiny_graph.txt
drwxrwxr-x+  - kesj kesj          0 2014-12-11 15:44 /user/kesj/sample_data/titanic


In [26]:
# HDFS path of the raw census file.
# NOTE(review): not referenced by the cells visible here — the CREATE TABLE
# query hard-codes the same path instead.
my_infile = '/user/kesj/sample_data/impyla_census_raw.csv'

In [27]:
# DDL for an external text table over the census sample data.
# NOTE(review): LOCATION names a single .csv file; presumably Impala expects a
# directory there — confirm against the CREATE TABLE documentation.
# NOTE(review): fields are declared tab-delimited although the file has a .csv
# extension — confirm the actual delimiter of the file.
create_table_query = """
    CREATE EXTERNAL TABLE IF NOT EXISTS census_text (age INT, workclass STRING,
            final_weight INT, education STRING, education_num INT,
            marital_status STRING, occupation STRING, relationship STRING,
            race STRING, sex STRING, hours_per_week INT, native_country STRING,
            income STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE
    LOCATION '/user/kesj/sample_data/impyla_census_raw.csv'
"""

In [29]:
# Run the DDL; the recorded output for this cell is an AuthorizationException
# (no privilege to execute CREATE on default.census_text).
cur.execute(create_table_query)

HiveServer2Error: AuthorizationException: User 'kesj@OPR.STATEFARM.ORG' does not have privileges to execute 'CREATE' on: default.census_text

In [24]:
from impala.util import as_pandas
# Earlier queries against other tables, kept for reference:
#cur.execute('SELECT * FROM y1753caale')
#cur.execute('select * FROM p3533eeb_detl')
# Pull the whole y1753caaestparty table and convert the result with as_pandas.
cur.execute('select * FROM y1753caaestparty')
le = as_pandas(cur)
# print() call form, matching the other cells, so the cell also parses under Python 3
print(len(le))


0


In [25]:
# Show the first rows of `le` (the recorded output lists only column names; the row count printed earlier was 0).
le.head()

Unnamed: 0,est_party_dim_id,last_nm,first_nm,extnl_party_id,addr_line_one_txt,addr_line_two_txt,city_nm,st_prvnc_cd,zip_cd,cntry_cd,...,extnl_vndr_id_type_cd,clm_id,data_cntxt_cd,fdw_rplc_ind,fdw_insrt_tstmp,fdw_rplc_tstmp,est_ver_num,src_insrt_tstmp,los_est_busn_id,los_est_dim_id


In [None]:
# Earlier attempt that built the query from the table list, kept for reference:
#sq1 = "'select * from "+myTables[1][0]+"'"
#cur.execute(sq1)
# Pull the whole y1753caadetl table and convert the result with as_pandas.
cur.execute('select * from y1753caadetl')
detl = as_pandas(cur)

In [None]:
# Report the dimensions of `detl`.  Uses the object's own .shape attribute
# (assumes detl is a pandas DataFrame, as returned by as_pandas — confirm),
# because no bare `shape` function is defined in the cells visible here.
print(detl.shape)

In [None]:
# Number of groups when `le` is grouped by los_est_dim_id.
len(le.groupby('los_est_dim_id'))

In [None]:
# Peek at the first values of the los_est_dim_id column.
le.los_est_dim_id.head()

In [None]:
# Pick one los_est_dim_id value to inspect and count the detl rows that carry it.
ledi0 = 85051447
sum(detl['los_est_dim_id']==ledi0)
# Alternative view of a few detl columns, left disabled:
#detl[['los_est_dim_id','est_ver_num','prt_clas_cd']]

In [None]:
# Detail rows whose los_est_dim_id equals ledi0, restricted to the
# columns of interest.
detl[detl['los_est_dim_id'] == ledi0][[
    'est_ver_num',
    'oem_prt_num',
    'line_desc_txt',
    'prt_clas_cd',
    'prt_qty_cnt',
    'price_amt',
    'ver_num',
]]

In [None]:
# Count the `le` rows whose los_est_dim_id equals ledi0, then show selected columns of them.
# print() call form, matching the other cells, so the cell also parses under Python 3
print(len(le[le['los_est_dim_id']==ledi0]))
le[le['los_est_dim_id']==ledi0][['vndr_veh_cd','los_tstmp','prmry_poi_cd','scdy_poi_cd','fdw_rplc_tstmp']]

In [None]:
# Column labels of `le`.
le.columns.values

In [None]:
# Number of distinct prt_clas_cd values in `detl`.
len(detl.prt_clas_cd.unique())

In [None]:
# Occurrences of each prt_clas_cd value in `detl`.
y17533caaPRTcounts = detl.prt_clas_cd.value_counts()
# Histogram of those counts in 50 bins.
# NOTE(review): tail(-1) presumably skips the first (most frequent) entry — confirm that is intended.
y17533caaPRTcounts.tail(-1).hist(color='steelblue',bins=50)

In [None]:
# line_desc_txt for the detl rows whose prt_clas_cd is '88JHR'.
detl[detl.prt_clas_cd == '88JHR']['line_desc_txt']

In [None]:
# vndr_cd for the `le` rows whose los_est_dim_id equals ledi0.
le[le.los_est_dim_id == ledi0]['vndr_cd']

In [14]:
# Attempt a partitioned, bucketed CREATE TABLE through the Impala cursor.
# The recorded output for this cell is a syntax error at PARTITIONED: after the
# table COMMENT the parser expected AS, LOCATION, ROW, STORED, TBLPROPERTIES,
# WITH or COMMA.
# NOTE(review): each trailing backslash joins the next line with nothing in
# between, so lines that start at column 0 are glued to the previous token
# (the error text shows "...table'PARTITIONED").
# NOTE(review): CLUSTERED BY ... INTO BUCKETS and the COLLECTION ITEMS / MAP KEYS
# clauses are presumably Hive-only syntax — confirm against the Impala docs.
cur.execute("CREATE external TABLE page_view(viewTime INT, userid BIGINT,\
                page_url STRING, referrer_url STRING,\
                ip STRING COMMENT 'IP Address of the User')\
COMMENT 'This is the page view table'\
PARTITIONED BY(dt STRING, country STRING)\
CLUSTERED BY(userid) SORTED BY(viewTime) INTO 32 BUCKETS\
ROW FORMAT DELIMITED\
        FIELDS TERMINATED BY '1'\
        COLLECTION ITEMS TERMINATED BY '2'\
        MAP KEYS TERMINATED BY '3'\
STORED AS SEQUENCEFILE;")

RPCError: RPC status error: TExecuteStatementResp: TStatus(errorCode=None, errorMessage="AnalysisException: Syntax error in line 1:\n...is is the page view table'PARTITIONED BY(dt STRING, co...\n                             ^\nEncountered: PARTITIONED\nExpected: AS, LOCATION, ROW, STORED, TBLPROPERTIES, WITH, COMMA\n\nCAUSED BY: Exception: Syntax error", sqlState='HY000', infoMessages=None, statusCode=3)

In [None]:
from pandas import *
from StringIO import StringIO

# Shell command that logs in over ssh (sshpass reads the password from a file)
# and runs a Hive query with column headers enabled.  The file, user, host and
# table names look like placeholders — replace before running.
s = "sshpass -f myfilewithpassword ssh myusername@myhostname \"hive -S -e \\\"" \
"set hive.cli.print.header=true;" \
"SELECT * from mytable;\\\"\""

# Run the command through the IPython shell escape and parse its
# tab-separated output into `df`.
t = !$s
df = read_csv(StringIO(t.n), sep='\t')

In [None]:
from pandas import *
from StringIO import StringIO

s = """
sshpass -f myfilewithpassword ssh myusername@myhostname \"
hive -S -e \\\"
set hive.cli.print.header=true;
SELECT * from mytable;
\\\"\"
"""

t = !$s
df = read_csv(StringIO(t.n), sep='\t'

In [16]:
hdfsfiles = !hadoop fs -ls 