In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from time import time

In [2]:
%matplotlib inline



In [3]:
# Read data, use your own path.
# `df` is the labelled frame (it carries a 'label' column that is split off
# below); `df_quiz` is the frame the final predictions are made for.
df = pd.read_csv("../data/data.csv")
df_quiz = pd.read_csv("../data/quiz.csv")

In [4]:
# Data shape as (rows, columns) for the labelled frame and the quiz frame.
# The labelled frame still contains the 'label' column at this point.
print df.shape, df_quiz.shape

(126837, 53) (31709, 52)


In [5]:
# Pull the target column out of the feature frame: `pop` returns the 'label'
# series and removes it from `df` in one step, leaving only input columns.
df_y = df.pop('label')

In [6]:
# Columns with no variance (all 0's or all 1's) carry no information, so they
# are removed from both the training frame and the quiz frame.
cols_to_delete = ['18','25','29', '31', '32', '35', '23', '26', '58']
for frame in (df, df_quiz):
    for col in cols_to_delete:
        del frame[col]

In [7]:
# Get list of remaining columns
df_cols = list(df.columns.values)
print len(df_cols)

43


In [8]:
# Maps column name -> the distinct values seen in that training column.
# Filled in by the next cell and used later to sparsify the quiz data.
df_cols_dict = dict()

In [9]:
# Print the number of values each column takes and populate df_cols_dict
for col in df_cols:
    print "*****column: ", col
    print pd.value_counts(df[col].values)
    df_cols_dict[col] = pd.value_counts(df[col].values).to_dict().keys()

*****column:  0
def        59038
indef      25712
null       20714
pro        10182
dctc        3589
poss        2751
el          2037
dem         1453
numpro       564
num          435
relpro       175
demnum        94
posspro       93
dtype: int64
*****column:  2
0    126297
1       540
dtype: int64
*****column:  5
dobj                    36616
prep_of                 14731
nsubj                   12404
root                     8131
pobj                     6863
prep_to                  4590
nn                       3789
dep                      3766
prep_above               3447
prep_below               3044
prep_from                2278
conj_and                 2262
prep_underneath          2262
prep_towards             1992
prep_at                  1313
ccomp                    1236
prep_between             1129
prep_with                1074
prep_past                1070
prep_in                  1042
prep_under                927
advmod                    890
xcomp                

In [10]:
# Normalize the continuous columns (mean-centre, then divide by the range).
# The statistics are taken from the training frame only and then applied to
# BOTH frames, so a given raw value maps to the same scaled value in training
# and quiz data. Scaling the quiz frame with its own mean/range would feed the
# model features on a slightly different scale than the one it was fitted on.
for col in ['59', '60']:
    # Capture the statistics before df[col] is overwritten below.
    col_mean = df[col].mean()
    col_range = df[col].max() - df[col].min()
    df[col] = (df[col] - col_mean) / col_range
    df_quiz[col] = (df_quiz[col] - col_mean) / col_range


In [11]:
# Columns that are categorical to begin with
categorical_cols = ['56', '20', '14', '17', '16', '57', '0', '5', '7', '9', '8']

# Columns '38'..'51' are numerical with [0,1,2] vals; they are one-hot encoded as well
num_to_categorical_cols = ['%d' % col_number for col_number in range(38, 52)]

# Everything that gets one-hot encoded: the true categoricals first, then the converted numericals
categorical_cols_enhanced = list(categorical_cols)
categorical_cols_enhanced.extend(num_to_categorical_cols)

In [12]:
# Convert categorical to one-hot sparse column.
# Only the columns listed in categorical_cols_enhanced are expanded; each
# becomes a set of indicator columns named "<column>_<value>". All other
# columns are carried over unchanged.
df_one_hot = pd.get_dummies(df, columns=categorical_cols_enhanced)

In [13]:
# (rows, columns) of the one-hot encoded training frame
df_one_hot.shape

(126837, 545)

## Inflating quiz data

In [14]:
# list of dicts with inflated data, will be used to construct quiz dataframe
quiz_raw_data = []

# different column types, will be inflated differently 
cols_set = set(df_cols)
cols_categ_set = set(categorical_cols_enhanced)
cols_num_set = cols_set - cols_categ_set
cols_num_to_categ = set(num_to_categorical_cols)

# Sanity check
if len(set.union(cols_num_set,cols_categ_set)) != len(cols_set):
    raise RuntimeError

for i in range(len(df_quiz)):
    if i%1000 == 0:
        print "inflating row", i
    x = df_quiz.iloc[i].to_dict()
    x_inflated = {}
    for k,v in x.items():
        if k in cols_num_set:
            x_inflated[k] = v
        elif k in cols_num_to_categ:
            for k2 in ['0.0','1.0','2.0']:
                inflated_col = k + "_" + k2
                x_inflated[inflated_col] = 0
            x_inflated[k + "_" + str(v)] = 1
        else:
            for k2 in df_cols_dict[k]:
                inflated_col = k + "_" + k2
                if v == k2:
                    x_inflated[inflated_col] = 1.0 
                else:
                    x_inflated[inflated_col] = 0.0
                    
    quiz_raw_data.append(x_inflated)
            
print len(quiz_raw_data)    

inflating row 0
inflating row 1000
inflating row 2000
inflating row 3000
inflating row 4000
inflating row 5000
inflating row 6000
inflating row 7000
inflating row 8000
inflating row 9000
inflating row 10000
inflating row 11000
inflating row 12000
inflating row 13000
inflating row 14000
inflating row 15000
inflating row 16000
inflating row 17000
inflating row 18000
inflating row 19000
inflating row 20000
inflating row 21000
inflating row 22000
inflating row 23000
inflating row 24000
inflating row 25000
inflating row 26000
inflating row 27000
inflating row 28000
inflating row 29000
inflating row 30000
inflating row 31000
31709


In [15]:
# Column names of the one-hot training frame, in training order
df_cols_inflated = list(df_one_hot.columns)

In [16]:
#print df_cols_inflated

In [17]:
# Build the quiz frame with exactly the training one-hot columns, in the same
# order. Keys in the row dicts that are not training columns are dropped;
# training columns that never appear in the dicts come out as NaN.
df_one_hot_quiz = pd.DataFrame(data=quiz_raw_data, columns=df_cols_inflated)

In [18]:
#df_one_hot_quiz.describe()

In [19]:
# Persist the inflated quiz features as a gzip-compressed CSV.
# Written to the same relative data directory the input CSVs are read from,
# instead of an absolute path that only exists on one machine.
# NOTE(review): assumes ../data is the intended output directory - confirm.
df_one_hot_quiz.to_csv('../data/quiz_inflated.csv.gz', compression='gzip')

## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
# L2-penalised logistic regression.
# C is the inverse regularisation strength, so C=1000 is a very weak penalty;
# class_weight='balanced' reweights classes inversely to their frequency.
logistic = LogisticRegression(
    C=1000,
    penalty='l2',
    class_weight='balanced',
    fit_intercept=True,
    intercept_scaling=1,
    tol=0.0001,
    n_jobs=-1,
    verbose=1,
)

In [22]:
# Fit on the full one-hot training frame; no rows are held out for validation
logistic.fit(df_one_hot, df_y)

[LibLinear]

LogisticRegression(C=1000, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=1, warm_start=False)

In [23]:
# Mean accuracy on the same data the model was fitted on, i.e. a training-set
# score rather than a held-out estimate
logistic.score(df_one_hot, df_y)

0.88806893887430327

In [24]:
# Predict labels for the quiz rows; the quiz frame was built with the training
# frame's column list, so the feature order matches what the model was fitted on
y_logit_pred = logistic.predict(df_one_hot_quiz)

In [25]:
# Number of predictions; expected to equal the number of quiz rows
len(y_logit_pred)

31709

In [26]:
# Write the submission file: a header line, then one "<id>,<prediction>" line
# per quiz row, with ids starting at 1 and following quiz row order.
submission_ver = "007"
# Same relative data directory the input CSVs are read from, instead of an
# absolute path that only exists on one machine.
# NOTE(review): assumes ../data is the intended output directory - confirm.
path = '../data/'
submission_file = 'submission_' + submission_ver + '.csv'
# The with-block closes the file even if a write fails part-way through.
with open(path + submission_file, 'w') as f_out:
    f_out.write("Id,Prediction\n")
    for i, prediction in enumerate(y_logit_pred, 1):
        f_out.write(str(i) + ',' + str(prediction) + '\n')