In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from time import time

In [3]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Load the training frame (it carries the 'label' column) and the quiz frame
# to be predicted. Adjust the relative paths to your own layout.
df, df_quiz = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/data.csv", "./data/quiz.csv")
)

In [5]:
# Report (rows, columns) for the training frame and the quiz frame.
print(df.shape, df_quiz.shape)

(126837, 53) (31709, 52)


In [6]:
# Split the target off the feature frame: `pop` hands back the 'label'
# Series and removes that column from df in the same step.
df_y = df.pop('label')

In [7]:
# Columns noted as having no variance (all 0's or all 1's); they are removed
# from the training frame and the quiz frame alike so both keep the same
# feature set.
cols_to_delete = ['18','25','29', '31', '32', '35', '23', '26', '58']
for frame in (df, df_quiz):
    frame.drop(cols_to_delete, axis=1, inplace=True)

In [8]:
# Snapshot the names of the columns still present in df and report how many
# there are.
df_cols = df.columns.tolist()
print(len(df_cols))

43


In [9]:
# Normalize the continuous columns: subtract the mean and divide by the
# range (max - min).
#
# The mean and range are computed from the TRAINING frame only and then
# reused for the quiz frame. Scaling the quiz frame with its own statistics
# would map the same raw value to different feature values in the two
# frames, so the model would be applied to inputs on a different scale from
# the one it was fitted on.
for col in ('59', '60'):
    # Capture the statistics before df[col] is overwritten below.
    col_mean = df[col].mean()
    col_range = df[col].max() - df[col].min()

    df[col] = (df[col] - col_mean) / col_range
    df_quiz[col] = (df_quiz[col] - col_mean) / col_range


In [10]:
# Columns to be one-hot encoded.
# `categorical_cols` are the columns declared categorical outright.
categorical_cols = ['56', '20', '14', '17', '16', '57', '0', '5', '7', '9', '8']
# Columns '38'..'51': numerical columns with [0,1,2] values that are treated
# as categorical as well.
num_to_categorical_cols = list(map(str, range(38, 52)))
# Full list handed to the encoder: declared categoricals first, then the
# converted numerical ones.
categorical_cols_enhanced = [*categorical_cols, *num_to_categorical_cols]

In [11]:
# Expand every column named in categorical_cols_enhanced into one indicator
# column per observed value (named "<column>_<value>").
df_one_hot = pd.get_dummies(data=df, columns=categorical_cols_enhanced)

In [12]:
# (rows, columns) of the encoded training matrix.
df_one_hot.shape

(126837, 545)

In [13]:
# Inspect the resulting column labels, including the generated
# "<column>_<value>" indicator names.
df_one_hot.columns

Index(['2', '11', '27', '28', '30', '33', '34', '36', '37', '52',
       ...
       '48_2.0', '49_0.0', '49_1.0', '49_2.0', '50_0.0', '50_1.0', '50_2.0',
       '51_0.0', '51_1.0', '51_2.0'],
      dtype='object', length=545)

## One-Hot Encoding the Quiz Data

In [14]:
# One-hot encode the quiz frame using the same categorical column list as
# the training frame.
df_one_hot_quiz = pd.get_dummies(df_quiz, columns=categorical_cols_enhanced)

# Indicator columns produced for the training frame but not for the quiz
# frame. Kept as a variable so the gap can be inspected.
col_to_add = np.setdiff1d(df_one_hot.columns, df_one_hot_quiz.columns)

# Align the quiz matrix to the training layout in a single step:
#   * columns missing from the quiz frame are created and filled with 0,
#   * columns that exist only in the quiz frame are dropped,
#   * column order follows the training matrix.
# This replaces inserting the missing columns one at a time and then
# reselecting the columns.
df_one_hot_quiz = df_one_hot_quiz.reindex(columns=df_one_hot.columns, fill_value=0)

In [16]:
# (rows, columns) of the aligned quiz matrix; the column count should equal
# that of the training matrix.
df_one_hot_quiz.shape

(31709, 545)

In [17]:
# Per-column summary statistics of the aligned quiz matrix (quick check of
# value ranges).
df_one_hot_quiz.describe()

Unnamed: 0,2,11,27,28,30,33,34,36,37,52,...,48_2.0,49_0.0,49_1.0,49_2.0,50_0.0,50_1.0,50_2.0,51_0.0,51_1.0,51_2.0
count,31709.0,31709.0,31709.0,31709.0,31709.0,31709.0,31709.0,31709.0,31709.0,31709.0,...,31709.0,31709.0,31709.0,31709.0,31709.0,31709.0,31709.0,31709.0,31709.0,31709.0
mean,0.003721,0.002491,0.258444,0.32259,0.993913,0.099026,0.993913,0.596424,0.184743,0.00596,...,0.02709,0.963764,0.035573,0.000662,0.993251,0.006654,9.5e-05,0.821155,0.169731,0.009114
std,0.06089,0.049853,0.437786,0.467475,0.07778,0.298701,0.07778,0.490622,0.388095,0.076975,...,0.162349,0.186879,0.185227,0.025727,0.081875,0.081303,0.009726,0.383229,0.375402,0.095033
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
75%,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
# L2-penalised logistic regression. C is the inverse regularisation
# strength, so C=1000 applies only a weak penalty. class_weight='balanced'
# reweights classes inversely to their frequency.
#
# solver is pinned to 'liblinear': the recorded run used it because it was
# the library default at the time, but newer scikit-learn releases default
# to a different solver. Pinning it keeps the fit comparable across
# versions and keeps intercept_scaling meaningful.
#
# n_jobs is not passed: it is ignored when the solver is liblinear.
logistic = LogisticRegression(penalty='l2', C=1000,
                              solver='liblinear',
                              fit_intercept=True, intercept_scaling=1,
                              class_weight='balanced',
                              tol=0.0001,
                              verbose=1)

In [20]:
# Fit on the full encoded training matrix. The estimator was built with
# verbose=1, so the solver prints its tag/progress. The cell's value is the
# fitted estimator, whose repr is displayed.
logistic.fit(df_one_hot, df_y)

[LibLinear]

LogisticRegression(C=1000, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=1, warm_start=False)

In [21]:
# Mean accuracy on the SAME data the model was fitted on. This is a
# training-set score, not a held-out estimate of generalisation.
logistic.score(df_one_hot, df_y)

0.88806893887430327

In [22]:
# Predict a class label for every row of the aligned quiz matrix.
y_logit_pred = logistic.predict(X=df_one_hot_quiz)

In [23]:
# Sanity check: number of predictions (should equal the quiz row count).
len(y_logit_pred)

31709

In [None]:
# Write the predictions to a submission CSV with the header "Id,Prediction"
# and one row per prediction. Ids are 1-based and follow the order of
# y_logit_pred.
submission_ver = "007"
# NOTE(review): machine-specific absolute path. Point this at your own
# output directory before running.
path = '/Users/tonatiuh/fmcode/dsi/ml/kaggle/data/'
submission_file = 'submission_' + submission_ver + '.csv'

# The context manager closes the file even if a write raises part-way
# through, which the manual open()/close() pair did not guarantee.
with open(path + submission_file, 'w') as f_out:
    f_out.write("Id,Prediction\n")
    for i, prediction in enumerate(y_logit_pred, start=1):
        f_out.write(str(i) + ',' + str(prediction) + '\n')