TODO: 
- How to do feature selection? Recursive elimination, L1 regularization, tree model.
- Use pyspark. Does it come with feature selection modules?
    The tree model implements feature importance.
    The logistic regression model can use L1 regularization.
    Doesn't do recursive elimination.
    Does chi-squared test.

- Metrics.
    + Use log-loss to evaluate.
    + precision, recall, F2
    + accuracy (ratio of correct)
- Models.
    Logistic regression, svm, decision tree, knn, naive bayes.
    All except logistic regression produces binary outputs.
- Target = 'click'. Not present in the test data set.

In [1]:
import gzip
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [2]:
df = pd.read_csv('data/train_small.csv')
df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,11761126300016035614,0,14102606,1002,0,6b175d24,a1b0ebd0,50e219e0,ecad2386,7801e8d9,...,0,0,15703,320,50,1722,0,35,-1,79
1,9454951464027616243,0,14102606,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15701,320,50,1722,0,35,-1,79
2,7597529435648422906,0,14102407,1005,0,85f751fd,c4e18dd6,50e219e0,92f5800b,ae637522,...,1,2,21191,320,50,2424,1,161,100193,71
3,17674252474717503445,0,14102616,1005,0,2b1ddb24,98acf46c,3e814130,ecad2386,7801e8d9,...,1,0,17239,320,50,1973,3,39,100148,23
4,14668325104229564251,0,14102504,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15705,320,50,1722,0,35,100083,79


In [5]:
# anonymized columns are already indexed.
anonymized = ['C1'] + ['C' + str(k) for k in range(14,22)]
df[anonymized].head()

Unnamed: 0,C1,C14,C15,C16,C17,C18,C19,C20,C21
0,1002,15703,320,50,1722,0,35,-1,79
1,1005,15701,320,50,1722,0,35,-1,79
2,1005,21191,320,50,2424,1,161,100193,71
3,1005,17239,320,50,1973,3,39,100148,23
4,1005,15705,320,50,1722,0,35,100083,79


In [8]:
# these columns need indexing
str_cols = ['site_id','site_domain','site_category','app_id','app_domain','app_category','device_id','device_ip','device_model']
df[str_cols].head()

Unnamed: 0,site_id,site_domain,site_category,app_id,app_domain,app_category,device_id,device_ip,device_model
0,6b175d24,a1b0ebd0,50e219e0,ecad2386,7801e8d9,07d7df22,54eeaef3,0d34e1f4,52c5cc40
1,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,fc61627b,711ee120
2,85f751fd,c4e18dd6,50e219e0,92f5800b,ae637522,0f2161f8,a99f214a,68b6ba25,981edffc
3,2b1ddb24,98acf46c,3e814130,ecad2386,7801e8d9,07d7df22,a99f214a,75092a71,ecb851b2
4,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,5bb6c0b5,8a4875bd


In [9]:
# the rest
df.drop(anonymized, axis=1).drop(str_cols, axis=1).head()

Unnamed: 0,id,click,hour,banner_pos,device_type,device_conn_type
0,11761126300016035614,0,14102606,0,0,0
1,9454951464027616243,0,14102606,0,1,0
2,7597529435648422906,0,14102407,0,1,2
3,17674252474717503445,0,14102616,0,1,0
4,14668325104229564251,0,14102504,0,1,0


In [28]:
# hour formatting
pd.to_datetime(df.hour.head(), format="%y%m%d%H")

0   2014-10-26 06:00:00
1   2014-10-26 06:00:00
2   2014-10-24 07:00:00
3   2014-10-26 16:00:00
4   2014-10-25 04:00:00
Name: hour, dtype: datetime64[ns]

In [17]:
# id shouldn't be included in the features
# Half of the rows have a unique device ip.
unique_vals = dict()
for c in df:
    unique_vals[c] = df[c].nunique()
sorted(unique_vals.items(), key=lambda kv: kv[1])

[('click', 2),
 ('device_type', 4),
 ('device_conn_type', 4),
 ('C18', 4),
 ('C1', 7),
 ('banner_pos', 7),
 ('C15', 8),
 ('C16', 9),
 ('site_category', 22),
 ('app_category', 27),
 ('C21', 60),
 ('C19', 66),
 ('C20', 164),
 ('app_domain', 206),
 ('hour', 240),
 ('C17', 421),
 ('C14', 2254),
 ('site_id', 2667),
 ('site_domain', 2875),
 ('app_id', 3147),
 ('device_model', 5212),
 ('device_id', 150398),
 ('device_ip', 554960),
 ('id', 1000000)]

In [9]:
df.shape

(1000000, 24)

In [16]:
df.describe()

Unnamed: 0,id,click,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,9.220259e+18,0.169773,14102560.0,1004.965138,0.287825,1.013995,0.331069,18838.292437,318.886996,59.991822,2112.033682,1.434757,227.270994,53287.068019,83.317616
std,5.324295e+18,0.375433,296.7422,1.092803,0.504222,0.524993,0.854595,4967.15334,21.257371,46.927257,610.375812,1.326402,351.148578,49952.338638,70.310016
min,13386410000000.0,0.0,14102100.0,1001.0,0.0,0.0,0.0,375.0,120.0,20.0,112.0,0.0,33.0,-1.0,1.0
25%,4.618979e+18,0.0,14102300.0,1005.0,0.0,1.0,0.0,16920.0,320.0,50.0,1863.0,0.0,35.0,-1.0,23.0
50%,9.206212e+18,0.0,14102600.0,1005.0,0.0,1.0,0.0,20346.0,320.0,50.0,2323.0,2.0,39.0,100050.0,61.0
75%,1.383418e+19,0.0,14102810.0,1005.0,1.0,1.0,0.0,21894.0,320.0,50.0,2526.0,3.0,171.0,100094.0,101.0
max,1.844674e+19,1.0,14103020.0,1012.0,7.0,5.0,5.0,24044.0,1024.0,1024.0,2757.0,3.0,1839.0,100248.0,255.0


In [11]:
# 183MB?
df.memory_usage().sum()/(1024*1024)

183.1055450439453

In [6]:
df.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

Index(['hour', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
       'app_id', 'app_domain', 'app_category', 'device_id', 'device_ip',
       'device_model', 'device_type', 'device_conn_type', 'C14', 'C15', 'C16',
       'C17', 'C18', 'C19', 'C20', 'C21'],
      dtype='object')

In [13]:
# slightly under 0.175, which is the mean in the full dataset
df.click.mean() 

0.169773

Compute the grouped mean for each variable

In [15]:
df.groupby('device_type')['click'].mean()

device_type
0    0.212078
1    0.168955
4    0.097977
5    0.097419
Name: click, dtype: float64

## Clean features
Index string columns.

## Logistic regression

In [19]:
# consider:
# - class_weight='balanced'
# - solver='sag'
# - n_jobs=2
train_cols = df.columns.drop('click')
X = df[df.columns.drop(['id','click'])].values
y = df.click.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=int(1e5))
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=int(1e5))
clf = LogisticRegression(random_state=0,
                        solver='liblinear',
                        class_weight=None,
                        )

In [20]:
%%time
clf.fit(X_train, y_train)

ValueError: could not convert string to float: '1aa0e912'

In [None]:
clf.score(X_train, y_train), clf.score(X_dev, y_dev)