# Train level 0: Logistic model
- Optimize linear models for training on brand / device
- V1: Select best data for model (onehotencoded both, separate, or just one)
    - Brands onehot, device label: 2.40215702862 (five seeds: 2.4017680305744418)
    - Separate onehot encoded:  2.39089229204 (five seeds: 2.3901993860642716)
    - Combined onehot encoded: 2.39583615824 (five seeds: 2.3952347531028977)
- V2: Used a couple of creative features based on brand and device model; also included brand when encoding device
    - Without scaler: 2.39122949113 (five seeds: 2.3904997299834445)
    - With scaler: 2.39075599796 (five seeds: 2.3899157753336815)
- V3: Used a new CV standard used by all models
    - CV Score: LB Score: 

In [40]:
import random
import os
import pickle
import datetime


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

%matplotlib inline


In [10]:
# Output location: trained models are pickled into this directory.
sub_dir = './model_0_logistic'
# Input locations: feature files are read from feat_dir, the CV split
# files from data_dir.
feat_dir = './data/'
data_dir = './data_ori/'
# When True, open_feature_file rescales CSV feature columns whose maximum
# exceeds 1.
use_scaler = False

In [11]:
def open_feature_file(fname, samples='train'):
    """Load one feature set for the requested sample set.

    fname   -- feature name. Names ending in 'csv' are read as CSV files
               from feat_dir; anything else is treated as the stem of a
               pickle file named '<fname>_<samples>.pickle' in feat_dir.
    samples -- 'train' aligns CSV rows with gatrain, any other value aligns
               them with gatest. It also selects which pickle is opened.

    Returns a scipy csr_matrix for CSV input. For pickle input the unpickled
    object is returned unchanged.
    """
    if not fname.endswith('csv'):
        # Assume it is a pickle file
        pickle_path = os.path.join(feat_dir, '{}_{}.pickle'.format(fname, samples))
        with open(pickle_path, 'rb') as f:
            return pickle.load(f)

    ids = gatrain if samples == 'train' else gatest
    features = pd.read_csv(os.path.join(feat_dir, fname))

    # The left merge keeps every device of the id frame in that frame's
    # order; devices without a feature row get NaN, replaced by 0 below.
    X = ids[['device_id']].merge(features, on='device_id', how='left')
    X = X.drop('device_id', axis=1).fillna(0)

    if use_scaler:
        # NOTE(review): the scaler is fit on whichever sample set is being
        # loaded, so train and test are scaled independently of each other.
        for c in X.columns:
            if X[c].max() > 1:
                # Scale this column on its own. Fitting on the whole frame
                # yields a 2-D array that cannot be stored in one column.
                X[c] = MinMaxScaler().fit_transform(X[[c]]).ravel()

    return csr_matrix(X.values)
            
# Feature sets to load. Entries ending in 'csv' are read as CSV files, the
# others as per-sample-set pickle files.
feature_files = [
    'features_brand_bag',
    'features_brand_model_bag',
    'features_brand_model.csv',
]

### Loading and preparing data

In [12]:
# Sample tables: both provide 'device_id'; the train table also provides the
# 'group' target column. Paths go through data_dir like the CV split files.
gatrain = pd.read_csv(os.path.join(data_dir, 'gender_age_train.csv'))
gatest = pd.read_csv(os.path.join(data_dir, 'gender_age_test.csv'))

In [13]:
# Load every feature set and stack them side by side into one sparse
# matrix per sample set.
train_parts = [open_feature_file(name) for name in feature_files]
Xtrain = hstack(train_parts, format='csr')

test_parts = [open_feature_file(name, 'test') for name in feature_files]
Xtest = hstack(test_parts, format='csr')

# Target column of the training table.
y = gatrain['group']

In [14]:
# Sanity check: the feature matrix and the target must have the same
# number of rows. The call form prints the same text as the old print
# statements and also runs under Python 3.
print('X {}'.format(Xtrain.shape))
print('y {}'.format(y.shape))

X (74645, 1803)
y (74645,)


In [15]:
# Replace the target labels by integer class ids and keep the fitted
# encoder so the ids can be mapped back to labels later.
letarget = LabelEncoder()
y = letarget.fit_transform(y)
n_classes = len(letarget.classes_)

In [17]:
# Load CV sets
train_cv = pd.read_csv(os.path.join(data_dir, 'gender_age_train_cv.csv'))
test_cv = pd.read_csv(os.path.join(data_dir, 'gender_age_test_cv.csv'))

# sample_nr is used as the row selector into Xtrain / y for each split.
train_rows = train_cv.sample_nr.values
val_rows = test_cv.sample_nr.values

X_train, y_train = Xtrain[train_rows, :], y[train_rows]
X_val, y_val = Xtrain[val_rows, :], y[val_rows]

## Logistic regression

In [7]:
# Hyperparameter optimization: grid search over the regularisation
# strength C of an L2-penalised logistic regression, scored by log loss
# on 10 stratified folds.
rs = 123
kf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=rs)

log = LogisticRegression()
param_grid = {'C': np.linspace(0.07, 0.20, 10), 'penalty': ['l2']}
clf = GridSearchCV(log, param_grid, scoring='log_loss', n_jobs=5, cv=kf, verbose=10)
clf.fit(Xtrain, y)

# NOTE(review): with this scikit-learn API the 'log_loss' scorer presumably
# reports the negated loss, so best_score_ would be negative - confirm.
print("Best score:{} with scorer {}".format(clf.best_score_, clf.scorer_))
print("With parameters:")

# Report only the parameters that were part of the search grid.
best_parameters = clf.best_estimator_.get_params()
for param_name in sorted(param_grid):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

NameError: name 'StratifiedKFold' is not defined

In [5]:
# Logistic regression settings recorded for this model.
params = dict(type='LogisticRegression', C=0.13, penalty='l2')

In [72]:
# Out-of-fold predictions for several shuffling seeds. For every seed each
# training row is predicted exactly once, by the model of the fold that
# held it out.
n_folds = 10

scores = {}   # seed -> out-of-fold class probabilities for all training rows
models = {}   # not filled in this cell
for s in [0, 12, 123, 1234, 12345]:
    kf = StratifiedKFold(y, n_folds=n_folds, shuffle=True, random_state=s)
    pred_l = np.zeros((Xtrain.shape[0], n_classes))
    for fold_nr, (itrain, itest) in enumerate(kf, 1):
        print('%d / %d' % (fold_nr, n_folds))
        clf = LogisticRegression(C=0.13, penalty='l2')
        clf.fit(Xtrain[itrain, :], y[itrain])
        pred_l[itest, :] = clf.predict_proba(Xtrain[itest, :])
    # Log loss of this seed's complete out-of-fold prediction
    print(log_loss(y, pred_l))
    scores[s] = pred_l

1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
2.39027660069
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
2.39035856141
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
2.39075599796
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
2.39049377478
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
2.39079749726


In [73]:
# Blend the seeds by averaging, not summing, their probabilities, so every
# row handed to log_loss is still a valid distribution. This matches how the
# validation blend further down divides by the number of models.
log_loss(y, sum(scores.values()) / len(scores))

2.3899157753336815

In [23]:
# 2.39050541479
0.1
2.39059357843
0.115
2.39044702338
0.13
2.39044648731
0.145
2.39028745599
0.16
2.39015495289

In [None]:
# Train n_models logistic regressions with C spread evenly over
# [0.13, 0.19], score each one on the validation split and collect the
# fitted model, its score and its validation probabilities.
n_models = 5

scores = []   # validation log loss per model
models = []   # one dict per model: fitted estimator, score, settings
preds = []    # validation class probabilities per model
for c in np.linspace(0.13, 0.19, n_models):
    print(c)

    params = {'type': 'LogisticRegression',
              'C': c,
              'penalty': 'l2',
              'solver': 'lbfgs',
              'max_iter': 100000,
              'warm_start': False}

    # Build the estimator from params so the stored model always agrees
    # with the settings recorded next to it. 'type' only names the model
    # class and is not an estimator argument. The model is fitted once:
    # refitting a freshly constructed estimator has no earlier solution to
    # start from, so a second fit only repeats the same work.
    estimator_args = {k: v for k, v in params.items() if k != 'type'}
    clf = LogisticRegression(**estimator_args)
    clf.fit(X_train, y_train)

    pred = clf.predict_proba(X_val)
    score = log_loss(y_val, pred)
    print(score)

    model_out = {'model': clf,
                 'score': score,
                 'params': params}

    models.append(model_out)
    scores.append(score)
    preds.append(pred)

0.13
2.39044648731
0.145
2.39028745599
0.16
2.39015495289
0.175


In [37]:
# Blend the models by averaging their validation probabilities. Dividing by
# the number of predictions actually collected keeps the blend a proper
# average even if the training loop stopped before all models were fitted.
cv_score = log_loss(y_val, sum(preds) / len(preds))
print('CV score: {:.4f}'.format(cv_score))

CV score: 2.3903


## Store models

In [44]:
# File name carries a timestamp, the CV score and a second score slot that
# is filled with -1 (presumably a placeholder for the not yet known
# leaderboard score - confirm).
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
outputfile = 'models_logistic_0_V3_{}_{:.4f}_{:.4f}.pickle'.format(timestamp,
                                                                cv_score,
                                                                -1.)

# 'model_params' is the params dict left behind by the last training
# iteration; the settings of each individual model are stored with that
# model inside 'models'. 'no_models' is taken from the list itself so it
# cannot disagree with what is actually saved.
output = {'script': 'train_model_0_logistic',
          'model_params': params,
          'no_models': len(models),
          'cross_validation': {'type': 'gender_age_train_cv.csv'},
          'models': models}

# Create the output directory on first use instead of failing in open().
if not os.path.isdir(sub_dir):
    os.makedirs(sub_dir)

with open(os.path.join(sub_dir, outputfile), 'wb') as f:
    pickle.dump(output, f)

### HyperOpt Search