In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
from datetime import datetime, date, timedelta

import tqdm
import time



In [2]:
data_df = pd.read_csv('./assets/Kaggle Raw Data/train_users_2.csv/train_users_2.csv')
print data_df.shape

(213451, 16)


In [3]:
data_df.age.fillna(-1,inplace=True)

data_df.drop('date_first_booking',axis=1,inplace=True)
data_df.first_affiliate_tracked.fillna(-1,inplace=True)

data_df['date_account_created'] = pd.to_datetime(data_df['date_account_created'])
data_df['year_acc_created'] = data_df.date_account_created.dt.year
data_df['month_acc_created'] = data_df.date_account_created.dt.month
data_df['day_acc_created'] = data_df.date_account_created.dt.day
data_df['dayofweek_acc_created'] = data_df.date_account_created.dt.dayofweek
data_df['week_of_year_acc_created'] = data_df.date_account_created.dt.weekofyear

tmstmp = data_df['timestamp_first_active'].astype(str)
data_df['timestamp_first_active'] = tmstmp.apply(lambda x: datetime.strptime(x, "%Y%m%d%H%M%S"))
data_df['year_1st_tmstmp'] = data_df.timestamp_first_active.dt.year
data_df['month_1st_tmstmp'] = data_df.timestamp_first_active.dt.month
data_df['day_1st_tmstmp'] = data_df.timestamp_first_active.dt.day
data_df['hour_1st_tmstmp'] = data_df.timestamp_first_active.dt.hour
data_df['dayofyear_1st_tmstmp'] = data_df.timestamp_first_active.dt.dayofweek
data_df['week_of_year_1st_tmstmp'] = data_df.timestamp_first_active.dt.weekofyear

data_df.drop(['date_account_created', 'timestamp_first_active'],axis=1,inplace=True)

brow_cnt = zip(data_df['first_browser'].value_counts().index, data_df['first_browser'].value_counts().values)
brow_other = [x[0] for x in brow_cnt if x[1]<150]
data_df.first_browser = data_df.first_browser.apply(lambda x: 'Other' if x in brow_other else x)

affprov_cnt = zip(data_df['affiliate_provider'].value_counts().index,
                  data_df['affiliate_provider'].value_counts().values)
affprov_other = [x[0] for x in affprov_cnt if x[1]<100]
data_df.affiliate_provider = data_df.affiliate_provider.apply(lambda x: 'Other' if x in affprov_other else x)

In [4]:
age_group = []
age_bckts = [21,25,28,32,37,45,55,65,75]
for cell in data_df.age:
    if cell< 0:
        aggp = '-unknown-'
    elif (cell < 15) or (cell > 1900):
        if (cell< 15) or (cell>1998):#data is from 2014, minimum age for travel is assumed 15.
            aggp = '28-32'
        else:
            cell == 2014 - cell
    elif cell<=21:
            aggp = '15-21'
    elif cell>75:
            aggp = '75+'
    else:
        for i, agbkt in enumerate(age_bckts):
            if cell<=agbkt:
                aggp = str(age_bckts[i-1])+'-'+str(agbkt)
                break
    age_group.append(aggp)
data_df['age_bckts'] = age_group

In [5]:
y = data_df['country_destination']
data_df.drop(['country_destination','id'],axis=1,inplace=True)

In [6]:
data_df.age = data_df.age.astype(int,copy=False)

In [7]:
X = pd.get_dummies(data_df, dummy_na=True)

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Columns: 120 entries, age to age_bckts_nan
dtypes: int32(1), int64(12), uint8(107)
memory usage: 42.1 MB


In [13]:
# for col in X.columns:
# #     if X[col].dtype=='float64':
# #         print col
#     print col,X[col].dtype

In [9]:
print X.shape
print y.shape

(213451, 120)
(213451L,)


In [10]:
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

def evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    a = accuracy_score(y_test, y_pred)
    
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    
    print 'cm: ','\n',cm
    print 'cr: ','\n',cr
    print a
    return a

all_models = {}

## KNN

In [12]:
a = evaluate_model(KNeighborsClassifier())

cm:  
[[    0     0     0     2     1     1     1   103     0     0    52     2]
 [    0     0     1     2     0     3     1   298     0     0   122     1]
 [    0     1     2     1     3     0     1   213     0     0    96     1]
 [    0     1     1     2     4     3     1   443     0     0   215     5]
 [    4     2     4     7    16     2     4  1009     0     0   449    10]
 [    0     1     2     4     8     3     1   466     0     0   205     7]
 [    2     1     3     6     7     1     4   597     1     1   225     3]
 [   18    45    26    69   155    46    53 30344     3     1  6481   122]
 [    0     1     0     0     1     0     1   150     0     0    75     1]
 [    0     0     0     1     0     0     0    47     0     0    16     1]
 [   15    41    25    62   140    32    31 12212     5     0  6042   108]
 [    2     7     6    10    15     2     9  2012     0     0   952    13]]
cr:  
             precision    recall  f1-score   support

         AU       0.00      0.00 

#### GridSearch

In [31]:
params = {'n_neighbors': range(2,40)}
gsknn = GridSearchCV(KNeighborsClassifier(),
                     params, cv=KFold(len(y), n_folds=3, shuffle=True))

In [32]:
gsknn.fit(X, y)

In [30]:
gsknn.best_params_

In [29]:
gsknn.best_score_

In [27]:
evaluate_model(gsknn.best_estimator_)

In [None]:
all_models['knn'] = {'model': gsknn.best_estimator_,
                     'score': a}

#### Bagging

In [5]:
from sklearn.ensemble import BaggingClassifier
baggingknn = BaggingClassifier(KNeighborsClassifier())

In [26]:
evaluate_model(baggingknn)

In [7]:
bagging_params = {'n_estimators': [10, 20],
                  'max_samples': [0.7, 1.0],
                  'max_features': [0.7, 1.0],
                  'bootstrap_features': [True, False]}


gsbaggingknn = GridSearchCV(baggingknn,
                            bagging_params, n_jobs=-1,
                            cv=KFold(len(y), n_folds=3, shuffle=True))

In [25]:
gsbaggingknn.fit(X, y)

In [24]:
gsbaggingknn.best_params_

In [23]:
all_models['gsbaggingknn'] = {'model': gsbaggingknn.best_estimator_,
                              'score': evaluate_model(gsbaggingknn.best_estimator_)}

##                                                         Decision Trees

In [22]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
all_models['dt'] = {'model': dt,
                    'score': evaluate_model(dt)}

for i in tqdm.tqdm(range(1000)):
    time.sleep(0.01)

#### GridSearch

In [20]:
params = {'criterion': ['gini', 'entropy'],
          'splitter': ['best', 'random'],
          'max_depth': [None, 5, 10],
          'min_samples_split': [2, 5],
          'min_samples_leaf': [1, 2, 3]}

gsdt = GridSearchCV(dt,
                    params, n_jobs=-1,
                    cv=KFold(len(y), n_folds=3, shuffle=True))

gsdt.fit(X, y)
print gsdt.best_params_
print gsdt.best_score_

all_models['gsdt'] = {'model': gsdt.best_estimator_,
                      'score': evaluate_model(gsdt.best_estimator_)}

#### Bagging

In [21]:
gsbaggingdt = GridSearchCV(BaggingClassifier(gsdt.best_estimator_),
                           bagging_params, n_jobs=-1,
                           cv=KFold(len(y), n_folds=3, shuffle=True))

gsbaggingdt.fit(X, y)

print gsbaggingdt.best_params_
print gsbaggingdt.best_score_

all_models['gsbaggingdt'] = {'model': gsbaggingdt.best_estimator_,
                             'score': evaluate_model(gsbaggingdt.best_estimator_)}

## Support Vector Machines

In [None]:
from sklearn.svm import SVC

svm = SVC()
all_models['svm'] = {'model': svm,
                     'score': evaluate_model(svm)}


#### GridSearch

In [None]:
params = {'C': [0.01, 0.1, 1.0, 10.0, 30.0, 100.0],
          'gamma': ['auto', 0.1, 1.0, 10.0],
          'kernel': ['linear', 'rbf']}


gssvm = GridSearchCV(svm,
                    params, n_jobs=-1,
                    cv=KFold(len(y), n_folds=3, shuffle=True))

gssvm.fit(X, y)
print gssvm.best_params_
print gssvm.best_score_

all_models['gssvm'] = {'model': gssvm.best_estimator_,
                      'score': evaluate_model(gssvm.best_estimator_)}

#### Bagging

In [None]:
gsbaggingsvm = GridSearchCV(BaggingClassifier(gssvm.best_estimator_),
                           bagging_params, n_jobs=-1,
                           cv=KFold(len(y), n_folds=3, shuffle=True))

gsbaggingsvm.fit(X, y)

print gsbaggingsvm.best_params_
print gsbaggingsvm.best_score_

all_models['gsbaggingsvm'] = {'model': gsbaggingsvm.best_estimator_,
                             'score': evaluate_model(gsbaggingsvm.best_estimator_)}

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

rf = RandomForestClassifier()
all_models['rf'] = {'model': rf,
                    'score': evaluate_model(rf)}


In [None]:
gsrf = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                    params, n_jobs=-1,
                    cv=KFold(len(y), n_folds=3, shuffle=True))

## XGBoost

In [14]:
from xgboost.sklearn import XGBClassifier

In [15]:
params1 = {
    'objective': 'binary:logistic',
    'max_depth': 5,
    'learning_rate': 0.25,
    'silent': 1.0,
    'n_estimators': 200
}

In [16]:
xgbst1 = XGBClassifier(**params1).fit(X_train, y_train)

In [17]:
prediction_xgbst1 = xgbst1.predict(X_test)

In [18]:
correct = 0

for i in range(len(prediction_xgbst1)):
    if (y_test.iloc[i] == prediction_xgbst1[i]):
        correct += 1
        
acc = accuracy_score(y_test, prediction_xgbst1)

print('Predicted correctly: {0}/{1}'.format(correct, len(prediction_xgbst1)))
print('Error: {0:.4f}'.format(1-acc))

Predicted correctly: 40648/64036
Error: 0.3652


In [19]:
all_models['xgbst1'] = {'model': xgbst1,
                    'score': evaluate_model(xgbst1)}

  'precision', 'predicted', average, warn_for)


cm:  
[[    0     0     0     0     0     0     0    88     0     0    74     0]
 [    0     0     0     0     0     0     0   220     0     0   208     0]
 [    0     0     0     0     0     0     0   174     0     0   144     0]
 [    0     0     0     0     0     0     0   376     0     0   299     0]
 [    0     0     0     0     0     0     0   809     0     0   698     0]
 [    0     0     0     0     0     0     0   391     0     0   306     0]
 [    0     0     0     0     0     0     0   489     0     0   362     0]
 [    0     0     1     1     3     1     0 31714     0     0  5639     4]
 [    0     0     0     0     0     0     0   127     0     0   102     0]
 [    0     0     0     0     0     0     0    36     0     0    29     0]
 [    0     1     0     0     7     1     0  9769     0     0  8931     4]
 [    0     0     0     1     0     0     0  1682     0     0  1342     3]]
cr:  
             precision    recall  f1-score   support

         AU       0.00      0.00 

In [22]:
params2 = {
    'objective': 'binary:logistic',
    'max_depth': 5,
    'learning_rate': 0.05,
    'silent': 1.0,
    'n_estimators': 750,
    'subsample': 0.7,
    'colsample_bytree':0.7,
    'reg_alpha':0,
    'reg_lambda':1.
}

In [23]:
xgbst2 = XGBClassifier( **params2).fit(X_train, y_train)

In [24]:
prediction_xgbst2 = xgbst2.predict(X_test)

In [25]:
correct = 0

for i in range(len(prediction_xgbst2)):
    if (y_test.iloc[i] == prediction_xgbst2[i]):
        correct += 1
        
acc = accuracy_score(y_test, prediction_xgbst2)

print('Predicted correctly: {0}/{1}'.format(correct, len(prediction_xgbst2)))
print('Error: {0:.4f}'.format(1-acc))

Predicted correctly: 13847/20955
Error: 0.3392


In [26]:
all_models['xgbst1'] = {'model': xgbst2,
                    'score': evaluate_model(xgbst2)}

0.660796945836
