In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn import model_selection, tree, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from xgboost.sklearn import XGBClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier, LGBMRegressor
import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv(r"C:\Users\panda4\Documents\Data Science\Online\train_8wry4cB.csv")
data

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002/B00003/C00006/D28435/;A00002/B00003/C00...,female
1,u10253,16/12/14 14:35,16/12/14 14:41,A00001/B00009/C00031/D29404/;A00001/B00009/C00...,male
2,u19037,01/12/14 15:58,01/12/14 15:58,A00002/B00001/C00020/D16944/,female
3,u14556,23/11/14 2:57,23/11/14 3:00,A00002/B00004/C00018/D10284/;A00002/B00004/C00...,female
4,u24295,17/12/14 16:44,17/12/14 16:46,A00001/B00001/C00012/D30805/;A00001/B00001/C00...,male
5,u20964,12/12/14 15:31,12/12/14 15:31,A00002/B00002/C00081/D25039/,female
6,u20981,05/12/14 19:33,05/12/14 19:34,A00002/B00011/C00180/D18487/;A00002/B00011/C00...,female
7,u15947,09/12/14 16:02,09/12/14 16:06,A00002/B00001/C00059/D15254/;A00002/B00001/C00...,male
8,u18193,25/11/14 11:55,25/11/14 11:55,A00002/B00001/C00010/D06129/,male
9,u21820,06/12/14 16:16,06/12/14 16:17,A00002/B00001/C00010/D12841/;A00002/B00002/C00...,female


In [2]:
train_data = \
(data.set_index(data.columns.drop('ProductList',1).tolist())
   .ProductList.str.split('/', expand=True)
   .stack()
   .reset_index()
   .rename(columns={0:'ProductList'})
   .loc[:, data.columns]
)
train_data['ProductList'] = train_data['ProductList'].str.replace(';','')
train_data.drop_duplicates(inplace=True)
train_data['ProductList'].replace('', np.nan, inplace=True)
train_data.dropna(inplace=True)
train_data.reset_index(drop=True)

Unnamed: 0,session_id,startTime,endTime,ProductList,gender
0,u16159,15/12/14 18:11,15/12/14 18:12,A00002,female
1,u16159,15/12/14 18:11,15/12/14 18:12,B00003,female
2,u16159,15/12/14 18:11,15/12/14 18:12,C00006,female
3,u16159,15/12/14 18:11,15/12/14 18:12,D28435,female
4,u16159,15/12/14 18:11,15/12/14 18:12,D02554,female
5,u16159,15/12/14 18:11,15/12/14 18:12,D28436,female
6,u16159,15/12/14 18:11,15/12/14 18:12,D28437,female
7,u10253,16/12/14 14:35,16/12/14 14:41,A00001,male
8,u10253,16/12/14 14:35,16/12/14 14:41,B00009,male
9,u10253,16/12/14 14:35,16/12/14 14:41,C00031,male


In [3]:
train_data['startTime'] = pd.to_datetime(train_data['startTime'])
train_data['endTime'] = pd.to_datetime(train_data['endTime'])
train_data['startTime_day'] = train_data['startTime'].dt.day
train_data['startTime_month'] = train_data['startTime'].dt.month
train_data['startTime_year'] = train_data['startTime'].dt.year
train_data['startTime_hour'] = train_data['startTime'].dt.hour
train_data['startTime_minute'] = train_data['startTime'].dt.minute
train_data['endTime_day'] = train_data['endTime'].dt.day
train_data['endTime_month'] = train_data['endTime'].dt.month
train_data['endTime_year'] = train_data['endTime'].dt.year
train_data['endTime_hour'] = train_data['endTime'].dt.hour
train_data['endTime_minute'] = train_data['endTime'].dt.minute
train_data['session_id'] = train_data['session_id'].str.replace('u','')
train_data['ProductList'] = train_data['ProductList'].str[1:]
train_data.reset_index(drop=True, inplace=True)

In [4]:
X = train_data.drop(['startTime', 'endTime', 'gender'], axis=1)
X

Unnamed: 0,session_id,ProductList,startTime_day,startTime_month,startTime_year,startTime_hour,startTime_minute,endTime_day,endTime_month,endTime_year,endTime_hour,endTime_minute
0,16159,00002,15,12,2014,18,11,15,12,2014,18,12
1,16159,00003,15,12,2014,18,11,15,12,2014,18,12
2,16159,00006,15,12,2014,18,11,15,12,2014,18,12
3,16159,28435,15,12,2014,18,11,15,12,2014,18,12
4,16159,02554,15,12,2014,18,11,15,12,2014,18,12
5,16159,28436,15,12,2014,18,11,15,12,2014,18,12
6,16159,28437,15,12,2014,18,11,15,12,2014,18,12
7,10253,00001,16,12,2014,14,35,16,12,2014,14,41
8,10253,00009,16,12,2014,14,35,16,12,2014,14,41
9,10253,00031,16,12,2014,14,35,16,12,2014,14,41


In [5]:
y = train_data['gender'].map({'female':1, 'male':0})
y = pd.DataFrame(y, columns=['gender'])
y

Unnamed: 0,gender
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,0
8,0
9,0


In [6]:
test_size = 0.3
seed = 3
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(X, y, test_size = test_size, random_state = seed)

In [7]:
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [8]:
rfc_predict = rfc.predict(X_valid)

In [9]:
rfc_cv_score = cross_val_score(rfc, X, y, cv=10, scoring='accuracy')

In [10]:
print("=== Confusion Matrix ===")
print(confusion_matrix(y_valid, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_valid, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())
print(accuracy_score(y_valid, rfc_predict))

=== Confusion Matrix ===
[[ 3103   800]
 [  224 13995]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.93      0.80      0.86      3903
           1       0.95      0.98      0.96     14219

    accuracy                           0.94     18122
   macro avg       0.94      0.89      0.91     18122
weighted avg       0.94      0.94      0.94     18122



=== All AUC Scores ===
[0.8097997  0.81509684 0.82320808 0.79903989 0.80086078 0.80960265
 0.80331126 0.80546358 0.80115894 0.80711921]


=== Mean AUC Score ===
Mean AUC Score - Random Forest:  0.8074660926275309
0.9434940955744399


In [28]:
data_test = pd.read_csv(r"C:\Users\panda4\Documents\Data Science\Online\test_Yix80N0.csv")
data_test

Unnamed: 0,session_id,startTime,endTime,ProductList
0,u12112,08/12/14 13:36,08/12/14 13:36,A00002/B00003/C00006/D19956/
1,u19725,19/12/14 13:52,19/12/14 13:52,A00002/B00005/C00067/D02026/
2,u11795,01/12/14 10:44,01/12/14 10:44,A00002/B00002/C00004/D12538/
3,u22639,08/12/14 20:19,08/12/14 20:22,A00002/B00003/C00079/D22781/;A00002/B00003/C00...
4,u18034,15/12/14 19:33,15/12/14 19:33,A00002/B00001/C00010/D23419/
5,u11327,27/11/14 8:41,27/11/14 8:47,A00001/B00009/C00028/D13690/;A00001/B00009/C00...
6,u12768,26/11/14 21:23,26/11/14 21:24,A00001/B00031/C00044/D13454/
7,u17011,25/11/14 20:02,25/11/14 20:03,A00001/B00009/C00028/D11956/
8,u13527,20/12/14 16:26,20/12/14 16:26,A00002/B00003/C00005/D26497/
9,u24492,18/12/14 13:15,18/12/14 13:19,A00001/B00001/C00019/D04978/;A00001/B00001/C00...


In [29]:
test_data = \
(data_test.set_index(data_test.columns.drop('ProductList',1).tolist())
   .ProductList.str.split('/', expand=True)
   .stack()
   .reset_index()
   .rename(columns={0:'ProductList'})
   .loc[:, data_test.columns]
)

test_data['ProductList'] = test_data['ProductList'].str.replace(';','')
test_data.drop_duplicates(inplace=True)
test_data['ProductList'].replace('', np.nan, inplace=True)
test_data.dropna(inplace=True)
test_data.reset_index(drop=True)
test_data

Unnamed: 0,session_id,startTime,endTime,ProductList
0,u12112,08/12/14 13:36,08/12/14 13:36,A00002
1,u12112,08/12/14 13:36,08/12/14 13:36,B00003
2,u12112,08/12/14 13:36,08/12/14 13:36,C00006
3,u12112,08/12/14 13:36,08/12/14 13:36,D19956
5,u19725,19/12/14 13:52,19/12/14 13:52,A00002
6,u19725,19/12/14 13:52,19/12/14 13:52,B00005
7,u19725,19/12/14 13:52,19/12/14 13:52,C00067
8,u19725,19/12/14 13:52,19/12/14 13:52,D02026
10,u11795,01/12/14 10:44,01/12/14 10:44,A00002
11,u11795,01/12/14 10:44,01/12/14 10:44,B00002


In [30]:
test_data['startTime'] = pd.to_datetime(test_data['startTime'])
test_data['endTime'] = pd.to_datetime(test_data['endTime'])
test_data['startTime_day'] = test_data['startTime'].dt.day
test_data['startTime_month'] = test_data['startTime'].dt.month
test_data['startTime_year'] = test_data['startTime'].dt.year
test_data['startTime_hour'] = test_data['startTime'].dt.hour
test_data['startTime_minute'] = test_data['startTime'].dt.minute
test_data['endTime_day'] = test_data['endTime'].dt.day
test_data['endTime_month'] = test_data['endTime'].dt.month
test_data['endTime_year'] = test_data['endTime'].dt.year
test_data['endTime_hour'] = test_data['endTime'].dt.hour
test_data['endTime_minute'] = test_data['endTime'].dt.minute
test_data['session_id'] = test_data['session_id'].str.replace('u','')
test_data['ProductList'] = test_data['ProductList'].str[1:]
test_data.reset_index(drop=True, inplace=True)

In [31]:
X_test = test_data.drop(['startTime', 'endTime'], axis=1)
X_test

Unnamed: 0,session_id,ProductList,startTime_day,startTime_month,startTime_year,startTime_hour,startTime_minute,endTime_day,endTime_month,endTime_year,endTime_hour,endTime_minute
0,12112,00002,12,8,2014,13,36,12,8,2014,13,36
1,12112,00003,12,8,2014,13,36,12,8,2014,13,36
2,12112,00006,12,8,2014,13,36,12,8,2014,13,36
3,12112,19956,12,8,2014,13,36,12,8,2014,13,36
4,19725,00002,19,12,2014,13,52,19,12,2014,13,52
5,19725,00005,19,12,2014,13,52,19,12,2014,13,52
6,19725,00067,19,12,2014,13,52,19,12,2014,13,52
7,19725,02026,19,12,2014,13,52,19,12,2014,13,52
8,11795,00002,12,1,2014,10,44,12,1,2014,10,44
9,11795,00002,12,1,2014,10,44,12,1,2014,10,44


In [33]:
pred = rfc.predict(X_test)
pred


session = data_test['session_id']
subm = pd.DataFrame(pred, columns=['gender'])
subm1 = pd.concat([session, subm], axis=1)
subm1.dropna(inplace=True)
subm1.drop_duplicates(subset = 'session_id', keep="last", inplace=True)
subm1['gender'] = subm1['gender'].map({1:'female', 0:'male'})
subm1.reset_index(drop=True, inplace=True)
subm1.to_csv('submission.csv', index=False)
subm1

Unnamed: 0,session_id,gender
0,u12112,female
1,u19725,female
2,u11795,female
3,u22639,female
4,u18034,female
5,u11327,female
6,u12768,female
7,u17011,female
8,u13527,female
9,u24492,female
