In [60]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from pandas_summary import DataFrameSummary
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import metrics

In [61]:
train = pd.read_csv("../train.csv")
test = pd.read_csv("../test.csv")
del train['train_id']
id = test['test_id']
del test['test_id']
train.shape, test.shape

((18255, 1234), (27285, 1233))

In [62]:
col = train.isnull().sum().sort_values(ascending = False)<0.99*train.shape[0]
train = train.loc[:,col]
train.head(3)

Unnamed: 0,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,DG3,...,LN2_2,LN2_3,LN2_4,LN2_RIndLngBEOth,LN2_WIndLngBEOth,GN1,GN2,GN3,GN4,GN5
0,3,32,3.0,,323011,3854,481,1975,1,3,...,1,1,1,,,99.0,99,99,99,99
1,2,26,,8.0,268131,2441,344,1981,1,8,...,1,3,4,Bengali,Bengali,,1,2,2,2
2,1,16,,7.0,167581,754,143,1995,1,3,...,1,2,2,Hindi,Hindi,1.0,2,2,2,2


In [63]:

index = col[(col == True)].index
test = test.loc[:,index]
del test['is_female']
test.shape, train.shape

((27285, 699), (18255, 700))

### Convert all float into int

In [64]:
col2 = list(train.select_dtypes(include=['object']).columns)
col2

['DL1_OTHERS',
 'DL2_96_OTHERS',
 'DL4_OTHERS',
 'MM12_REC',
 'FL9A_OTHERS',
 'FB19_6_OTHERS',
 'LN2_RIndLngBEOth',
 'LN2_WIndLngBEOth']

In [65]:
train.loc[:,col2] = train.loc[:,col2].fillna("missing")
test.loc[:,col2] = test.loc[:,col2].fillna("missing")

In [66]:
train["row_nulls"] = train.isnull().sum(axis=1)/float(train.shape[1])
test["row_nulls"] = test.isnull().sum(axis=1)/float(test.shape[1])
train.fillna(1000,inplace=True)
test.fillna(1000,inplace=True)

In [67]:
float_col = list(train.select_dtypes(include=['float64']).columns)

In [68]:
train[float_col] = train[float_col].astype("int64")
test[float_col] = test[float_col].astype("int64")

In [69]:
train.head(3)

Unnamed: 0,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,DG3,...,LN2_3,LN2_4,LN2_RIndLngBEOth,LN2_WIndLngBEOth,GN1,GN2,GN3,GN4,GN5,row_nulls
0,3,32,3,1000,323011,3854,481,1975,1,3,...,1,1,missing,missing,99,99,99,99,99,0
1,2,26,1000,8,268131,2441,344,1981,1,8,...,3,4,Bengali,Bengali,1000,1,2,2,2,0
2,1,16,1000,7,167581,754,143,1995,1,3,...,2,2,Hindi,Hindi,1,2,2,2,2,0


In [70]:
col = list(train.select_dtypes(include=['object','int64']).columns)
v = (train[col].apply(lambda x: x.nunique(), axis=0)>3).reset_index()
col_filtered = list(v.loc[(v.iloc[:,1]==True),'index'])
u = (train[col_filtered].nunique()<11).reset_index()
col_filtered = list(u.loc[(u.iloc[:,1]==True),'index'])

### We can do target mean encoding on all these object columns

In [71]:
def mean_encoding(df, test, col):
    for item in col:
        mean_device_type = df.groupby(item).is_female.mean()
        test[item+"_mean_enc"] = test[item].map(mean_device_type)
        df[item+"_mean_enc"] = df[item].map(mean_device_type)
    return df,test

In [72]:
def process_data(train,test):
    combine = pd.concat([train,test])
    object_cols = list(test.select_dtypes(include=['object','category']).columns)
    for c in object_cols:
        combine[c] = combine[c].astype('category').cat.codes+1  
    train = combine.iloc[:train.shape[0],]
    test = combine.iloc[train.shape[0]:,]
    return train,test

In [73]:
train,test = mean_encoding(train, test, col_filtered)
train,test = process_data(train,test)

In [74]:
global_mean = train.is_female.mean()
test.fillna(global_mean, inplace=True)

In [75]:
del test['is_female']
test.head(3), train.shape

(   AA14  AA15  AA3  AA3_mean_enc  AA4   AA5  AA5_mean_enc   AA6  AA6_mean_enc  \
 0  4479   535    4      0.541462   41  1000      0.538803     7      0.534333   
 1  3803   476    3      0.476144   32     2      0.524664  1000      0.533345   
 2  5610   585    3      0.476144   36     5      0.528686  1000      0.533345   
 
       AA7    ...      MT6B  MT6B_mean_enc  MT6C  MT6_mean_enc   MT7  MT7A  \
 0  417211    ...      1000       0.701275  1000      0.701275     2  1000   
 1  322011    ...         1       0.363453     0      0.179292  1000  1000   
 2  365011    ...      1000       0.701275  1000      0.701275     2  1000   
 
    MT7A_mean_enc   MT8   MT9  row_nulls  
 0       0.488904  1000    12          0  
 1       0.488904  1000  1000          0  
 2       0.488904  1000    13          0  
 
 [3 rows x 888 columns], (18255, 889))

In [76]:
y = train['is_female']
X = train.drop(['is_female'], axis = 1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=10)

In [77]:
def auc(m, train, val): 
    return (roc_auc_score(y_train,m.predict_proba(train)[:,1]),
                            roc_auc_score(y_val,m.predict_proba(val)[:,1]))

In [25]:
model = xgb.XGBClassifier()
param_dist = {"max_depth": [5, 10, 15, 20],
              "min_child_weight" : [5,10,15,20],
             "n_estimators" : [100, 200]
             }

# # run randomized search
# n_iter_search = 15
random_search = GridSearchCV(model, n_jobs=-1, param_grid=param_dist, cv = 5, scoring='roc_auc', verbose=1)
# random_search = RandomizedSearchCV(model, param_distributions=param_dist, cv = 5, n_iter=n_iter_search, scoring='roc_auc', random_state=0, n_jobs=-1)
# random_search.fit(X_train, y_train)
# random_search.best_estimator_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [15], 'min_child_weight': [5], 'n_estimators': [100], 'learning_rate': [0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [19]:
random_search.grid_scores_


NameError: name 'random_search' is not defined

In [78]:
random_search = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=15,
       min_child_weight=5, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [79]:
from sklearn.metrics import roc_auc_score

In [80]:
random_search.fit(X_train, y_train)
pred_val = random_search.predict_proba(X_val)[:,1]
print(pred_val[0:10])
pred_test = random_search.predict_proba(test)[:,1]
print(pred_test[0:10])
roc_auc_score(y_val,pred_val)

[ 0.99856669  0.00926302  0.01287946  0.99801624  0.9789685   0.99516934
  0.00695484  0.50977713  0.98463559  0.34159023]
[ 0.99921274  0.04332031  0.9833113   0.98809183  0.71574455  0.9958331
  0.13863802  0.99042928  0.00456238  0.20806153]


0.96958855959457024

In [81]:
pred_val = pd.DataFrame(pred_val)
pred_val.to_csv('val_xgb_11_non_mode.csv', index = None)
# sub = pd.read_csv('../sample_submission.csv')
# sub.is_female = pred_test
# sub.to_csv('test_xgb_11.csv', index = None)

In [24]:
# XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
#        gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=15,
#        min_child_weight=5, missing=None, n_estimators=100, nthread=-1,
#        objective='binary:logistic', reg_alpha=0, reg_lambda=1,
#        scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [45]:
random_search.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=15,
       min_child_weight=5, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)>

In [49]:
random_search.fit(X, y)
pred_test = random_search.predict_proba(test)[:,1]
print(pred_test[0:10])

[ 0.99842095  0.0268812   0.97279704  0.98857182  0.58761621  0.99531639
  0.03465364  0.97456282  0.0027916   0.08267953]


In [26]:
sub = pd.read_csv('../sample_submission.csv')
sub.is_female = pred_test
sub.to_csv('test_xgb_11_full_mode_imp.csv', index = None)

In [30]:
# y_val = pd.DataFrame(y_val)
y_val.to_csv('y_val.csv', index=None)

In [30]:
f1 = pd.read_csv('submission_ensemble2.csv')
f1.head()
mean_pred = np.mean([f1['is_female'], sub['is_female']], axis = 0)

In [32]:
sub = pd.read_csv('../sample_submission.csv')
sub.head()

sub.is_female = mean_pred

sub.to_csv('sub_xgb_int_21_ensemble_with_best.csv', index = None)