In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics

In [2]:
# Read the training and test sets from the parent directory.
# low_memory=False makes pandas parse each file in a single pass.
train, test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('../train.csv', '../test.csv')
)

In [3]:
# Display the training frame's (rows, columns) shape together with its first rows.
train.shape, train.head()

((18255, 1235),
    train_id  AA3  AA4  AA5  AA6     AA7  AA14  AA15   DG1  is_female  \
 0         0    3   32  3.0  NaN  323011  3854   481  1975          1   
 1         1    2   26  NaN  8.0  268131  2441   344  1981          1   
 2         2    1   16  NaN  7.0  167581   754   143  1995          1   
 3         3    4   44  5.0  NaN  445071  5705   604  1980          1   
 4         4    4   43  NaN  6.0  436161  5645   592  1958          1   
 
       ...       GN1  GN1_OTHERS GN2  GN2_OTHERS  GN3  GN3_OTHERS  GN4  \
 0     ...      99.0         NaN  99         NaN   99         NaN   99   
 1     ...       NaN         NaN   1         NaN    2         NaN    2   
 2     ...       1.0         NaN   2         NaN    2         NaN    2   
 3     ...       NaN         NaN   2         NaN    2         NaN   99   
 4     ...       NaN         NaN   1         NaN    1         NaN    1   
 
    GN4_OTHERS  GN5  GN5_OTHERS  
 0         NaN   99         NaN  
 1         NaN    2         Na

In [4]:
# Display the test frame's (rows, columns) shape together with its first rows.
test.shape, test.head()

((27285, 1234),
    test_id  AA3  AA4  AA5  AA6     AA7  AA14  AA15   DG1  DG3     ...      \
 0        0    4   41  NaN  7.0  417211  4479   535  1979    8     ...       
 1        1    3   32  2.0  NaN  322011  3803   476  1993    1     ...       
 2        2    3   36  5.0  NaN  365011  5610   585  1980    3     ...       
 3        3    2   24  NaN  7.0  247061  2550   350  1991    3     ...       
 4        4    3   35  NaN  8.0  358071  3233   400  1985    3     ...       
 
    GN1 GN1_OTHERS  GN2 GN2_OTHERS  GN3  GN3_OTHERS  GN4  GN4_OTHERS  GN5  \
 0  2.0        NaN    1        NaN    3         NaN    3         NaN    3   
 1  1.0        NaN    1        NaN    1         NaN    1         NaN    1   
 2  2.0        NaN    2        NaN    2         NaN    2         NaN    2   
 3  2.0        NaN    2        NaN    2         NaN    2         NaN    2   
 4  1.0        NaN    1        NaN    1         NaN    1         NaN    1   
 
    GN5_OTHERS  
 0         NaN  
 1         NaN  

In [5]:
# Columns that pandas parsed as `object` dtype in the training set are
# removed from both frames so the two keep the same feature columns.
cols = train.select_dtypes('object').columns
new_train, new_test = (frame.drop(columns=cols) for frame in (train, test))

In [6]:
# Second pass: columns that are still `object` dtype in the test set
# are removed from both frames as well.
cols = new_test.select_dtypes('object').columns
new_test = new_test.drop(columns=cols)
new_train = new_train.drop(columns=cols)

In [7]:
# Feature matrix: every remaining column except the target.
X = new_train.drop(columns=['is_female'])
# Target vector: the `is_female` column.
y = new_train['is_female']

In [8]:
# Remove the row-identifier columns from the train features and the test frame.
X = X.drop(columns=['train_id'])
new_test = new_test.drop(columns=['test_id'])

In [9]:
X.shape

(18255, 1122)

In [10]:
# Feature: number of non-missing values in each row.
# DataFrame.count(axis=1) computes this in a vectorised way; it returns the
# same per-row counts as applying `lambda x: x.count()` row by row, without
# calling a Python function once per row.
X['rowsum_not_na'] = X.count(axis=1)
new_test['rowsum_not_na'] = new_test.count(axis=1)

In [11]:
# Replace missing and infinite values with 0.
# The object-dtype columns were dropped earlier, so comparing the frame with
# the string 'Inf' can never match; infinities are matched by value instead
# (+inf and -inf), turned into NaN, and then filled together with the NaNs.
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

In [12]:
# Replace missing and infinite values with 0 in the test features.
# The object-dtype columns were dropped earlier, so comparing the frame with
# the string 'Inf' can never match; infinities are matched by value instead
# (+inf and -inf), turned into NaN, and then filled together with the NaNs.
new_test = new_test.replace([np.inf, -np.inf], np.nan).fillna(0)

In [13]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [14]:
# Display the training split's (rows, columns) shape together with its first rows.
X_train.shape, X_train.head()

((14604, 1123),
        AA3  AA4  AA5  AA6     AA7   AA14  AA15   DG1  DG3  DG3A  \
 17130    3   32  0.0  6.0  326021   3854   481  1956    3     4   
 8500     4   44  0.0  6.0  446171   5720   605  1976    3     4   
 2979     4   42  1.0  0.0  421011  99999   572  1996    3     4   
 3084     2   23  5.0  0.0  235021   1461   234  1998    1     2   
 13128    1   16  0.0  7.0  167041    850   164  1978    3     4   
 
            ...        LN2_1  LN2_2  LN2_3  LN2_4  GN1  GN2  GN3  GN4  GN5  \
 17130      ...            1      1      4      4  1.0    2    1    1    1   
 8500       ...            5      5      4      4  3.0    3    3    3    3   
 2979       ...            4      4      4      4  0.0    1    2    2    2   
 3084       ...            3      3      4      4  4.0    4    4    4    4   
 13128      ...            1      1      4      4  2.0    2    2    2    2   
 
        rowsum_not_na  
 17130            418  
 8500             416  
 2979             313  
 3084   

In [None]:
# Base random-forest estimator for the grid search.
# random_state is fixed (same seed as the train/validation split) so that
# the bootstrap samples and feature subsets are identical between runs.
rf = RandomForestClassifier(random_state=10)

In [None]:
# Hyper-parameter grid for the random forest.
# 'auto' is intentionally not listed for max_features: for classifiers it was
# an alias of 'sqrt' (so it only duplicated a candidate) and recent
# scikit-learn releases reject it.
grid = {
    'min_samples_leaf': [3, 5, 10],
    'max_features': ['log2', 'sqrt'],
    'n_estimators': [100],
}

# Exhaustive 5-fold cross-validated search over the grid.
clf = GridSearchCV(rf, grid, cv=5)
clf.fit(X_train, y_train)
clf.best_estimator_

In [None]:
# Column 1 of predict_proba is kept as the validation score for each row.
pred_val = clf.predict_proba(X_val)[:, 1]
pred_val

In [None]:
# ROC curve of the validation scores (label 1 treated as positive),
# followed by the area under that curve.
fpr, tpr, thresholds = metrics.roc_curve(
    y_val,
    pred_val,
    pos_label=1,
)
metrics.auc(x=fpr, y=tpr)

In [None]:
# Preview the first rows of the prepared test features.
new_test.head()

In [None]:
# List the dtype of every column in the prepared test features.
new_test.dtypes

In [None]:
# Column 1 of predict_proba is kept as the score for each test row.
pred_test = clf.predict_proba(new_test)[:, 1]

In [None]:
# Load the sample submission as a template for the output file.
# The path points at the parent directory, like the other input CSVs
# read by this notebook.
sub = pd.read_csv('../sample_submission.csv')
sub.head()

In [None]:
# Overwrite the template's `is_female` column with the predicted scores.
# Item assignment always targets a column; attribute assignment would create
# a plain Python attribute instead if the column did not already exist.
sub['is_female'] = pred_test

In [None]:
# Write the submission without the DataFrame index (`index` is a boolean flag).
sub.to_csv('sub1.csv', index=False)

In [15]:
import xgboost as xgb



In [16]:
# Gradient-boosted classifier with default settings; depth, child weight and
# number of trees are tuned by the search below.
model = xgb.XGBClassifier()
param_dist = {
    "max_depth": [3, 5, 10, 15],
    "min_child_weight": [3, 5, 10, 20],
    "n_estimators": [100],
}

# Randomized search: only n_iter_search of the 16 possible combinations are
# tried, each with 5-fold cross-validation. random_state fixes which
# combinations are drawn so that reruns evaluate the same candidates.
n_iter_search = 3
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    cv=5,
    n_iter=n_iter_search,
    random_state=10,
)
random_search.fit(X_train, y_train)
random_search.best_estimator_

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=5, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [17]:
# Column 1 of predict_proba is kept as the validation score for each row.
pred_val = random_search.predict_proba(X_val)[:, 1]
pred_val

array([ 0.99779838,  0.00731598,  0.01547817, ...,  0.00453813,
        0.36851692,  0.99749714], dtype=float32)

In [18]:
# ROC curve of the validation scores (label 1 treated as positive),
# followed by the area under that curve.
fpr, tpr, thresholds = metrics.roc_curve(
    y_val,
    pred_val,
    pos_label=1,
)
metrics.auc(x=fpr, y=tpr)

0.96835363069097657

In [19]:
# Column 1 of predict_proba is kept as the score for each test row.
pred_test = random_search.predict_proba(new_test)[:, 1]

In [21]:
# Load the sample submission from the parent directory as an output template,
# then preview its first rows.
sub = pd.read_csv(
    '../sample_submission.csv',
)
sub.head()

Unnamed: 0,test_id,is_female
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1


In [22]:
# Overwrite the template's `is_female` column with the predicted scores.
# Item assignment always targets a column; attribute assignment would create
# a plain Python attribute instead if the column did not already exist.
sub['is_female'] = pred_test

In [23]:
# Write the submission without the DataFrame index (`index` is a boolean flag).
sub.to_csv('sub_xgb_baseline_tmp.csv', index=False)

In [22]:
import numpy as np

# Four separate 1x4 arrays of ones, collected into a list.
i, j, k, l = (np.ones([1, 4]) for _ in range(4))
A = [i, j, k, l]

# One row per element of A; each row repeats that element's position four times.
[[position] * 4 for position in range(len(A))]

[[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3]]

array([[[[ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.]],

        [[ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.]],

        [[ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.]]],


       [[[ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.]],

        [[ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.]],

        [[ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.]]],


       [[[ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.]],

        [[ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.]],

        [[ 1.,  1.,  1.],
         [ 1.,  1.,  1.],
         [ 1.,  1.,  1.]]]])