In [1]:
import os
import sys

# Folder that holds the project's helper scripts (the original note says it is
# the parent folder where CV.py lives; presumably PoverTHelperTools / PoverTCV
# are imported from here too — confirm). Set the POVERT_SCRIPTS_DIR environment
# variable to run the notebook on another machine; the default is the original
# author's local path.
SCRIPTS_DIR = os.environ.get("POVERT_SCRIPTS_DIR",
                             "/Users/Bing/Documents/DS/DrivenData/Pover-T/Scripts/")
# Guarded so that re-running this cell does not keep appending duplicates.
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [2]:
import pandas as pd
import numpy as np
from PoverTHelperTools import *
from PoverTCV import *


from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.metrics import log_loss




In [3]:
# Household-level train/test tables. Each loader returns three objects that are
# unpacked as a / b / c (presumably one table per country A, B, C — confirm in
# the helper module the loaders come from).
hhold_a_train, hhold_b_train, hhold_c_train = load_hhold_train()
hhold_a_test, hhold_b_test, hhold_c_test = load_hhold_test()

# Individual-level train/test tables, unpacked with the same a / b / c naming.
indiv_a_train, indiv_b_train, indiv_c_train = load_indiv_train()
indiv_a_test, indiv_b_test, indiv_c_test = load_indiv_test()

In [None]:
# Quick look at the first rows of hhold_b_train.
hhold_b_train.head()


In [None]:
def standardize(df):
    """
    Z-score the numeric columns of ``df`` and return the result as a new frame.

    Only ``int64`` and ``float64`` columns are scaled (subtract the column
    mean, divide by the column standard deviation); every other column is
    passed through unchanged. The mean and std are computed from ``df``
    itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is NOT modified; a copy is returned.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected numeric columns standardized.
        Columns with zero (or undefined) spread come back as zeros at their
        non-missing positions instead of NaN.
    """
    # Work on a copy so the caller's frame is not silently rewritten.
    out = df.copy()
    numeric = out.select_dtypes(include=['int64', 'float64'])
    if numeric.shape[1] == 0:
        return out

    # A constant column has std == 0 (and a single-row frame has std == NaN);
    # dividing by that would turn the whole column into NaN. Divide by 1
    # instead so the centered values (all zeros) are kept.
    std = numeric.std().replace(0, 1).fillna(1)

    # subtract mean and divide by std
    out[numeric.columns] = (numeric - numeric.mean()) / std

    return out

def pre_process_data(df, enforce_cols = None):
    """
    Standardize the numeric columns and one hot encode the categorical columns

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame. It is not modified; a processed copy is returned.
    enforce_cols : sequence of column labels, optional
        Column layout the result must have (typically the columns of the
        processed training frame). Columns not listed are dropped, listed
        columns that are missing are added filled with 0, and the result is
        put in exactly this order so it lines up with the training matrix.

    Returns
    -------
    pandas.DataFrame
        Standardized, one-hot-encoded frame with remaining NaN replaced by 0.

    Notes
    -----
    ``standardize`` uses the mean/std of the frame it is given, so a test
    frame passed here is scaled with its own statistics, not the training
    set's.
    """

    print('Input shape:\t{}'.format(df.shape))

    # Hand standardize() a copy so the caller's frame is never rewritten.
    df = standardize(df.copy())
    print('After standardization {}'.format(df.shape))

    # get dummies for categoricals
    df = pd.get_dummies(df)
    print('After converting categoricals:\t{}'.format(df.shape))
    df.info()

    # match test set and training set columns
    if enforce_cols is not None:
        # reindex drops the extra columns, adds the missing ones as 0 AND
        # restores the training column order. Appending the missing columns at
        # the end would leave the features misaligned for any estimator that
        # consumes the frame positionally.
        df = df.reindex(columns=enforce_cols, fill_value=0)

    # just fill NaN with 0 for now. Later try fill it with mean or explore more to figure out how to best impute
    return df.fillna(0)

import lightgbm as lgb
# Parameter dict for the native lgb.train API. Values are real numbers rather
# than strings so they match the numeric settings used in train_lgb.
# NOTE(review): no visible cell reads lgb_params — confirm it is still needed.
lgb_params = {'objective': 'binary',
              'learning_rate': 0.1,
              'num_threads': 4,
              }

def train_lgb(X,y):
    """
    Build the LightGBM classifier used in the cross-validation loop.

    Despite the name, the model is returned UNFITTED: the caller is expected
    to call ``model.fit(...)`` itself.

    Parameters
    ----------
    X, y
        Currently unused; kept so existing ``train_lgb(X, y)`` calls keep
        working.

    Returns
    -------
    lgb.LGBMClassifier
        Unfitted binary classifier (10 trees, learning rate 0.05, 4 threads).
    """
    return lgb.LGBMClassifier(n_estimators = 10,
                              objective = 'binary',
                              num_threads = 4,
                              learning_rate = 0.05)

In [None]:
# Features / target for hhold_c_train: drop the label and the 'country' column,
# preprocess the rest, and cast the 'poor' label to an integer array.
X = hhold_c_train.drop(['poor', 'country'], axis = 1)
X = pre_process_data(X)
y = hhold_c_train['poor'].values.astype(int)

n = 3
skf = StratifiedKFold(n_splits=n)
fold_losses = []  # validation log loss of every fold, summarised after the loop
for i, (train_index, val_index) in enumerate(skf.split(X,y)):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Models here

    print('*********\nTraining model on Fold {}'.format(i))
    model = train_lgb(X_train, y_train)
    model.fit(X_train, y_train)

    preds = model.predict_proba(X_val)
    # Give log_loss the fitted class order explicitly so the probability
    # columns are matched to the right labels even if a validation fold
    # happens to contain a single class.
    logloss = log_loss(y_true = y_val, y_pred= preds, labels = model.classes_)
    fold_losses.append(logloss)
    print('\nLog Loss for Fold {} : {}\n'.format(i,logloss))

print('Mean Log Loss over {} folds: {}'.format(n, np.mean(fold_losses)))
    
    
    

    

In [None]:
def train_lgb(X,y):
    """
    Build the LightGBM classifier used in the cross-validation loop.

    NOTE(review): this cell redefines ``train_lgb`` with the same settings as
    the definition in the earlier cell and shadows it; keep only one of them.

    Despite the name, the model is returned UNFITTED: the caller is expected
    to call ``model.fit(...)`` itself.

    Parameters
    ----------
    X, y
        Currently unused; kept so existing ``train_lgb(X, y)`` calls keep
        working.

    Returns
    -------
    lgb.LGBMClassifier
        Unfitted binary classifier (10 trees, learning rate 0.05, 4 threads).
    """
    return lgb.LGBMClassifier(n_estimators = 10,
                              objective = 'binary',
                              num_threads = 4,
                              learning_rate = 0.05)

In [None]:
# Inspect the 'poor' column of hhold_a_train cast to an integer array
# (the same cast the CV cell applies to build its target vector).
hhold_a_train['poor'].values.astype(int)

In [4]:
## using 50 estimators, submitted to LB and got 0.46511
## this was improvement over 10 estimators which was 0.51925
# (LB = leaderboard.) The same LGB estimator object is handed to StratifiedKF
# for each of the three household training tables, 5 folds each.
LGB = lgb.LGBMClassifier(n_estimators = 50,
                              objective = 'binary', 
                              num_threads = 4,
                              learning_rate = 0.05)
StratifiedKF(hhold_a_train, n_splits = 5, model = LGB)
StratifiedKF(hhold_b_train, n_splits = 5, model = LGB)
StratifiedKF(hhold_c_train, n_splits = 5, model = LGB)

Input shape:	(8203, 343)
After standardization (8203, 343)
After converting categoricals:	(8203, 858)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8203 entries, 46107 to 39832
Columns: 858 entries, nEsgxvAq to AlDbXTlZ_cecIq
dtypes: float64(4), uint8(854)
memory usage: 7.0 MB
Training model on Fold 0

Train Log Loss for Fold 0: 0.30031393913071797
Validation Log Loss for Fold 0: 0.3419795520828201

Training model on Fold 1

Train Log Loss for Fold 1: 0.2992099251190147
Validation Log Loss for Fold 1: 0.3466854565389969

Training model on Fold 2

Train Log Loss for Fold 2: 0.2958701127076676
Validation Log Loss for Fold 2: 0.35914346374978195

Training model on Fold 3

Train Log Loss for Fold 3: 0.30015379419319776
Validation Log Loss for Fold 3: 0.346151772180263

Training model on Fold 4

Train Log Loss for Fold 4: 0.2980108820786507
Validation Log Loss for Fold 4: 0.3579809284676522

Input shape:	(3255, 440)
After standardization (3255, 440)
After converting categoricals:	(3255,

In [5]:
## using 100 estimators here (the 50-estimator run scored 0.46511 on the LB,
## which was an improvement over 10 estimators at 0.51925)
# NOTE(review): no leaderboard score is recorded for this 100-estimator setting.
LGB = lgb.LGBMClassifier(n_estimators = 100,
                              objective = 'binary', 
                              num_threads = 4,
                              learning_rate = 0.05
                              )
StratifiedKF(hhold_a_train, n_splits = 5, model = LGB)
StratifiedKF(hhold_b_train, n_splits = 5, model = LGB)
StratifiedKF(hhold_c_train, n_splits = 5, model = LGB)

Input shape:	(8203, 343)
After standardization (8203, 343)
After converting categoricals:	(8203, 858)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8203 entries, 46107 to 39832
Columns: 858 entries, nEsgxvAq to AlDbXTlZ_cecIq
dtypes: float64(4), uint8(854)
memory usage: 7.0 MB
Training model on Fold 0

Train Log Loss for Fold 0: 0.20732952243816363
Validation Log Loss for Fold 0: 0.2951030977329212

Training model on Fold 1

Train Log Loss for Fold 1: 0.2076755199597848
Validation Log Loss for Fold 1: 0.2950823640716657

Training model on Fold 2

Train Log Loss for Fold 2: 0.2054084064077528
Validation Log Loss for Fold 2: 0.3033640988838772

Training model on Fold 3

Train Log Loss for Fold 3: 0.20859026080339288
Validation Log Loss for Fold 3: 0.29284094890326373

Training model on Fold 4

Train Log Loss for Fold 4: 0.20664344989476197
Validation Log Loss for Fold 4: 0.30450283977840614

Input shape:	(3255, 440)
After standardization (3255, 440)
After converting categoricals:	(32

In [6]:
# Baseline comparison: the same 5-fold StratifiedKF run on hhold_a_train, this
# time with a default-parameter LogisticRegression as the model.
from sklearn.linear_model import LogisticRegression
StratifiedKF(hhold_a_train, n_splits = 5, model = LogisticRegression())

Input shape:	(8203, 343)
After standardization (8203, 343)
After converting categoricals:	(8203, 858)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8203 entries, 46107 to 39832
Columns: 858 entries, nEsgxvAq to AlDbXTlZ_cecIq
dtypes: float64(4), uint8(854)
memory usage: 7.0 MB
Training model on Fold 0

Train Log Loss for Fold 0: 0.22823990017457374
Validation Log Loss for Fold 0: 0.3059116912596034

Training model on Fold 1

Train Log Loss for Fold 1: 0.23005498987206427
Validation Log Loss for Fold 1: 0.3057219540590821

Training model on Fold 2

Train Log Loss for Fold 2: 0.23532374467478295
Validation Log Loss for Fold 2: 0.27715063779644333

Training model on Fold 3

Train Log Loss for Fold 3: 0.23115459640498226
Validation Log Loss for Fold 3: 0.2995294605843794

Training model on Fold 4

Train Log Loss for Fold 4: 0.22802966936431865
Validation Log Loss for Fold 4: 0.31777206866535707



In [None]:
# Default-parameter LogisticRegression, 5 folds, on hhold_b_train.
StratifiedKF(hhold_b_train, n_splits = 5, model = LogisticRegression())

In [None]:
# Default-parameter LogisticRegression on hhold_c_train.
# NOTE(review): this call uses 10 folds while the hhold_a_train / hhold_b_train
# runs use 5 — confirm the difference is intentional.
StratifiedKF(hhold_c_train, n_splits = 10, model = LogisticRegression())

In [None]:
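### The leaderboard log loss for Logistic Regression is 3.2, but the CV log loss here is about 0.18.
### TODO: find out why they disagree. Things to check first: are the test features built with the
### same columns, in the same order, and with the same scaling as the training features, and are
### the submitted probabilities the ones for the positive ('poor') class?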