### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders.target_encoder import TargetEncoder
import scipy

#Disable Warnings
import warnings
warnings.filterwarnings("ignore")

### Import dataset

In [2]:
def read_file(path):
    """Load the CSV file at ``path`` into a pandas DataFrame.

    The file is parsed with ``pd.read_csv`` using its default options.
    """
    frame = pd.read_csv(path)
    return frame
# Load the training split and the test split from the local data directory.
train, test = (read_file(path=csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [3]:
# Print the training frame's column dtypes, non-null counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   country                 23524 non-null  object
 1   year                    23524 non-null  int64 
 2   uniqueid                23524 non-null  object
 3   bank_account            23524 non-null  object
 4   location_type           23524 non-null  object
 5   cellphone_access        23524 non-null  object
 6   household_size          23524 non-null  int64 
 7   age_of_respondent       23524 non-null  int64 
 8   gender_of_respondent    23524 non-null  object
 9   relationship_with_head  23524 non-null  object
 10  marital_status          23524 non-null  object
 11  education_level         23524 non-null  object
 12  job_type                23524 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB


In [4]:
#Combine both train and test sets for easy wrangling
n_train = train.shape[0]
n_test = test.shape[0]
all_data = pd.concat([train, test]).reset_index(drop=True)

#Feature Engineering: binary-encode the Yes/No and Rural/Urban columns.
# The mapped column is assigned back to the frame rather than calling
# `all_data[col].replace(..., inplace=True)`: that chained in-place form acts on
# a temporary Series and leaves `all_data` unchanged once pandas
# Copy-on-Write is active.
# NOTE: `.map` turns any value missing from the mapping into NaN (rows that
# have no `bank_account` value stay NaN, as before).
binary_maps = {
    "bank_account": {"Yes": 1, "No": 0},
    "cellphone_access": {"Yes": 1, "No": 0},
    "location_type": {"Rural": 0, "Urban": 1},
}
for col, mapping in binary_maps.items():
    all_data[col] = all_data[col].map(mapping)

# Split the combined frame back by position. `.copy()` makes `train` an
# independent frame so the column assignment below does not write into a
# slice of `all_data`; slicing from `n_train` (instead of `-n_test`) also
# behaves correctly when the test set is empty.
train = all_data.iloc[:n_train].copy()
test = all_data.iloc[n_train:].reset_index(drop=True)

train['bank_account'] = train['bank_account'].astype(np.int64)
target = train['bank_account']

# `drop(labels, 1)` passes the axis positionally, which pandas >= 2.0 rejects;
# name the axis through `columns=` instead. `test` itself keeps `uniqueid`
# and `country`, which the submission cell reads later.
id_and_target_cols = ['uniqueid', 'bank_account']
test__ = test.drop(columns=id_and_target_cols)
train = train.drop(columns=id_and_target_cols)

cat_cols=["country","gender_of_respondent","education_level","job_type","marital_status","relationship_with_head"]
TE = TargetEncoder(verbose=0, cols=cat_cols, drop_invariant=False, return_df=True)

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()


In [5]:
#Target encode the categorical features
# NOTE(review): the encoder (which uses the target) and the scaler are fitted
# on every labelled row before the hold-out / cross-validation split made in
# the later cells, so the fold scores may be optimistic — confirm this is
# acceptable.
TE.fit(train, target)
train_, test_ = TE.transform(train), TE.transform(test__)

# Standardise the two numeric columns: the scaler is fitted on the training
# frame and the same fitted scaler is applied to the test frame.
numeric_cols = ['household_size', "age_of_respondent"]
train_[numeric_cols] = ss.fit_transform(train_[numeric_cols])
test_[numeric_cols] = ss.transform(test_[numeric_cols])

In [6]:
#Drop already redundant features.
columns_to_drop = ['year']
# `drop(labels, 1)` passes the axis positionally, which raises TypeError on
# pandas >= 2.0; name the axis with `columns=` and rebind the result instead
# of mutating the frames in place.
train_ = train_.drop(columns=columns_to_drop)
test_ = test_.drop(columns=columns_to_drop)

In [7]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_absolute_error

# One seed for both splitters. Without `random_state` the hold-out split
# changes on every kernel restart, so the printed fold scores could not be
# reproduced from run to run.
SEED = 10

# Stratified 80/20 split of the encoded training frame.
# NOTE(review): X_test / y_test are not used by any cell visible in this
# notebook — confirm whether a final hold-out evaluation was intended.
X_train, X_test, y_train, y_test = train_test_split(
    train_, target, test_size=0.2, stratify=target, random_state=SEED
)
skf = StratifiedKFold(n_splits=7, random_state=SEED, shuffle=True)

In [8]:
#Define training loop

def training_loop(estimator, X=None, y=None, X_pred=None, cv=None, metric=None):
    """Cross-validate ``estimator`` and collect one prediction array per fold.

    For every fold produced by ``cv.split(X, y)`` the estimator is fitted on
    the fold's training rows, scored on both the training rows and the
    held-out rows, and then used to predict ``X_pred``. Per-fold scores and
    the mean held-out score are printed.

    Parameters
    ----------
    estimator : object
        Anything with ``fit(X, y)`` returning the fitted model and a
        ``predict(X)`` method. The same instance is re-fitted on every fold.
    X, y : optional
        Features and labels, indexed positionally with ``.iloc``. Default to
        the notebook-level ``X_train`` / ``y_train``.
    X_pred : optional
        Rows to predict with each fold's model. Defaults to the notebook-level
        ``test_`` frame.
    cv : optional
        Splitter exposing ``split(X, y)``. Defaults to the notebook-level
        ``skf``.
    metric : callable, optional
        ``metric(y_true, y_pred) -> float``. Defaults to
        ``mean_absolute_error``, which for 0/1 labels and 0/1 predictions is
        the misclassification rate.

    Returns
    -------
    list
        One ``predict(X_pred)`` result per fold, in fold order.
    """
    # Defaults are resolved at call time, so the notebook globals are only
    # required when the caller does not pass the corresponding argument.
    X = X_train if X is None else X
    y = y_train if y is None else y
    X_pred = test_ if X_pred is None else X_pred
    cv = skf if cv is None else cv
    metric = mean_absolute_error if metric is None else metric

    scores, predictions = [], []
    for fold, (train_index, valid_index) in enumerate(cv.split(X, y), start=1):
        print(f"================Fold:{fold}====================")
        xtrain, xvalid = X.iloc[train_index], X.iloc[valid_index]
        ytrain, yvalid = y.iloc[train_index], y.iloc[valid_index]
        model = estimator.fit(xtrain, ytrain)
        # Score on the rows the model was fitted on ...
        pred_train = model.predict(xtrain)
        score_train = metric(ytrain, pred_train)
        # ... and on the fold's held-out rows, then predict the unseen rows.
        pred_valid = model.predict(xvalid)
        pred_ = model.predict(X_pred)
        score_valid = metric(yvalid, pred_valid)
        # The "(test)" label in the messages refers to the fold's held-out rows.
        print(f"The (train) mean_absolute_error for Fold({fold}): {score_train}")
        print(f"The (test) mean_absolute_error for Fold({fold}): {score_valid}\n\n")
        scores.append(score_valid)
        predictions.append(pred_)
    print(f"Mean MAE score on test set: {np.mean(scores)}")
    return predictions
    

In [9]:
#Train a Regularized Greedy Forest (RGF) classifier
from rgf.sklearn import RGFClassifier
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` has been loaded on every SciPy version.
import scipy.stats

rgf_model = RGFClassifier(n_jobs=-1,algorithm='RGF_Opt')
pred_rgf = training_loop(rgf_model)
# Majority vote: stack the per-fold prediction arrays and take the most
# frequent label for each row.
preds = scipy.stats.mode(np.stack(pred_rgf), axis=0)[0].flatten()
# The mean fold score is printed by training_loop; it is not hard-coded here
# because a pasted value goes stale as soon as the cell is re-run.

The (train) mean_absolute_error for Fold(1): 0.10923744575325481
The (test) mean_absolute_error for Fold(1): 0.10859055410933433


The (train) mean_absolute_error for Fold(2): 0.10960942343459393
The (test) mean_absolute_error for Fold(2): 0.114540721457791


The (train) mean_absolute_error for Fold(3): 0.10861748295102294
The (test) mean_absolute_error for Fold(3): 0.11230940870211975


The (train) mean_absolute_error for Fold(4): 0.11003657553778438
The (test) mean_absolute_error for Fold(4): 0.11421130952380952


The (train) mean_absolute_error for Fold(5): 0.10662699150703614
The (test) mean_absolute_error for Fold(5): 0.12313988095238096


The (train) mean_absolute_error for Fold(6): 0.10916868142086665
The (test) mean_absolute_error for Fold(6): 0.10788690476190477


The (train) mean_absolute_error for Fold(7): 0.10929266629471204
The (test) mean_absolute_error for Fold(7): 0.109375


Mean MAE score on test set: 0.11286482564390576


In [10]:
# Create submission DataFrame: the row identifier is "<uniqueid> x <country>"
# and the predicted label is the fold-wise majority vote held in `preds`.
submission_ids = test["uniqueid"] + " x " + test["country"]
submission = pd.DataFrame({"uniqueid": submission_ids, "bank_account": preds})

In [None]:
# Write the submission to a CSV file, leaving out the DataFrame index
submission.to_csv('rgf_opt_submission2.csv', index = False)