# Indebtedness Case Orientation

In [1]:
# Main packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import math

# Models
import xgboost as xgb

# Tools
from collections import defaultdict
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder

## Adding methods and attributes to pandas DataFrame

### Methods declarations

In [2]:
def fit_transform(self):
    """Fit one encoder per column and return the frame with every column encoded.

    The encoder for a column is looked up by column name in ``self.dict_encoder``,
    so it stays available afterwards (e.g. for ``inverse_transform``).
    """
    def _fit_and_encode(column):
        # One encoder per column, keyed by the column's name
        encoder = self.dict_encoder[column.name]
        return encoder.fit_transform(column)

    return self.apply(_fit_and_encode)

In [3]:
def inverse_transform(self):
    """Return the frame with every column mapped back to its pre-encoding values.

    Each column is decoded by the encoder stored under its name in
    ``self.dict_encoder``.
    """
    def _decode(column):
        # Use the encoder registered under this column's name
        encoder = self.dict_encoder[column.name]
        return encoder.inverse_transform(column)

    return self.apply(_decode)

In [4]:
def clean(self):
    """Return a copy of the frame with its missing values filled in.

    Numeric columns are filled with their own column mean. Anything still
    missing afterwards (non-numeric columns, or a numeric column that has no
    value at all) is filled with the string 'unknown'.
    """
    # numeric_only=True restricts the mean to numeric columns; without it,
    # pandas >= 2.0 raises a TypeError as soon as the frame holds text columns.
    column_means = self.mean(numeric_only=True)
    return self.fillna(column_means).fillna('unknown')

### Overriding pandas DataFrame

In [5]:
# Names to attach to pd.DataFrame, mapped to the function or object to attach.
# dict_encoder creates a fresh LabelEncoder the first time a column name is looked up.
methods_and_attributes = dict(
    clean=clean,
    dict_encoder=defaultdict(LabelEncoder),
    fit_transform=fit_transform,
    inverse_transform=inverse_transform,
)

In [6]:
# Attach every entry to the DataFrame class itself, so all frames share them
for attribute_name, attribute_value in methods_and_attributes.items():
    setattr(pd.DataFrame, attribute_name, attribute_value)

## Data processing functions

In [7]:
# Importing data: rows are indexed by 'id'; '\N' and 'Non Renseigne' are read as missing values
test = pd.read_csv(
    'data/test.csv',
    index_col='id',
    na_values=['\\N', 'Non Renseigne'],
)
train = pd.read_csv(
    'data/train.csv',
    index_col='id',
    na_values=['\\N', 'Non Renseigne'],
)

Dropping every column in which more than 80% of the values are missing (NA).

In [8]:
# Keep only the columns holding a non-missing value in at least 20% of the training rows
train = train.dropna(axis='columns', thresh=0.2 * len(train))

In [9]:
# Keeping the same columns as train, minus the 'ORIENTATION' target
test = test[train.columns[train.columns != 'ORIENTATION']]

In [10]:
# Separate the features (X) from the 'ORIENTATION' target (y)
y_train = train['ORIENTATION']
X_train = train.drop(columns='ORIENTATION')

In [11]:
# Fill the missing values of both feature frames with the DataFrame.clean() method added above
X_train, test = X_train.clean(), test.clean()

In [12]:
# Encoding dataframes
# Fit each column's encoder ONCE, on train and test stacked together. Calling
# fit_transform() on each frame separately re-fits the shared per-column
# encoders on test, so a given category could get a different integer code in
# test than in train, and the encoders fitted on train would be overwritten.
# NOTE(review): this assumes each column has a compatible dtype in both frames — confirm.
encoded = pd.concat([X_train, test], keys=['train', 'test']).fit_transform()
X_train = encoded.loc['train']
test = encoded.loc['test']

Splitting train to create new dataframes, in order to be able to do local tests.

In [13]:
# Hold out 20% of the training rows as a local test set (fixed seed for reproducibility)
local_X_train, local_X_test, local_y_train, local_y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Share of each ORIENTATION class among the training rows
y_train.value_counts().div(len(y_train))

Surendettement                   0.421493
Accompagnement                   0.402884
Mediation                        0.085601
Aucune                           0.080251
Autres Procédures Collectives    0.008607
Microcredit                      0.001163
Name: ORIENTATION, dtype: float64

## Training Part

#### On the local dataframes

Let's try XGBoost!

In [22]:
# Base multi-class classifier; the grid search below tunes the remaining hyper-parameters
xgb_clf = xgb.XGBClassifier(
    objective='multi:softprob',
    n_estimators=100,
    learning_rate=0.05,
    n_jobs=4,
    random_state=42,
)

In [23]:
# Hyper-parameter grid: two candidate values for each of five parameters (2**5 = 32 combinations)
parameters = dict(
    min_child_weight=[1, 10],
    gamma=[0.5, 5],
    subsample=[0.6, 1.0],
    colsample_bytree=[0.6, 1.0],
    max_depth=[3, 5],
)

In [24]:
# Exhaustive search over the grid with 5-fold cross-validation, run on 8 parallel workers
clf = GridSearchCV(
    estimator=xgb_clf,
    param_grid=parameters,
    cv=5,
    verbose=10,
    n_jobs=8,
)

In [25]:
# Long-running cell: the recorded run below took about 81 minutes for 160 fits
clf.fit(X=local_X_train, y=local_y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:  1.7min
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:  4.3min
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:  6.6min
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed: 11.7min
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed: 15.2min
[Parallel(n_jobs=8)]: Done  45 tasks      | elapsed: 21.2min
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed: 25.5min
[Parallel(n_jobs=8)]: Done  69 tasks      | elapsed: 33.0min
[Parallel(n_jobs=8)]: Done  82 tasks      | elapsed: 39.2min
[Parallel(n_jobs=8)]: Done  97 tasks      | elapsed: 45.0min
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed: 55.0min
[Parallel(n_jobs=8)]: Done 129 tasks      | elapsed: 66.1min
[Parallel(n_jobs=8)]: Done 160 out of 160 | elapsed: 81.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=4, nthread=None, objective='multi:softprob', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=8,
       param_grid={'min_child_weight': [1, 10], 'gamma': [0.5, 5], 'subsample': [0.6, 1.0], 'colsample_bytree': [0.6, 1.0], 'max_depth': [3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [28]:
# Display the estimator built from the best hyper-parameter combination found by the grid search
clf.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.5, learning_rate=0.05,
       max_delta_step=0, max_depth=5, min_child_weight=10, missing=None,
       n_estimators=100, n_jobs=4, nthread=None,
       objective='multi:softprob', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1.0)

In [29]:
# Score of the best estimator on the held-out local test split
clf.best_estimator_.score(X=local_X_test, y=local_y_test)

0.5651162790697675

In [21]:
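# Score recorded by an earlier execution (In [21]) of the call below; the call is
# disabled, so the value shown underneath is not recomputed.
# clf.best_estimator_.score(local_X_test, local_y_test)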

0.5755813953488372

#### Now, we have to train our model on the entire train dataframe

In [None]:
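# NOTE: while this line stays disabled, the predictions generated below come from
# the model that the grid search refit on local_X_train only, not on the full train set.
# clf.best_estimator_.fit(X_train, y_train)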

## Generating the results

In [None]:
# One predicted ORIENTATION per test row, indexed like the test frame
df_res = pd.Series(
    clf.best_estimator_.predict(test),
    index=test.index,
    name='ORIENTATION',
).to_frame()

In [None]:
# df_res.to_csv('submissions/submission_3.csv')