# Indebtedness Case Orientation

In [1]:
# Main packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Models
# from sklearn.cluster import KMeans
# from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

# Tools
from collections import defaultdict
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder

## Adding methods and attributes to pandas DataFrame

### Methods declarations

In [2]:
def fit_transform(self):
    """Encode every column of the frame and return the encoded frame.

    Each column is passed to ``self.dict_encoder[<column name>].fit_transform``,
    so one encoder per column name is fitted and kept in ``dict_encoder``.
    """
    def _encode(column):
        encoder = self.dict_encoder[column.name]
        return encoder.fit_transform(column)

    return self.apply(_encode)

In [3]:
def inverse_transform(self):
    """Map an encoded frame back to the values it held before encoding.

    Each column is passed to ``self.dict_encoder[<column name>].inverse_transform``,
    i.e. the encoder stored under that column's name.
    """
    def _decode(column):
        encoder = self.dict_encoder[column.name]
        return encoder.inverse_transform(column)

    return self.apply(_decode)

In [4]:
def clean(self):
    """Return a copy of the frame with its missing values filled in.

    Numeric columns are filled with their own column mean. Anything still
    missing afterwards (non-numeric columns, or numeric columns that are
    entirely NA and therefore have no mean) is filled with the string
    'unknown'.
    """
    # numeric_only=True restricts the mean to numeric columns. Without it,
    # pandas >= 2.0 raises TypeError as soon as the frame holds a text column.
    numeric_means = self.mean(numeric_only=True)
    return self.fillna(numeric_means).fillna('unknown')

### Overriding pandas DataFrame

In [5]:
# Names to attach to pd.DataFrame, mapped to the object stored under each name.
# 'dict_encoder' is a single defaultdict: a LabelEncoder is created the first
# time a given key is looked up.
methods_and_attributes = dict(
    clean=clean,
    dict_encoder=defaultdict(LabelEncoder),
    fit_transform=fit_transform,
    inverse_transform=inverse_transform,
)

In [6]:
# Set every entry on the DataFrame class itself, so each one is reachable
# as an attribute of any DataFrame instance.
for attribute_name, attribute_value in methods_and_attributes.items():
    setattr(pd.DataFrame, attribute_name, attribute_value)

## Data processing functions

In [7]:
# Load both files: the 'id' column becomes the index and the literal
# string \N is read as a missing value.
train = pd.read_csv(
    'data/train.csv',
    na_values='\\N',
    index_col='id',
)
test = pd.read_csv(
    'data/test.csv',
    na_values='\\N',
    index_col='id',
)

Dropping all the columns in which more than 80% of the values are NA.

In [8]:
# How many columns have an NA share above 0.8 (these are the ones dropped next)
train.isna().sum().div(len(train)).gt(0.8).sum()

6

In [9]:
# Keep only the columns holding at least 20% non-NA values
train = train.dropna(axis='columns', thresh=0.2 * len(train))

In [10]:
# Keep in test exactly the columns that survived in train, minus the target
test = test.loc[:, train.columns[train.columns != 'ORIENTATION']]

In [11]:
# Separate the target column from the features
y_train = train['ORIENTATION']
X_train = train.drop(columns='ORIENTATION')

In [12]:
# Fill the missing values of both feature frames (each frame on its own)
X_train, test = (frame.clean() for frame in (X_train, test))

In [13]:
# Encoding dataframes
# Train and test are encoded together so that a given raw value receives the
# same integer code in both frames. Calling fit_transform() on each frame
# separately refits the shared per-column encoders on test, so the codes seen
# at prediction time would not match the ones the model was trained on.
n_train_rows = len(X_train)
encoded = pd.concat([X_train, test]).fit_transform()
# Split back by position rather than by label: the 'id' values of the two
# files are not assumed to be disjoint.
X_train = encoded.iloc[:n_train_rows]
test = encoded.iloc[n_train_rows:]

Splitting train to create new dataframes, in order to be able to do local tests.

In [14]:
# Hold out 20% of the training rows as a local test set (fixed seed)
(local_X_train, local_X_test,
 local_y_train, local_y_test) = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

## Training Part

#### On the local dataframes

Let's try a random forest!

In [15]:
# 0.5686046511627907 on local tests (presumably with the hand-picked settings below)
# rf = RandomForestClassifier(n_estimators=1000, max_depth=10, max_features=10, criterion='entropy', random_state=42)
rf = RandomForestClassifier(random_state=42)
# Grid explored by the search: 3 * 3 * 3 * 2 = 54 combinations
parameters = {
    'n_estimators' : [100, 500, 1000],
    'max_depth' : [5, 10, 100],
    # 'sqrt' is what 'auto' meant for a classifier; 'auto' itself was
    # deprecated in scikit-learn 1.1 and removed in 1.3.
    'max_features' : ['sqrt', 5, 10],
    'criterion' : ['gini', 'entropy']
}

Thanks to the grid search, we'll find the best combination among those parameter values.

In [16]:
# Exhaustive search over `parameters` with 5-fold cross-validation,
# running on 8 parallel workers and logging progress verbosely.
clf = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    cv=5,
    n_jobs=8,
    verbose=10,
)

This command will take some time ...

In [17]:
# Run the search on the local training split only; the local test split stays unseen.
# With 54 candidates and 5 folds this is 270 fits (see the log below).
clf.fit(local_X_train, local_y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    2.5s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    5.6s
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    8.1s
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   11.2s
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   14.5s
[Parallel(n_jobs=8)]: Done  45 tasks      | elapsed:   19.4s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:   25.7s
[Parallel(n_jobs=8)]: Done  69 tasks      | elapsed:   34.5s
[Parallel(n_jobs=8)]: Done  82 tasks      | elapsed:   42.9s
[Parallel(n_jobs=8)]: Done  97 tasks      | elapsed:   54.6s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:  1.1min
[Parallel(n_jobs=8)]: Done 129 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done 165 tasks      | elapsed:  1.7min
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:  2.1min
[Parallel(

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=8,
       param_grid={'n_estimators': [100, 500, 1000], 'max_depth': [5, 10, 100], 'max_features': ['auto', 5, 10], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [18]:
# Cross-validated score of the best parameter combination found by the search
clf.best_score_

0.5914510031986042

Well, around 0.6

In [19]:
# Parameter combination that obtained the best cross-validated score
clf.best_params_

{'criterion': 'gini',
 'max_depth': 100,
 'max_features': 10,
 'n_estimators': 1000}

In [20]:
# Estimator configured with the best parameter combination
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

Let's fit it !

In [21]:
# NOTE(review): the search was created with refit=True (see its printed
# configuration above), so best_estimator_ has presumably already been fitted
# on this same data; this call looks redundant - confirm before removing.
clf.best_estimator_.fit(local_X_train, local_y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

What is the score on the local test ?

In [22]:
# Score on the 20% of the training rows that were held out from the search
clf.best_estimator_.score(local_X_test, local_y_test)

0.55

The score on our local test set is 0.55, below the cross-validated score of about 0.59 — are we overfitting?

#### Now, we have to train our model on the entire train dataframe

In [23]:
# Refit the selected model on all the training rows before predicting.
# This fits clf.best_estimator_ in place: from here on it has also seen
# local_X_test, so re-running the local score cell above would no longer
# measure held-out performance.
clf.best_estimator_.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

## Generating the results

In [25]:
# One predicted ORIENTATION per test row, indexed like the test frame
df_res = pd.DataFrame(
    {'ORIENTATION': clf.best_estimator_.predict(test)},
    index=test.index,
)

In [26]:
# Write the predictions (index included) to the submission file.
# NOTE: the 'submissions' directory must already exist.
df_res.to_csv('submissions/submission_3.csv')