# Indebtedness Case Orientation

In [1]:
# Main packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Models
# from sklearn.cluster import KMeans
# from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

# Tools
from collections import defaultdict
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

## Adding methods and attributes to pandas DataFrame

### Methods declarations

In [2]:
def fit_transform(self):  
    # Encoding all the features as int, and saving an encoder by column
    return self.apply(lambda x: self.dict_encoder[x.name].fit_transform(x))

In [3]:
def inverse_transform(self):
    # Retriving value before encoding
    return self.apply(lambda x: self.dict_encoder[x.name].inverse_transform(x))

In [4]:
def clean(self):
    # Filling NAs
    return self.fillna(self.mean()).fillna('unknown')

### Overriting pandas Dataframe

In [5]:
methods_and_attributes = {
    'clean' : clean,
    'dict_encoder' : defaultdict(LabelEncoder),
    'fit_transform' : fit_transform,
    'inverse_transform' : inverse_transform
}

In [6]:
for ma in methods_and_attributes.keys():
    setattr(pd.DataFrame, ma, methods_and_attributes[ma])

## Data processing functions

In [7]:
# Importing data
test = pd.read_csv('data/test.csv', index_col='id', na_values='\\N')
train = pd.read_csv('data/train.csv', index_col='id', na_values='\\N')

Dropping all the columns with more than 80% of NA

In [8]:
# Number of columns that will be dropped
(train.isna().sum()/len(train) > 0.8).sum()

6

In [9]:
# Dropping the cols
train = train.dropna(thresh=0.2*len(train), axis=1)

In [10]:
# Keeping the same columns than train
test = test[[column for column in train.columns if column != 'ORIENTATION']]

In [11]:
# Splitting it into X and y parts
X_train = train.drop('ORIENTATION', axis=1)
y_train = train['ORIENTATION']

In [12]:
# Cleaning dataframes
X_train = X_train.clean()
test = test.clean()

In [13]:
# Encoding dataframes
X_train = X_train.fit_transform()
test = test.fit_transform()

Splitting train to create new dataframes, in order to be able to do local tests.

In [14]:
# Creating local train and test
local_X_train, local_X_test, local_y_train, local_y_test = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=42)

## Training Part

#### On the local dataframes

In [15]:
# corr = train.corr()
# sns.set(rc={'figure.figsize':(15,10)})
# sns.heatmap(corr,
#             xticklabels=corr.columns.values,
#             yticklabels=corr.columns.values)

In [16]:
# indices = np.where((corr > 0.3) | (corr < -0.7))
# indices = [(corr.index[x], corr.columns[y]) for x, y in zip(*indices) \
#     if x != y and x < y and (corr.index[x] == 'ORIENTATION' or corr.index[y] == 'ORIENTATION')]
# indices

Let's try random forest !

In [17]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=2, random_state=42)

What is its cross val score on the local train ?

In [18]:
cross_val_score(rf, local_X_train, local_y_train).mean()



0.5658576452209357

Let's fit it !

In [19]:
rf.fit(local_X_train, local_y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

What is the score on the local test ?

In [20]:
rf.score(local_X_test, local_y_test)

0.5406976744186046

0.54 is the score on our local test

#### Now, we have to train our model on the entire train dataframe

In [21]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

## Generating the results

In [25]:
test.head()

Unnamed: 0_level_0,STRUCTURE PRESCRIPTRICE,PLATEFORME,year,month,region,NATURE_DIFF,age,tranche_age,situation,adulte_foyer,...,nb_amort,crd_amort,nb_renouv,crd_renouv,nb_immo,nb_rac,nb_autres,nb_decouvert,moy_eco_jour,cat_moy_eco_jour
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5119,1,0,6,7,11,5,43,3,6,2,...,0,1254,0,936,1,1,0,0,807,5
2519,30,1,5,2,2,7,21,1,3,2,...,0,1254,1,182,0,0,2,0,2127,3
6320,46,0,7,3,6,21,33,7,3,2,...,1,268,1,149,0,0,1,0,31,5
2841,65,0,5,4,11,7,33,6,2,0,...,1,379,2,741,0,0,0,0,1349,1
7840,9,0,8,0,0,5,54,4,0,0,...,2,202,0,936,0,0,0,0,1533,2


In [22]:
rf.predict(test)

array(['Surendettement', 'Accompagnement', 'Surendettement', ...,
       'Accompagnement', 'Accompagnement', 'Accompagnement'], dtype=object)

In [26]:
df_res = pd.DataFrame({'id':test.index, 'ORIENTATION':rf.predict(test)})

In [28]:
df_res.to_csv('submission_1.csv')