In [1]:
import numpy as np
import pandas as pd

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score

In [2]:
def clean(self):
    """Fill missing values: numeric columns with their mean, the rest with 'unknown'.

    Written to be attached to ``pd.DataFrame`` as a method, so ``self`` is the
    frame to clean. Returns a new DataFrame; ``self`` is not modified.
    """
    # numeric_only=True keeps mean() from raising on object/categorical columns
    # (recent pandas no longer silently skips non-numeric columns).
    numeric_means = self.mean(numeric_only=True)
    # Whatever is still missing afterwards (non-numeric columns, or numeric
    # columns that are entirely NaN) receives the 'unknown' placeholder.
    return self.fillna(numeric_means).fillna('unknown')

In [3]:
def fit_transform(self):
    """Encode every column with its own encoder and return the encoded frame.

    The encoder for a column is looked up by column name in
    ``self.dict_encoder`` and fitted on that column before transforming it.
    """
    def _fit_and_encode(column):
        return self.dict_encoder[column.name].fit_transform(column)

    return self.apply(_fit_and_encode)

In [4]:
def inverse_transform(self):
    """Map every encoded column back to its original values.

    Each column is decoded by the encoder stored under its name in
    ``self.dict_encoder``.
    """
    def _decode(column):
        return self.dict_encoder[column.name].inverse_transform(column)

    return self.apply(_decode)

In [5]:
# Names to install on pd.DataFrame, mapped to the function/object to attach.
methods_and_attributes = {
    'clean' : clean,
    # One single defaultdict is created here: once attached to the class it is
    # shared by every DataFrame, i.e. one LabelEncoder per column *name*.
    'dict_encoder' : defaultdict(LabelEncoder),
    'fit_transform' : fit_transform,
    'inverse_transform' : inverse_transform
}

In [6]:
# Monkey-patch pd.DataFrame with every entry of the mapping
for attr_name, attr_value in methods_and_attributes.items():
    setattr(pd.DataFrame, attr_name, attr_value)

In [7]:
# Importing data: both files use the same index column and NaN markers
csv_options = {'index_col': 'id', 'na_values': ['\\N', 'Non Renseigne']}
test = pd.read_csv('data/test.csv', **csv_options)
train = pd.read_csv('data/train.csv', **csv_options)

In [8]:
# For exploration purposes: train and test rows stacked into a single frame.
# sort=False: do not sort the columns when the two frames' columns are not aligned.
df_general = pd.concat([train, test], sort=False)

## Removing dangerous columns

In [9]:
train = train.drop(columns=['STRUCTURE PRESCRIPTRICE', 'year', 'month'])

In [10]:
test = test.drop(columns=['STRUCTURE PRESCRIPTRICE', 'year', 'month'])

## Filling NaNs

### Some NaNs are just zeros.

In [11]:
def fill_stupid_nans(df):
    """Replace the NaNs that really mean zero, in place, and return ``df``.

    Numeric columns listed below are filled with 0; the string column
    'cat_impayes' is filled with '0'. Other columns are left untouched.
    """
    zero_like_numeric = (
        'gain_mediation', 'crd_decouvert', 'crd_rac', 'crd_autres',
        'IMPAYES_DEBUT', 'crd_immo', 'crd_renouv', 'crd_amort',
    )
    zero_like_text = ('cat_impayes',)

    # column name -> value standing in for a missing entry
    fillers = {name: 0 for name in zero_like_numeric}
    fillers.update({name: '0' for name in zero_like_text})

    for name, filler in fillers.items():
        df[name] = df[name].fillna(filler)

    return df

In [12]:
train = fill_stupid_nans(train)

In [13]:
test = fill_stupid_nans(test)

### Regression to guess age

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

Cleaning

In [15]:
# Linear model used further down to predict the missing ages.
# NOTE(review): the `normalize` argument no longer exists in recent
# scikit-learn releases — confirm the pinned version before re-running.
reg_age = LinearRegression(normalize=True)
# Alternative model kept for reference:
# from sklearn.ensemble import RandomForestRegressor
# reg_age = RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42, n_jobs=4)

In [16]:
# .copy() gives df_age its own data, so adding columns to it afterwards does
# not trigger SettingWithCopyWarning (nor risk writing into df_general).
df_age = df_general[['situation', 'PROF', 'pers_a_charge', 'REVENUS', 'age']].copy()

In [17]:
# Polynomial terms (square and cube) of the number of dependants
for power in (2, 3):
    df_age['pers_a_charge_%d' % power] = df_age['pers_a_charge'] ** power

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
df_age = df_age.dropna()

In [19]:
y_age = df_age.age

In [20]:
df_age = pd.get_dummies(df_age.drop('age', axis=1))

In [21]:
cross_val_score(reg_age, df_age, y_age, cv=5)

array([0.61394061, 0.6357755 , 0.63509313, 0.65926936, 0.64891669])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df_age, y_age, test_size=0.33, random_state=42)

In [23]:
reg_age.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [24]:
reg_age.predict(X_test)

array([68.3984375, 68.359375 , 44.6953125, ..., 46.0625   , 43.7734375,
       43.90625  ])

Mean absolute error and mean squared error with this technique :

In [25]:
# scikit-learn metrics take (y_true, y_pred), in that order
mean_absolute_error(y_test, reg_age.predict(X_test))

6.439388800523952

In [26]:
# scikit-learn metrics take (y_true, y_pred), in that order
mean_squared_error(y_test, reg_age.predict(X_test))

66.14205918911688

What if we had filled them with the mean instead?

In [27]:
((y_age.mean() - y_age).abs()).mean()

11.49999194449218

In [28]:
((y_age.mean() - y_age)**2).mean()

187.5272537358028

And with the median ?

In [29]:
((y_age.median() - y_age).abs()).mean()

11.498517786561266

In [30]:
((y_age.median() - y_age)**2).mean()

187.52816205533597

Conclusion: the regression is clearly better than a constant fill (MAE of about 6.4 versus about 11.5), so let's fill the NaNs this way.

Let's fit it on the entire df

In [31]:
reg_age.fit(df_age, y_age)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

Now let's apply it to the train and test

In [32]:
def clean_df_age(df):
    """Build the feature frame used by the age regression from ``df``.

    Keeps 'situation', 'PROF', 'pers_a_charge' and 'REVENUS', adds the square
    and cube of 'pers_a_charge', and one-hot encodes the two categorical
    columns. Rows whose category is missing get all-zero dummies for it.

    Returns a new DataFrame; ``df`` is not modified.
    """
    categorical_cols = ['situation', 'PROF']

    # Work on an explicit copy: ``df`` is typically a slice of train/test, and
    # adding columns to a slice triggers SettingWithCopyWarning.
    features = df[categorical_cols + ['pers_a_charge', 'REVENUS']].copy()
    features['pers_a_charge_2'] = features['pers_a_charge'] ** 2
    features['pers_a_charge_3'] = features['pers_a_charge'] ** 3

    # Only the categorical columns get the placeholder: filling a numeric
    # column with a string would turn it into objects and get it one-hot
    # encoded as well.
    features[categorical_cols] = features[categorical_cols].fillna('Unknown')
    features = pd.get_dummies(features)

    # The placeholder dummies only exist when a value was actually missing,
    # hence errors='ignore'.
    return features.drop(['PROF_Unknown', 'situation_Unknown'], axis=1, errors='ignore')

On train

In [33]:
train_age_nans = clean_df_age(train.loc[train.age.isna(),:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [34]:
# Align the columns on the ones the regression was fitted with (same names,
# same order); a dummy column absent from these rows is added as all zeros.
train.loc[train.age.isna(), 'age'] = reg_age.predict(
    train_age_nans.reindex(columns=df_age.columns, fill_value=0)
)

On test

In [35]:
test_age_nans = clean_df_age(test.loc[test.age.isna(), :])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


As there was no 'pro' among these rows, the PROF_pro column hasn't been created, so we have to add it ourselves.

In [36]:
# Reindex on the columns the regression was fitted with: this adds the missing
# PROF_pro dummy (all zeros) at the position the model expects, whereas a plain
# column assignment appends it last and can shift the features.
test_age_nans = test_age_nans.reindex(columns=df_age.columns, fill_value=0)

In [37]:
test.loc[test.age.isna(), 'age'] = reg_age.predict(test_age_nans)

### Filling tranche_age given age

Fixing tranche_age name problems

In [38]:
def rename_tranche_age(df):
    """Fix the mislabelled ``tranche_age`` values of ``df`` in place and return it."""
    corrections = {
        '35-34ans': '25-34ans',
        '40-44ans': '35-44ans',
        '75ans_et+': '>75ans',
    }
    # Assign the result back rather than calling replace(inplace=True) on the
    # attribute-accessed column: that form acts on an intermediate object and
    # leaves ``df`` unchanged under pandas copy-on-write.
    df['tranche_age'] = df['tranche_age'].replace(corrections)
    return df

In [39]:
def assign_tranche_age_to_age(x):
    """Set ``x['tranche_age']`` to the age bracket matching ``x['age']``.

    ``x`` is a row-like mapping (e.g. a Series) with 'age' and 'tranche_age'
    entries. It is modified in place and returned. When the age is NaN, no
    bracket can be derived and ``x`` is returned unchanged.
    """
    age = float(x['age'])

    if np.isnan(age):
        return x

    # (exclusive upper bound, label), in ascending order
    brackets = (
        (25, '<25ans'),
        (35, '25-34ans'),
        (45, '35-44ans'),
        (55, '45-54ans'),
        (65, '55-64ans'),
        (75, '65-75ans'),  # label kept as-is; covers 65 <= age < 75
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            x['tranche_age'] = label
            return x

    # 75 and above (an age of exactly 75 used to fall through every branch)
    x['tranche_age'] = '>75ans'
    return x

In [40]:
def clean_tranche_age_with_age(df):
    """Fix the tranche_age labels, then derive the missing ones from ``age``.

    Only the rows whose tranche_age is missing are touched by the second step.
    Returns the frame handed back by ``rename_tranche_age``.
    """
    df = rename_tranche_age(df)
    missing_bracket = df.tranche_age.isna()
    pair = ['age', 'tranche_age']
    df.loc[missing_bracket, pair] = df.loc[missing_bracket, pair].apply(assign_tranche_age_to_age, axis=1)
    return df

In [41]:
train = clean_tranche_age_with_age(train)

In [42]:
test = clean_tranche_age_with_age(test)

### Dealing with column types

In [43]:
col_floats = ['RAV_UC', 'CRD', 'moy_eco_jour']

# Object-typed columns of train, minus the three that actually hold floats
categories = [
    name
    for name, dtype in train.dtypes.items()
    if dtype in [object] and name not in col_floats
]

In [44]:
def clean_col_types(df, is_y=False):
    """Cast the columns of ``df`` to their working dtypes, in place, and return it.

    The columns listed in ``col_floats`` are parsed as floats (decimal commas
    are converted to points first); the columns listed in ``categories``
    become categoricals.

    Parameters
    ----------
    df : DataFrame to convert.
    is_y : bool, default False
        When True, 'ORIENTATION' is left out of the categorical cast. In this
        notebook it is passed as True for the test set.
    """
    local_categories = categories.copy()

    if is_y:
        local_categories.remove('ORIENTATION')

    # Whole columns are reassigned (df[cols] = ...) instead of being written
    # through df.loc[:, cols]: .loc sets values into the existing columns and
    # can keep their previous dtype, silently undoing astype().
    # regex=False: the comma is a literal character, not a pattern.
    df[col_floats] = df[col_floats].apply(
        lambda col: col.str.replace(',', '.', regex=False).astype(float)
    )
    df[local_categories] = df[local_categories].astype('category')

    return df

In [45]:
test = clean_col_types(test, is_y=True)

In [46]:
train = clean_col_types(train)

### Testing random forest

We want to drop nans for the train

In [47]:
train = train.dropna()

But for the test set we have to find something else: we will fill the categorical columns with a new category, 'unknown', and the numerical ones with their mean.

In [48]:
# Columns in which we want to add an unknown column (we can't drop nans because it s the test)
cols_nans_unknown = ['PROF', 'NATURE_DIFF', 'cat_moy_eco_jour', 'cat_RAV_UC', 'situation', 'LOGEMENT', 'region', 'cat_credit']

In [49]:
for elt in cols_nans_unknown:
    test[elt] = test[elt].cat.add_categories(['unknown'])

In [50]:
test.loc[:,cols_nans_unknown] = test.loc[:,cols_nans_unknown].fillna('unknown')

In [51]:
# numeric_only=True: take the mean of numeric columns only, so mean() does not
# raise on the categorical/object columns.
test = test.fillna(test.mean(numeric_only=True))

Now we can use our model.

In [52]:
# train = train.fit_transform()

In [53]:
# Every column of train that is neither float64 nor int64, target excluded
category_cols = [
    name
    for name, dtype in train.dtypes.items()
    if dtype not in ['float64', 'int64'] and name != 'ORIENTATION'
]

In [54]:
train[category_cols] = train[category_cols].fit_transform()

In [55]:
def _encode_like_train(col):
    """Encode ``col`` with the encoder already fitted on the same train column.

    Refitting the encoders on test would hand out integer codes that differ
    from the ones the models are trained on. Labels absent from train
    (e.g. 'unknown') all share one extra code placed after the known ones.
    """
    encoder = test.dict_encoder[col.name]
    codes = {label: code for code, label in enumerate(encoder.classes_)}
    return col.astype(object).map(codes).fillna(len(codes)).astype(int)


test[category_cols] = test[category_cols].apply(_encode_like_train)

#### RandomForestClassifier

In [56]:
from sklearn.ensemble import RandomForestClassifier

**Let's try to predict if one element is in ['Surendettement', 'Accompagnement'] or in the other possible outputs**

In [57]:
def merge_cat(x):
    """Return 1 when the row's ORIENTATION is 'Surendettement' or 'Accompagnement', else 0."""
    grouped_labels = ['Surendettement', 'Accompagnement']
    if x['ORIENTATION'] in grouped_labels:
        return 1
    return 0

In [58]:
# Vectorised equivalent of applying merge_cat row by row: 1 when ORIENTATION is
# 'Surendettement' or 'Accompagnement', 0 otherwise.
train['is_S_or_Acc'] = train['ORIENTATION'].isin(['Surendettement', 'Accompagnement']).astype(int)

In [59]:
rf_first_layer = RandomForestClassifier(n_estimators=1000, max_depth=10, max_features=5, criterion='gini', random_state=42, n_jobs=4)
# xb = xgb.XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.01, random_state=42, n_jobs=4)

In [60]:
local_X_train, local_X_test, local_y_train, local_y_test = \
    train_test_split(train.drop(['ORIENTATION', 'is_S_or_Acc'], axis=1),\
                     train.loc[:, 'is_S_or_Acc'], \
                     test_size=1/3, \
                     random_state=42)

In [61]:
cross_val_score(rf_first_layer, local_X_train, local_y_train, cv=5, n_jobs=4).mean()

0.8289964609359715

In [62]:
rf_first_layer.fit(local_X_train, local_y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=4,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [63]:
rf_first_layer.score(local_X_test, local_y_test)

0.8307567127746135

Note : What are the most important features here ?

In [64]:
rf_first_layer.feature_importances_.argmax()

2

In [65]:
rf_first_layer.feature_importances_[2]

0.08903422452121121

In [66]:
local_X_train.columns[2]

'NATURE_DIFF'

**Now, we create two dataframes based on those predictions. One with the supposed 'Surendettement' and 'Accompagnement', and the other one with what's left.**

In [67]:
# Predict on the feature columns only, so that re-running this cell does not
# feed a previously added 'res_step_1' column back to the model.
local_X_train['res_step_1'] = rf_first_layer.predict(
    local_X_train.drop(columns='res_step_1', errors='ignore')
)

Concatenating local_X_train and local_y_train based on whether or not we predicted the row is a 'Surendettement' or a 'Accompagnement'

In [68]:
# Rows predicted as positive by the first layer, with their true binary label
# attached. An index join replaces `index & index`, which is no longer a set
# intersection in pandas 2.
df_S_or_Acc = local_X_train.loc[local_X_train['res_step_1'] == 1].join(local_y_train)

Adding the ORIENTATION column from train.

In [69]:
# Index join instead of `index & index` (no longer a set intersection in pandas 2)
df_S_or_Acc = df_S_or_Acc.join(train['ORIENTATION'])

Create a dataframe with what is left.

In [70]:
# Rows predicted as negative by the first layer, with their true binary label
# attached. An index join replaces `index & index`, which is no longer a set
# intersection in pandas 2.
df_not_S_or_Acc = local_X_train.loc[local_X_train['res_step_1'] == 0].join(local_y_train)

Adding the ORIENTATION column from train.

In [71]:
# Index join instead of `index & index` (no longer a set intersection in pandas 2)
df_not_S_or_Acc = df_not_S_or_Acc.join(train['ORIENTATION'])

**Training a random forest on each of those two dataframes**

In [72]:
rf_S_or_Acc = RandomForestClassifier(n_estimators=1000, max_depth=10, max_features=5, criterion='gini', random_state=42, n_jobs=4)

In [73]:
S_or_Acc_X_train, S_or_Acc_X_test, S_or_Acc_y_train, S_or_Acc_y_test = \
    train_test_split(df_S_or_Acc.drop(['ORIENTATION', 'is_S_or_Acc', 'res_step_1'], axis=1),\
                     df_S_or_Acc.loc[:, 'ORIENTATION'], \
                     test_size=1/3, \
                     random_state=42)

In [74]:
cross_val_score(rf_S_or_Acc, S_or_Acc_X_train, S_or_Acc_y_train, cv=5, n_jobs=4).mean()

0.6126845902499234

In [75]:
rf_S_or_Acc.fit(S_or_Acc_X_train, S_or_Acc_y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=4,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [76]:
rf_S_or_Acc.score(S_or_Acc_X_test, S_or_Acc_y_test)

0.5962815405046481

Nice !

Now we want to do the same on the second dataframe.

In [77]:
rf_not_S_or_Acc = RandomForestClassifier(n_estimators=1000, max_depth=10, max_features=5, criterion='gini', random_state=42, n_jobs=4)

In [78]:
not_S_or_Acc_X_train, not_S_or_Acc_X_test, not_S_or_Acc_y_train, not_S_or_Acc_y_test = \
    train_test_split(df_not_S_or_Acc.drop(['ORIENTATION', 'is_S_or_Acc', 'res_step_1'], axis=1),\
                     df_not_S_or_Acc.loc[:, 'ORIENTATION'], \
                     test_size=1/3, \
                     random_state=42)

In [79]:
not_S_or_Acc_X_train.columns

Index(['PLATEFORME', 'region', 'NATURE_DIFF', 'age', 'tranche_age',
       'situation', 'adulte_foyer', 'pers_a_charge', 'PROF', 'LOGEMENT',
       'REVENUS', 'cat_rev', 'CHARGES', 'cat_charges', 'CREDITS', 'cat_credit',
       'IMPAYES_DEBUT', 'cat_impayes', 'RAV_ouverture', 'cat_RAV_ouverture',
       'RAV_UC', 'cat_RAV_UC', 'nb_credits', 'CRD', 'gain_mediation',
       'nb_amort', 'crd_amort', 'nb_renouv', 'crd_renouv', 'nb_immo',
       'crd_immo', 'nb_rac', 'crd_rac', 'nb_autres', 'crd_autres',
       'nb_decouvert', 'crd_decouvert', 'moy_eco_jour', 'cat_moy_eco_jour'],
      dtype='object')

In [80]:
cross_val_score(rf_not_S_or_Acc, not_S_or_Acc_X_train, not_S_or_Acc_y_train, cv=5, n_jobs=4).mean()



0.7416666666666667

In [81]:
rf_not_S_or_Acc.fit(not_S_or_Acc_X_train, not_S_or_Acc_y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=4,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [82]:
rf_not_S_or_Acc.score(not_S_or_Acc_X_test, not_S_or_Acc_y_test)

0.7424242424242424

OK !

Now, let's fit both of those models on their entire sub dataframe

In [83]:
rf_S_or_Acc.fit(df_S_or_Acc.drop(['ORIENTATION', 'is_S_or_Acc', 'res_step_1'], axis=1), df_S_or_Acc.loc[:, 'ORIENTATION'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=4,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [84]:
rf_not_S_or_Acc.fit(df_not_S_or_Acc.drop(['ORIENTATION', 'is_S_or_Acc', 'res_step_1'], axis=1), df_not_S_or_Acc.loc[:, 'ORIENTATION'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=4,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

**Now we can make predictions on our local test**

First prediction : ('Surendettement' or 'Accompagnement') or anything else ?

In [85]:
rf_first_layer.predict(local_X_test)

array([1, 1, 1, ..., 1, 1, 1])

In [86]:
# Predict on the feature columns only, so that re-running this cell does not
# feed a previously added 'res_step_1' column back to the model.
local_X_test['res_step_1'] = rf_first_layer.predict(
    local_X_test.drop(columns='res_step_1', errors='ignore')
)

Generating the two dataframes we'll use for the second layer.

In [87]:
# Rows routed to the 'Surendettement'/'Accompagnement' model. The binary label
# used to be concatenated (through `index & index`, which is no longer a set
# intersection in pandas 2) and dropped right away; only the first-layer
# prediction column actually has to be removed.
df_S_or_Acc_local_test = (
    local_X_test
    .loc[local_X_test['res_step_1'] == 1]
    .drop(columns='res_step_1')
)

In [88]:
# Rows routed to the other second-layer model. The binary label used to be
# concatenated (through `index & index`, which is no longer a set intersection
# in pandas 2) and dropped right away; only the first-layer prediction column
# actually has to be removed.
df_not_S_or_Acc_local_test = (
    local_X_test
    .loc[local_X_test['res_step_1'] == 0]
    .drop(columns='res_step_1')
)

Making the two second level predictions

In [89]:
df_S_or_Acc_local_test['predict'] = rf_S_or_Acc.predict(df_S_or_Acc_local_test)

In [90]:
df_not_S_or_Acc_local_test['predict'] = rf_not_S_or_Acc.predict(df_not_S_or_Acc_local_test)

In [91]:
df_local_res = pd.concat([df_S_or_Acc_local_test, df_not_S_or_Acc_local_test], axis=0)['predict']

In [92]:
# Attach the true ORIENTATION through an index join; `index & index` is no
# longer a set intersection in pandas 2.
df_local_res = df_local_res.to_frame().join(train['ORIENTATION'])

In [93]:
# Share of rows whose two-layer prediction matches the true ORIENTATION
(df_local_res['predict'] == df_local_res['ORIENTATION']).mean()

0.5907241659886087

Well, I have to say I am a little bit sad after all this work ...

**But still, let's try on the test part ...**

By the way, if I need to do this one more time, I'll do a function !

In [94]:
# Predict on the feature columns only, so that re-running this cell does not
# feed a previously added 'res_step_1' column back to the model.
test['res_step_1'] = rf_first_layer.predict(
    test.drop(columns='res_step_1', errors='ignore')
)

In [95]:
# Test rows routed to the 'Surendettement'/'Accompagnement' model
df_S_or_Acc_test = test.loc[test['res_step_1'] == 1].drop(columns='res_step_1')

In [96]:
# Test rows routed to the other second-layer model
df_not_S_or_Acc_test = test.loc[test['res_step_1'] == 0].drop(columns='res_step_1')

In [97]:
df_S_or_Acc_test['predict'] = rf_S_or_Acc.predict(df_S_or_Acc_test)

In [98]:
df_not_S_or_Acc_test['predict'] = rf_not_S_or_Acc.predict(df_not_S_or_Acc_test)

In [99]:
df_res = pd.concat([df_S_or_Acc_test, df_not_S_or_Acc_test], axis=0)['predict']

In [100]:
df_res = pd.DataFrame(df_res)

In [101]:
# The submission file expects the prediction under the target's name
df_res = df_res.rename(columns={'predict': 'ORIENTATION'})

In [102]:
df_res.to_csv('submissions/submission_8.csv')