In [602]:
import numpy as np
import pandas as pd

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score

In [603]:
def clean(self):
    # Filling NAs
    return self.fillna(self.mean()).fillna('unknown')

In [604]:
def fit_transform(self):  
    # Encoding all the features as int, and saving an encoder by column
    return self.apply(lambda x: self.dict_encoder[x.name].fit_transform(x))

In [605]:
def inverse_transform(self):
    # Retriving value before encoding
    return self.apply(lambda x: self.dict_encoder[x.name].inverse_transform(x))

In [606]:
methods_and_attributes = {
    'clean' : clean,
    'dict_encoder' : defaultdict(LabelEncoder),
    'fit_transform' : fit_transform,
    'inverse_transform' : inverse_transform
}

In [607]:
for ma in methods_and_attributes.keys():
    setattr(pd.DataFrame, ma, methods_and_attributes[ma])

In [608]:
# Importing data
test = pd.read_csv('data/test.csv', index_col='id', na_values=['\\N', 'Non Renseigne'])
train = pd.read_csv('data/train.csv', index_col='id', na_values=['\\N', 'Non Renseigne'])

In [609]:
# For exploration purposes
df_general = pd.concat([train, test], sort=False)

## Removing dangerous columns

In [610]:
train = train.drop(['STRUCTURE PRESCRIPTRICE', 'year', 'month'], axis=1)

In [611]:
test = test.drop(['STRUCTURE PRESCRIPTRICE', 'year', 'month'], axis=1)

## Filling NaNs

### Some NaNs are just zeros.

In [612]:
def fill_stupid_nans(df):
    cols_int_float = ['gain_mediation', 'crd_decouvert', 'crd_rac', 'crd_autres', 'IMPAYES_DEBUT', 'crd_immo',\
            'crd_renouv', 'crd_amort']
    cols_str = ['cat_impayes']
    
    for col in cols_int_float:
        df[col] = df[col].fillna(0)
    
    for col in cols_str:
        df[col] = df[col].fillna('0')
        
    return df

In [613]:
train = fill_stupid_nans(train)

In [614]:
test = fill_stupid_nans(test)

### Regression to guess age

In [615]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

Cleaning

In [616]:
reg_age = LinearRegression(normalize=True)
# from sklearn.ensemble import RandomForestRegressor
# reg_age = RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42, n_jobs=4)

In [617]:
df_age = df_general[['situation', 'PROF', 'pers_a_charge', 'REVENUS', 'age']]

In [618]:
df_age['pers_a_charge_2'] = df_age['pers_a_charge']**2
df_age['pers_a_charge_3'] = df_age['pers_a_charge']**3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [619]:
df_age = df_age.dropna()

In [620]:
y_age = df_age.age

In [621]:
df_age = pd.get_dummies(df_age.drop('age', axis=1))

In [622]:
cross_val_score(reg_age, df_age, y_age, cv=5)

array([0.61394061, 0.6357755 , 0.63509313, 0.65926936, 0.64891669])

In [623]:
X_train, X_test, y_train, y_test = train_test_split(df_age, y_age, test_size=0.33, random_state=42)

In [624]:
reg_age.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [625]:
reg_age.predict(X_test)

array([68.3984375, 68.359375 , 44.6953125, ..., 46.0625   , 43.7734375,
       43.90625  ])

Mean absolute error and mean squared error with this technique :

In [626]:
mean_absolute_error(reg_age.predict(X_test), y_test)

6.439388800523952

In [627]:
mean_squared_error(reg_age.predict(X_test), y_test)

66.14205918911688

What if we have filled them with the mean ?

In [628]:
((y_age.mean() - y_age).abs()).mean()

11.49999194449218

In [629]:
((y_age.mean() - y_age)**2).mean()

187.5272537358028

And with the median ?

In [630]:
((y_age.median() - y_age).abs()).mean()

11.498517786561266

In [631]:
((y_age.median() - y_age)**2).mean()

187.52816205533597

CCL : pretty useful, so lets fill the NaNs this way.

Let's fit it on the entire df

In [632]:
reg_age.fit(df_age, y_age)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

Now let's apply it to the train and test

In [633]:
def clean_df_age(df):
    df = df[['situation', 'PROF', 'pers_a_charge', 'REVENUS']]
    df['pers_a_charge_2'] = df['pers_a_charge']**2
    df['pers_a_charge_3'] = df['pers_a_charge']**3
    df = df.fillna('Unknown')
    df = pd.get_dummies(df)
    df = df.drop(['PROF_Unknown', 'situation_Unknown'], axis=1)
    return df

On train

In [634]:
train_age_nans = clean_df_age(train.loc[train.age.isna(),:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [635]:
train.loc[train.age.isna(), 'age'] = reg_age.predict(train_age_nans)

On test

In [636]:
test_age_nans = clean_df_age(test.loc[test.age.isna(), :])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


As there was no pro, the PROF_pro column hasn't been created, thus we have to do it.

In [637]:
test_age_nans['PROF_pro'] = 0

In [638]:
test.loc[test.age.isna(), 'age'] = reg_age.predict(test_age_nans)

### Filling tranche_age given age

Fixing tranche_age name problems

In [639]:
def rename_tranche_age(df):
    df.tranche_age.replace('35-34ans', '25-34ans', inplace=True)
    df.tranche_age.replace('40-44ans', '35-44ans', inplace=True)
    df.tranche_age.replace('75ans_et+', '>75ans', inplace=True)
    return df

In [640]:
def assign_tranche_age_to_age(x):
    age = float(x['age'])
    
    if age < 25:
        x['tranche_age'] = '<25ans'
    elif 25 <= age < 35:
        x['tranche_age'] = '25-34ans'
    elif 35 <= age < 45:
        x['tranche_age'] = '35-44ans'
    elif 45 <= age < 55:
        x['tranche_age'] = '45-54ans'
    elif 55 <= age < 65:
        x['tranche_age'] = '55-64ans'
    elif 65 <= age < 75:
        x['tranche_age'] = '65-75ans'
    elif age > 75:
        x['tranche_age'] = '>75ans'
        
    return x

In [641]:
def clean_tranche_age_with_age(df):
    df = rename_tranche_age(df)
    df.loc[df.tranche_age.isna(), ['age', 'tranche_age']] = df.loc[df.tranche_age.isna(), ['age', 'tranche_age']].apply(assign_tranche_age_to_age, axis=1)
    return df

In [642]:
train = clean_tranche_age_with_age(train)

In [643]:
test = clean_tranche_age_with_age(test)

### Dealing with column types

In [644]:
categories = [key for key in dict(train.dtypes) \
              if (dict(train.dtypes)[key] in [object]) \
              and (key not in ['RAV_UC', 'CRD', 'moy_eco_jour']) ]

col_floats = ['RAV_UC', 'CRD', 'moy_eco_jour']

In [645]:
def clean_col_types(df, is_y=False):
    
    local_categories = categories.copy()
    
    if is_y:
        local_categories.remove('ORIENTATION')
    
    # Replacing the commas by dots
    df.loc[:, col_floats] = df.loc[:, col_floats].apply(lambda x : x.str.replace(',', '.'))
    
    # Selecting types
    df.loc[:, local_categories] = df.loc[:, local_categories].astype('category')
    df.loc[:, col_floats] = df.loc[:, col_floats].astype(float)
    
    return df

In [646]:
test = clean_col_types(test, is_y=True)

In [647]:
train = clean_col_types(train)

### Testing random forest

We want to drop nans for the train

In [648]:
train = train.dropna()

But for the test we have to find something else : we will fill catagorial columns with a new category : unknown, and the numerical ones with their mean.

In [649]:
# Columns in which we want to add an unknown column (we can't drop nans because it s the test)
cols_nans_unknown = ['PROF', 'NATURE_DIFF', 'cat_moy_eco_jour', 'cat_RAV_UC', 'situation', 'LOGEMENT', 'region', 'cat_credit']

In [650]:
for elt in cols_nans_unknown:
    test[elt] = test[elt].cat.add_categories(['unknown'])

In [651]:
test.loc[:,cols_nans_unknown] = test.loc[:,cols_nans_unknown].fillna('unknown')

In [652]:
test = test.fillna(test.mean())

No we can use our model.

In [653]:
# train = train.fit_transform()

In [654]:
category_cols = [key for key in dict(train.dtypes) if (dict(train.dtypes)[key] not in ['float64', 'int64']) \
                 and (key not in ['ORIENTATION'])]

In [655]:
train[category_cols] = train[category_cols].fit_transform()

In [656]:
test[category_cols] = test[category_cols].fit_transform()

#### RandomForestClassifier

In [657]:
train.ORIENTATION.value_counts(normalize=True)

Surendettement                   0.414654
Accompagnement                   0.405699
Aucune                           0.087110
Mediation                        0.084396
Autres Procédures Collectives    0.007056
Microcredit                      0.001085
Name: ORIENTATION, dtype: float64

In [658]:
from sklearn.ensemble import RandomForestClassifier

In [659]:
train.loc[:, 'ORIENTATION'] = train.loc[train.ORIENTATION.isin(['Surendettement', 'Accompagnement']), 'ORIENTATION']
train.dropna(inplace=True)

In [660]:
train.ORIENTATION.value_counts(normalize=True)

Surendettement                   0.505458
Accompagnement                   0.494542
Microcredit                      0.000000
Mediation                        0.000000
Autres Procédures Collectives    0.000000
Aucune                           0.000000
Name: ORIENTATION, dtype: float64

In [661]:
# train.ORIENTATION = train.ORIENTATION.cat.remove_categories(['Microcredit', 'Mediation', 'Autres Procédures Collectives', 'Aucune'])

In [662]:
# train.ORIENTATION.value_counts(normalize=True)

#### RandomForestClassifier

In [663]:
# rf = RandomForestClassifier(n_estimators=1000, max_depth=10, max_features=5, criterion='gini', random_state=42, n_jobs=4)
rf = RandomForestClassifier(n_estimators=1000, max_depth=10, max_features=5, criterion='gini', random_state=42, n_jobs=4)

In [664]:
# orientation_cols = [col for col in dummified_train if col.startswith('ORIENTATION')]

In [665]:
# Creating local train and test
# local_X_train, local_X_test, local_y_train, local_y_test = \
#     train_test_split(dummified_train.drop(orientation_cols, axis=1),\
#                      dummified_train.loc[:, orientation_cols], \
#                      test_size=1/3, \
#                      random_state=42)

local_X_train, local_X_test, local_y_train, local_y_test = \
    train_test_split(train.drop('ORIENTATION', axis=1),\
                     train.loc[:, 'ORIENTATION'], \
                     test_size=1/3, \
                     random_state=42)

In [666]:
cross_val_score(rf, local_X_train, local_y_train, cv=5, n_jobs=4).mean()

0.7007444168734491

In [667]:
rf.fit(local_X_train, local_y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=4,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [668]:
rf.score(local_X_test, local_y_test)

0.6726190476190477

In [399]:
df_res = pd.DataFrame(index=test.index, data={'ORIENTATION':rf.predict(test)})

In [400]:
df_res.to_csv('submissions/submission_7.csv')