In [1]:
import numpy as np
import pandas as pd

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score

In [2]:
def clean(self):
    # Filling NAs
    return self.fillna(self.mean()).fillna('unknown')

In [3]:
def fit_transform(self):  
    # Encoding all the features as int, and saving an encoder by column
    return self.apply(lambda x: self.dict_encoder[x.name].fit_transform(x))

In [4]:
def inverse_transform(self):
    # Retriving value before encoding
    return self.apply(lambda x: self.dict_encoder[x.name].inverse_transform(x))

In [5]:
methods_and_attributes = {
    'clean' : clean,
    'dict_encoder' : defaultdict(LabelEncoder),
    'fit_transform' : fit_transform,
    'inverse_transform' : inverse_transform
}

In [6]:
for ma in methods_and_attributes.keys():
    setattr(pd.DataFrame, ma, methods_and_attributes[ma])

In [7]:
# Importing data
test = pd.read_csv('data/test.csv', index_col='id', na_values=['\\N', 'Non Renseigne'])
train = pd.read_csv('data/train.csv', index_col='id', na_values=['\\N', 'Non Renseigne'])

In [8]:
# For exploration purposes
df_general = pd.concat([train, test], sort=False)

## Removing dangerous columns

In [9]:
train = train.drop(['STRUCTURE PRESCRIPTRICE', 'year', 'month'], axis=1)

In [10]:
test = test.drop(['STRUCTURE PRESCRIPTRICE', 'year', 'month'], axis=1)

## Filling NaNs

### Some NaNs are just zeros.

In [11]:
def fill_stupid_nans(df):
    for col in ['gain_mediation', 'crd_decouvert', 'crd_rac', 'crd_autres', 'cat_impayes', 'IMPAYES_DEBUT', 'crd_immo',\
            'crd_renouv', 'crd_amort']:
        df[col] = df[col].fillna(0)
    return df

In [12]:
train = fill_stupid_nans(train)

In [13]:
test = fill_stupid_nans(test)

### Regression to guess age

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

Cleaning

In [15]:
reg_age = LinearRegression(normalize=True)
# from sklearn.ensemble import RandomForestRegressor
# reg_age = RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42, n_jobs=4)

In [16]:
df_age = df_general[['situation', 'PROF', 'pers_a_charge', 'REVENUS', 'age']]

In [17]:
df_age['pers_a_charge_2'] = df_age['pers_a_charge']**2
df_age['pers_a_charge_3'] = df_age['pers_a_charge']**3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [18]:
df_age = df_age.dropna()

In [19]:
y_age = df_age.age

In [20]:
df_age = pd.get_dummies(df_age.drop('age', axis=1))

In [21]:
cross_val_score(reg_age, df_age, y_age, cv=5)

array([0.61394061, 0.6357755 , 0.63509313, 0.65926936, 0.64891669])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df_age, y_age, test_size=0.33, random_state=42)

In [23]:
reg_age.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [24]:
reg_age.predict(X_test)

array([68.3984375, 68.359375 , 44.6953125, ..., 46.0625   , 43.7734375,
       43.90625  ])

Mean absolute error and mean squared error with this technique :

In [25]:
mean_absolute_error(reg_age.predict(X_test), y_test)

6.439388800523952

In [26]:
mean_squared_error(reg_age.predict(X_test), y_test)

66.14205918911688

What if we have filled them with the mean ?

In [27]:
((y_age.mean() - y_age).abs()).mean()

11.49999194449218

In [28]:
((y_age.mean() - y_age)**2).mean()

187.5272537358028

And with the median ?

In [29]:
((y_age.median() - y_age).abs()).mean()

11.498517786561266

In [30]:
((y_age.median() - y_age)**2).mean()

187.52816205533597

CCL : pretty useful, so lets fill the NaNs this way.

Let's fit it on the entire df

In [31]:
reg_age.fit(df_age, y_age)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

Now let's apply it to the train and test

In [32]:
def clean_df_age(df):
    df = df[['situation', 'PROF', 'pers_a_charge', 'REVENUS']]
    df['pers_a_charge_2'] = df['pers_a_charge']**2
    df['pers_a_charge_3'] = df['pers_a_charge']**3
    df = df.fillna('Unknown')
    df = pd.get_dummies(df)
    df = df.drop(['PROF_Unknown', 'situation_Unknown'], axis=1)
    return df

On train

In [33]:
train_age_nans = clean_df_age(train.loc[train.age.isna(),:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [34]:
train.loc[train.age.isna(), 'age'] = reg_age.predict(train_age_nans)

On test

In [35]:
test_age_nans = clean_df_age(test.loc[test.age.isna(), :])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


As there was no pro, the PROF_pro column hasn't been created, thus we have to do it.

In [36]:
test_age_nans['PROF_pro'] = 0

In [37]:
test.loc[test.age.isna(), 'age'] = reg_age.predict(test_age_nans)

### Filling tranche_age given age

Fixing tranche_age name problems

In [38]:
def rename_tranche_age(df):
    df.tranche_age.replace('35-34ans', '25-34ans', inplace=True)
    df.tranche_age.replace('40-44ans', '35-44ans', inplace=True)
    df.tranche_age.replace('75ans_et+', '>75ans', inplace=True)
    return df

In [39]:
def assign_tranche_age_to_age(x):
    age = float(x['age'])
    
    if age < 25:
        x['tranche_age'] = '<25ans'
    elif 25 <= age < 35:
        x['tranche_age'] = '25-34ans'
    elif 35 <= age < 45:
        x['tranche_age'] = '35-44ans'
    elif 45 <= age < 55:
        x['tranche_age'] = '45-54ans'
    elif 55 <= age < 65:
        x['tranche_age'] = '55-64ans'
    elif 65 <= age < 75:
        x['tranche_age'] = '65-75ans'
    elif age > 75:
        x['tranche_age'] = '>75ans'
        
    return x

In [40]:
def clean_tranche_age_with_age(df):
    df = rename_tranche_age(df)
    df.loc[df.tranche_age.isna(), ['age', 'tranche_age']] = df.loc[df.tranche_age.isna(), ['age', 'tranche_age']].apply(assign_tranche_age_to_age, axis=1)
    return df

In [41]:
train = clean_tranche_age_with_age(train)

In [42]:
test = clean_tranche_age_with_age(test)

### Dealing with column types

In [43]:
categories = [key for key in dict(train.dtypes) \
              if (dict(train.dtypes)[key] in [object]) \
              and (key not in ['RAV_UC', 'CRD', 'moy_eco_jour']) ]

col_floats = ['RAV_UC', 'CRD', 'moy_eco_jour']

In [44]:
def clean_col_types(df, is_y=False):
    
    local_categories = categories.copy()
    
    if is_y:
        local_categories.remove('ORIENTATION')
    
    # Replacing the commas by dots
    df.loc[:, col_floats] = df.loc[:, col_floats].apply(lambda x : x.str.replace(',', '.'))
    
    # Selecting types
    df.loc[:, local_categories] = df.loc[:, local_categories].astype('category')
    df.loc[:, col_floats] = df.loc[:, col_floats].astype(float)
    
    return df

In [45]:
test = clean_col_types(test, is_y=True)

In [46]:
train = clean_col_types(train)

### Testing random forest

We want to drop nans for the train

In [47]:
train = train.dropna()

But for the test we have to find something else : we will fill catagorial columns with a new category : unknown, and the numerical ones with their mean.

In [48]:
# Columns in which we want to add an unknown column (we can't drop nans because it s the test)
cols_nans_unknown = ['PROF', 'NATURE_DIFF', 'cat_moy_eco_jour', 'cat_RAV_UC', 'situation', 'LOGEMENT', 'region', 'cat_credit']

In [49]:
for elt in cols_nans_unknown:
    test[elt] = test[elt].cat.add_categories(['unknown'])

In [50]:
test.loc[:,cols_nans_unknown] = test.loc[:,cols_nans_unknown].fillna('unknown')

In [51]:
test = test.fillna(test.mean())

Let's now dummify those dataframes so that we can use almost any models on them.

In [52]:
dummified_train = pd.get_dummies(train.loc[:,train.columns.difference(['ORIENTATION'])])
dummified_train['ORIENTATION'] = train.loc[:,'ORIENTATION']

In [53]:
dummified_test = pd.get_dummies(test)

The colomns that are in train but not in test will raise errors when we will want to call predict. Thus, we create them with all the column at 0.

In [54]:
diff_cols = [elt for elt in dummified_train.columns if elt not in dummified_test.columns if elt != 'ORIENTATION']

In [55]:
for col in diff_cols:
    dummified_test[col] = 0

Now the opposite case is also happenig : some columns exists in dummified_test but not in dummified_train. Thus we have to remove them.

In [56]:
diff_cols_inv = [elt for elt in dummified_test.columns if elt not in dummified_train.columns]

In [57]:
dummified_test = dummified_test.drop(diff_cols_inv, axis=1)

No we can use our model.

#### RandomForestClassifier

In [58]:
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

In [59]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=10, criterion='entropy', random_state=42, n_jobs=4)

In [60]:
xb = xgb.XGBClassifier(objective='multi:softprob', n_estimators=100, learning_rate=0.05, n_jobs=4, random_state=42)

In [61]:
# orientation_cols = [col for col in dummified_train if col.startswith('ORIENTATION')]

In [62]:
# Creating local train and test
# local_X_train, local_X_test, local_y_train, local_y_test = \
#     train_test_split(dummified_train.drop(orientation_cols, axis=1),\
#                      dummified_train.loc[:, orientation_cols], \
#                      test_size=1/3, \
#                      random_state=42)

local_X_train, local_X_test, local_y_train, local_y_test = \
    train_test_split(dummified_train.drop('ORIENTATION', axis=1),\
                     dummified_train.loc[:, 'ORIENTATION'], \
                     test_size=1/3, \
                     random_state=42)

In [63]:
# Removing special chars for xgboost
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)


# local train and test
local_X_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in local_X_train.columns.values]
local_X_test.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in local_X_test.columns.values]

# dummified train
dummified_train.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in dummified_train.columns.values]
dummified_test.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in dummified_test.columns.values]

In [64]:
# cross_val_score(xb, local_X_train, local_y_train, cv=5, n_jobs=4).mean()

In [65]:
# xb.fit(local_X_train, local_y_train)

In [66]:
# xb.score(local_X_test, local_y_test)

In [67]:
dummified_train_X = dummified_train.drop('ORIENTATION', axis=1).copy()
dummified_train_y = dummified_train.loc[:, 'ORIENTATION'].copy()

In [68]:
xb.fit(dummified_train_X, dummified_train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=4, nthread=None, objective='multi:softprob', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

To fix a xgboost error : "features_names_mismatch", we reindex our dummified datasets

In [69]:
dummified_test = dummified_test[dummified_train_X.columns]
# dummified_test = dummified_test.reindex(sorted(dummified_test.columns), axis=1)
# dummified_train_X = dummified_train_X.reindex(sorted(dummified_train_X.columns), axis=1)

In [70]:
df_res = pd.DataFrame(index=dummified_test.index, data={'ORIENTATION':xb.predict(dummified_test)})

In [71]:
df_res.to_csv('submissions/submission_5.csv')