In [2144]:
import numpy as np
import pandas as pd

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_score

In [2145]:
def clean(self):
    # Filling NAs
    return self.fillna(self.mean()).fillna('unknown')

In [2146]:
def fit_transform(self):  
    # Encoding all the features as int, and saving an encoder by column
    return self.apply(lambda x: self.dict_encoder[x.name].fit_transform(x))

In [2147]:
def inverse_transform(self):
    # Retriving value before encoding
    return self.apply(lambda x: self.dict_encoder[x.name].inverse_transform(x))

In [2148]:
methods_and_attributes = {
    'clean' : clean,
    'dict_encoder' : defaultdict(LabelEncoder),
    'fit_transform' : fit_transform,
    'inverse_transform' : inverse_transform
}

In [2149]:
for ma in methods_and_attributes.keys():
    setattr(pd.DataFrame, ma, methods_and_attributes[ma])

In [2150]:
# Importing data
test = pd.read_csv('data/test.csv', index_col='id', na_values=['\\N', 'Non Renseigne'])
train = pd.read_csv('data/train.csv', index_col='id', na_values=['\\N', 'Non Renseigne'])

In [2151]:
# For exploration purposes
df_general = pd.concat([train, test], sort=False)

## Removing dangerous columns

In [2152]:
train = train.drop(['STRUCTURE PRESCRIPTRICE', 'year', 'month'], axis=1)

In [2153]:
test = test.drop(['STRUCTURE PRESCRIPTRICE', 'year', 'month'], axis=1)

## Filling NaNs

### Some NaNs are just zeros.

In [2154]:
def fill_stupid_nans(df):
    cols_int_float = ['gain_mediation', 'crd_decouvert', 'crd_rac', 'crd_autres', 'IMPAYES_DEBUT', 'crd_immo',\
            'crd_renouv', 'crd_amort']
    cols_str = ['cat_impayes']
    
    for col in cols_int_float:
        df[col] = df[col].fillna(0)
    
    for col in cols_str:
        df[col] = df[col].fillna('0')
        
    return df

In [2155]:
train = fill_stupid_nans(train)

In [2156]:
test = fill_stupid_nans(test)

### Regression to guess age

In [2157]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

Cleaning

In [2158]:
reg_age = LinearRegression(normalize=True)
# from sklearn.ensemble import RandomForestRegressor
# reg_age = RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42, n_jobs=4)

In [2159]:
df_age = df_general[['situation', 'PROF', 'pers_a_charge', 'REVENUS', 'age']]

In [2160]:
df_age['pers_a_charge_2'] = df_age['pers_a_charge']**2
df_age['pers_a_charge_3'] = df_age['pers_a_charge']**3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [2161]:
df_age = df_age.dropna()

In [2162]:
y_age = df_age.age

In [2163]:
df_age = pd.get_dummies(df_age.drop('age', axis=1))

In [2164]:
cross_val_score(reg_age, df_age, y_age, cv=5)

array([0.61394061, 0.6357755 , 0.63509313, 0.65926936, 0.64891669])

In [2165]:
X_train, X_test, y_train, y_test = train_test_split(df_age, y_age, test_size=0.33, random_state=42)

In [2166]:
reg_age.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [2167]:
reg_age.predict(X_test)

array([68.3984375, 68.359375 , 44.6953125, ..., 46.0625   , 43.7734375,
       43.90625  ])

Mean absolute error and mean squared error with this technique :

In [2168]:
mean_absolute_error(reg_age.predict(X_test), y_test)

6.439388800523952

In [2169]:
mean_squared_error(reg_age.predict(X_test), y_test)

66.14205918911688

What if we have filled them with the mean ?

In [2170]:
((y_age.mean() - y_age).abs()).mean()

11.49999194449218

In [2171]:
((y_age.mean() - y_age)**2).mean()

187.5272537358028

And with the median ?

In [2172]:
((y_age.median() - y_age).abs()).mean()

11.498517786561266

In [2173]:
((y_age.median() - y_age)**2).mean()

187.52816205533597

CCL : pretty useful, so lets fill the NaNs this way.

Let's fit it on the entire df

In [2174]:
reg_age.fit(df_age, y_age)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

Now let's apply it to the train and test

In [2175]:
def clean_df_age(df):
    df = df[['situation', 'PROF', 'pers_a_charge', 'REVENUS']]
    df['pers_a_charge_2'] = df['pers_a_charge']**2
    df['pers_a_charge_3'] = df['pers_a_charge']**3
    df = df.fillna('Unknown')
    df = pd.get_dummies(df)
    df = df.drop(['PROF_Unknown', 'situation_Unknown'], axis=1)
    return df

On train

In [2176]:
train_age_nans = clean_df_age(train.loc[train.age.isna(),:])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [2177]:
train.loc[train.age.isna(), 'age'] = reg_age.predict(train_age_nans)

On test

In [2178]:
test_age_nans = clean_df_age(test.loc[test.age.isna(), :])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


As there was no pro, the PROF_pro column hasn't been created, thus we have to do it.

In [2179]:
test_age_nans['PROF_pro'] = 0

In [2180]:
test.loc[test.age.isna(), 'age'] = reg_age.predict(test_age_nans)

### Filling tranche_age given age

Fixing tranche_age name problems

In [2181]:
def rename_tranche_age(df):
    df.tranche_age.replace('35-34ans', '25-34ans', inplace=True)
    df.tranche_age.replace('40-44ans', '35-44ans', inplace=True)
    df.tranche_age.replace('75ans_et+', '>75ans', inplace=True)
    return df

In [2182]:
def assign_tranche_age_to_age(x):
    age = float(x['age'])
    
    if age < 25:
        x['tranche_age'] = '<25ans'
    elif 25 <= age < 35:
        x['tranche_age'] = '25-34ans'
    elif 35 <= age < 45:
        x['tranche_age'] = '35-44ans'
    elif 45 <= age < 55:
        x['tranche_age'] = '45-54ans'
    elif 55 <= age < 65:
        x['tranche_age'] = '55-64ans'
    elif 65 <= age < 75:
        x['tranche_age'] = '65-75ans'
    elif age > 75:
        x['tranche_age'] = '>75ans'
        
    return x

In [2183]:
def clean_tranche_age_with_age(df):
    df = rename_tranche_age(df)
    df.loc[df.tranche_age.isna(), ['age', 'tranche_age']] = df.loc[df.tranche_age.isna(), ['age', 'tranche_age']].apply(assign_tranche_age_to_age, axis=1)
    return df

In [2184]:
train = clean_tranche_age_with_age(train)

In [2185]:
test = clean_tranche_age_with_age(test)

### Dealing with column types

In [2186]:
categories = [key for key in dict(train.dtypes) \
              if (dict(train.dtypes)[key] in [object]) \
              and (key not in ['RAV_UC', 'CRD', 'moy_eco_jour']) ]

col_floats = ['RAV_UC', 'CRD', 'moy_eco_jour']

In [2187]:
def clean_col_types(df, is_y=False):
    
    local_categories = categories.copy()
    
    if is_y:
        local_categories.remove('ORIENTATION')
    
    # Replacing the commas by dots
    df.loc[:, col_floats] = df.loc[:, col_floats].apply(lambda x : x.str.replace(',', '.'))
    
    # Selecting types
    df.loc[:, local_categories] = df.loc[:, local_categories].astype('category')
    df.loc[:, col_floats] = df.loc[:, col_floats].astype(float)
    
    return df

In [2188]:
test = clean_col_types(test, is_y=True)

In [2189]:
train = clean_col_types(train)

### Testing random forest

We want to drop nans for the train

In [2190]:
train = train.dropna()

But for the test we have to find something else : we will fill catagorial columns with a new category : unknown, and the numerical ones with their mean.

In [2191]:
# Columns in which we want to add an unknown column (we can't drop nans because it s the test)
cols_nans_unknown = ['PROF', 'NATURE_DIFF', 'cat_moy_eco_jour', 'cat_RAV_UC', 'situation', 'LOGEMENT', 'region', 'cat_credit']

In [2192]:
for elt in cols_nans_unknown:
    test[elt] = test[elt].cat.add_categories(['unknown'])

In [2193]:
test.loc[:,cols_nans_unknown] = test.loc[:,cols_nans_unknown].fillna('unknown')

In [2194]:
test = test.fillna(test.mean())

In [2195]:
category_cols = [key for key in dict(train.dtypes) if (dict(train.dtypes)[key] not in ['float64', 'int64']) \
                 and (key not in ['ORIENTATION'])]

In [2196]:
train[category_cols] = train[category_cols].fit_transform()

In [2197]:
test[category_cols] = test[category_cols].fit_transform()

No we can use our model.

#### RandomForestClassifier

In [2198]:
# cv : 0.5752690300946961
# local_test : 0.5907241659886087
#train = train[[col for col in train.columns if not 'cat_' in col]]


# train = train[[col for col in train.columns if not 'cat_' in col]]

In [2199]:
train.ORIENTATION.value_counts(normalize=True)

Surendettement                   0.414654
Accompagnement                   0.405699
Aucune                           0.087110
Mediation                        0.084396
Autres Procédures Collectives    0.007056
Microcredit                      0.001085
Name: ORIENTATION, dtype: float64

Removing *Autres Procédures Collectives* and *Microcredit* because they are very badly predicted with the following method.

In [2200]:
# train = train.loc[~train.ORIENTATION.isin(['Autres Procédures Collectives', 'Microcredit']),:]
# train.ORIENTATION = train.ORIENTATION.cat.remove_categories(['Autres Procédures Collectives', 'Microcredit'])

In [2201]:
train.columns

Index(['PLATEFORME', 'region', 'ORIENTATION', 'NATURE_DIFF', 'age',
       'tranche_age', 'situation', 'adulte_foyer', 'pers_a_charge', 'PROF',
       'LOGEMENT', 'REVENUS', 'cat_rev', 'CHARGES', 'cat_charges', 'CREDITS',
       'cat_credit', 'IMPAYES_DEBUT', 'cat_impayes', 'RAV_ouverture',
       'cat_RAV_ouverture', 'RAV_UC', 'cat_RAV_UC', 'nb_credits', 'CRD',
       'gain_mediation', 'nb_amort', 'crd_amort', 'nb_renouv', 'crd_renouv',
       'nb_immo', 'crd_immo', 'nb_rac', 'crd_rac', 'nb_autres', 'crd_autres',
       'nb_decouvert', 'crd_decouvert', 'moy_eco_jour', 'cat_moy_eco_jour'],
      dtype='object')

In [2202]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

In [2203]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=10, max_features=5, criterion='gini', random_state=42, n_jobs=4)

In [2204]:
moc = MultiOutputClassifier(rf)

In [2205]:
# orientation_cols = [col for col in dummified_train if col.startswith('ORIENTATION')]

In [2206]:
# Creating local train and test
# local_X_train, local_X_test, local_y_train, local_y_test = \
#     train_test_split(dummified_train.drop(orientation_cols, axis=1),\
#                      dummified_train.loc[:, orientation_cols], \
#                      test_size=1/3, \
#                      random_state=42)

local_X_train, local_X_test, local_y_train, local_y_test = \
    train_test_split(train.drop('ORIENTATION', axis=1),\
                     train.loc[:, 'ORIENTATION'], \
                     test_size=1/3, \
                     random_state=42)

X_train_first, X_train_second, y_train_first, y_train_second = \
    train_test_split(local_X_train,\
                     local_y_train, \
                     test_size=1/3, \
                     random_state=42)

Training our MoC on the first part of the local train.

In [2207]:
moc.fit(X_train_first, pd.get_dummies(y_train_first))

MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=4,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
           n_jobs=None)

Predicting on the second part of the local train : crating a dataframe with the probas.

In [2208]:
df_probas = pd.DataFrame()
for i, elt in enumerate(moc.predict_proba(X_train_second)):
    col_name = pd.get_dummies(y_train_first).columns[i]
    buff_df = pd.DataFrame(columns=[f"{col_name}_true", f"{col_name}_false"], data=elt, index=X_train_second.index)
    df_probas = pd.concat([df_probas, buff_df], axis=1)

In [2209]:
df_probas = df_probas[[col for col in df_probas if not 'false' in col]]

In [2210]:
df_probas.head()

Unnamed: 0_level_0,Accompagnement_true,Aucune_true,Autres Procédures Collectives_true,Mediation_true,Microcredit_true,Surendettement_true
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
653,0.775431,0.769165,0.997306,0.986333,1.0,0.442875
3180,0.273706,0.949968,0.999997,0.933689,1.0,0.912111
2194,0.240672,0.980865,0.999834,0.963826,1.0,0.763275
1035,0.842103,0.740976,0.997868,0.963945,1.0,0.345367
2550,0.789229,0.935038,0.996,0.893062,1.0,0.42256


Let's add some columns to this dataframe

In [2211]:
for col in df_probas.columns:
    df_probas[f"{col}_log"] = np.log(df_probas[col])
    df_probas[f"{col}**2"] = df_probas[col]**2
    df_probas[f"{col}**3"] = df_probas[col]**3
    df_probas["acc_sur_by_others"] = (df_probas['Accompagnement_true']*df_probas['Surendettement_true'])/(df_probas['Aucune_true']*df_probas['Autres Procédures Collectives_true']*df_probas['Mediation_true']*df_probas['Microcredit_true'])


In [2212]:
df_probas.head()

Unnamed: 0_level_0,Accompagnement_true,Aucune_true,Autres Procédures Collectives_true,Mediation_true,Microcredit_true,Surendettement_true,Accompagnement_true_log,Accompagnement_true**2,Accompagnement_true**3,acc_sur_by_others,...,Autres Procédures Collectives_true**3,Mediation_true_log,Mediation_true**2,Mediation_true**3,Microcredit_true_log,Microcredit_true**2,Microcredit_true**3,Surendettement_true_log,Surendettement_true**2,Surendettement_true**3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
653,0.775431,0.769165,0.997306,0.986333,1.0,0.442875,-0.254336,0.601293,0.466261,0.453892,...,0.991941,-0.013761,0.972853,0.959556,0.0,1.0,1.0,-0.814468,0.196138,0.086865
3180,0.273706,0.949968,0.999997,0.933689,1.0,0.912111,-1.2957,0.074915,0.020505,0.281464,...,0.99999,-0.068612,0.871775,0.813967,0.0,1.0,1.0,-0.091994,0.831946,0.758827
2194,0.240672,0.980865,0.999834,0.963826,1.0,0.763275,-1.42432,0.057923,0.01394,0.194344,...,0.999502,-0.036845,0.92896,0.895356,0.0,1.0,1.0,-0.270136,0.582589,0.444676
1035,0.842103,0.740976,0.997868,0.963945,1.0,0.345367,-0.171853,0.709138,0.597167,0.408053,...,0.993617,-0.036721,0.929191,0.895689,0.0,1.0,1.0,-1.063147,0.119279,0.041195
2550,0.789229,0.935038,0.996,0.893062,1.0,0.42256,-0.236699,0.622883,0.491597,0.400978,...,0.988048,-0.113099,0.797559,0.71227,0.0,1.0,1.0,-0.861425,0.178557,0.075451


Training a second model on top of this dataframe

In [2213]:
from sklearn.linear_model import LogisticRegression

In [2214]:
# test : 0.5939788445890968
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=100, random_state=42)

# rf_second = RandomForestClassifier(n_estimators=1000, max_depth=10, max_features=5, criterion='gini', random_state=42, n_jobs=4)

In [2215]:
clf = lr

In [2216]:
clf.fit(df_probas, y_train_second)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Now lets test the result with our local test

In [2217]:
df_probas_test = pd.DataFrame()
for i, elt in enumerate(moc.predict_proba(local_X_test)):
    col_name = pd.get_dummies(y_train_first).columns[i]
    buff_df = pd.DataFrame(columns=[f"{col_name}_true", f"{col_name}_false"], data=elt, index=local_X_test.index)
    df_probas_test = pd.concat([df_probas_test, buff_df], axis=1)

In [2218]:
df_probas_test = df_probas_test[[col for col in df_probas_test if not 'false' in col]]

In [2219]:
df_probas_test.head()

Unnamed: 0_level_0,Accompagnement_true,Aucune_true,Autres Procédures Collectives_true,Mediation_true,Microcredit_true,Surendettement_true
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2756,0.466794,0.974184,0.999947,0.997417,1.0,0.564992
3308,0.673917,0.978954,0.997933,0.997374,1.0,0.317441
1363,0.769034,0.888748,0.989976,0.973955,0.999,0.30764
6207,0.693504,0.899728,0.888,0.588551,1.0,0.619107
4434,0.572944,0.970149,0.991995,0.999655,1.0,0.500824


In [2220]:
for col in df_probas_test.columns:
    df_probas_test[f"{col}_log"] = np.log(df_probas_test[col])
    df_probas_test[f"{col}**2"] = df_probas_test[col]**2
    df_probas_test[f"{col}**3"] = df_probas_test[col]**3
    df_probas_test["acc_sur_by_others"] = (df_probas_test['Accompagnement_true']*df_probas_test['Surendettement_true'])/(df_probas_test['Aucune_true']*df_probas_test['Autres Procédures Collectives_true']*df_probas_test['Mediation_true']*df_probas_test['Microcredit_true'])

In [2221]:
clf.score(df_probas_test, local_y_test)

0.5947925142392189

In [2222]:
def pred(row):
    max_col = ('', 0)
    for val in df_probas_test.columns:
        if max_col[1] < row[val]:
            max_col = (val, row[val])
    return max_col[0].replace('_true', '')

In [2223]:
df_probas_test['res'] = df_probas_test.apply(pred, axis=1)

In [2224]:
df_probas_test.head()

Unnamed: 0_level_0,Accompagnement_true,Aucune_true,Autres Procédures Collectives_true,Mediation_true,Microcredit_true,Surendettement_true,Accompagnement_true_log,Accompagnement_true**2,Accompagnement_true**3,acc_sur_by_others,...,Mediation_true_log,Mediation_true**2,Mediation_true**3,Microcredit_true_log,Microcredit_true**2,Microcredit_true**3,Surendettement_true_log,Surendettement_true**2,Surendettement_true**3,res
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2756,0.466794,0.974184,0.999947,0.997417,1.0,0.564992,-0.761867,0.217897,0.101713,0.271439,...,-0.002586,0.994841,0.992271,0.0,1.0,1.0,-0.570944,0.319216,0.180354,Microcredit
3308,0.673917,0.978954,0.997933,0.997374,1.0,0.317441,-0.394649,0.454164,0.306069,0.219557,...,-0.002629,0.994755,0.992143,0.0,1.0,1.0,-1.147464,0.100769,0.031988,Microcredit
1363,0.769034,0.888748,0.989976,0.973955,0.999,0.30764,-0.26262,0.591413,0.454817,0.276364,...,-0.02639,0.948588,0.923882,-0.001001,0.998001,0.997003,-1.178825,0.094642,0.029116,Microcredit
6207,0.693504,0.899728,0.888,0.588551,1.0,0.619107,-0.365999,0.480947,0.333539,0.913074,...,-0.530092,0.346392,0.203869,0.0,1.0,1.0,-0.479477,0.383293,0.2373,Microcredit
4434,0.572944,0.970149,0.991995,0.999655,1.0,0.500824,-0.556968,0.328265,0.188077,0.298263,...,-0.000345,0.999311,0.998967,0.0,1.0,1.0,-0.691501,0.250824,0.125619,Microcredit


In [2225]:
df_probas_test.sort_index(inplace=True)
local_y_test.sort_index(inplace=True)
(df_probas_test.res == local_y_test).sum()/len(local_y_test)

0.0

Generating results

In [2232]:
df_probas_test = pd.DataFrame()
for i, elt in enumerate(moc.predict_proba(test)):
    col_name = pd.get_dummies(y_train_first).columns[i]
    buff_df = pd.DataFrame(columns=[f"{col_name}_true", f"{col_name}_false"], data=elt, index=test.index)
    df_probas_test = pd.concat([df_probas_test, buff_df], axis=1)

In [2237]:
df_probas_test = df_probas_test[[col for col in df_probas_test if not 'false' in col]]

In [None]:
for col in df_probas_test.columns:
    df_probas_test[f"{col}_log"] = np.log(df_probas_test[col])
    df_probas_test[f"{col}**2"] = df_probas_test[col]**2
    df_probas_test[f"{col}**3"] = df_probas_test[col]**3
    df_probas_test["acc_sur_by_others"] = (df_probas_test['Accompagnement_true']*df_probas_test['Surendettement_true'])/(df_probas_test['Aucune_true']*df_probas_test['Autres Procédures Collectives_true']*df_probas_test['Mediation_true']*df_probas_test['Microcredit_true'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [2235]:
df_res = pd.DataFrame(index=test.index, data={'ORIENTATION':clf.predict(df_probas_test)})

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [2227]:
# df_res.to_csv('submissions/submission_9.csv')