In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the monthly series (one row per id_bien and month) into `immo`.
immo = pd.read_csv(filepath_or_buffer='../../data/ml/train_immo_100.csv')

In [4]:
# Preview the first 20 rows ordered by property id, then by month.
immo.sort_values(['id_bien', 'mois']).head(n=20)

Unnamed: 0,id_bien,valeur,mois,valeur-1,valeur-2,valeur_diff-1-2
0,01004_1_4,0.0,3,215000.0,239000.0,-24000.0
4750,01004_1_4,166590.0,4,0.0,215000.0,-215000.0
9500,01004_1_4,0.0,5,166590.0,0.0,166590.0
14250,01004_1_4,100500.0,6,0.0,166590.0,-166590.0
19000,01004_1_4,150000.0,7,100500.0,0.0,100500.0
23750,01004_1_4,177893.75,8,150000.0,100500.0,49500.0
28500,01004_1_4,188550.0,9,177893.75,150000.0,27893.75
33250,01004_1_4,200000.0,10,188550.0,177893.75,10656.25
38000,01004_1_4,193416.666667,11,200000.0,188550.0,11450.0
42750,01004_1_4,102833.666667,12,193416.666667,200000.0,-6583.333333


In [5]:
# (number of rows, number of columns) of the loaded frame.
immo.shape

(218500, 6)

In [6]:
# First five rows, in file order.
immo.head(n=5)

Unnamed: 0,id_bien,valeur,mois,valeur-1,valeur-2,valeur_diff-1-2
0,01004_1_4,0.0,3,215000.0,239000.0,-24000.0
1,01004_1_5,130555.0,3,217791.75,276000.0,-58208.25
2,01004_2_2,150000.0,3,0.0,108212.5,-108212.5
3,01004_2_3,95975.0,3,167500.0,142000.0,25500.0
4,01033_2_2,84666.666667,3,189000.0,114575.0,74425.0


### Evaluating the result

In [7]:
def rmsle(ytrue, ypred):
    """Return the root mean squared logarithmic error of ``ypred`` against ``ytrue``.

    Delegates to ``mean_squared_log_error`` and takes the square root of
    its result.
    """
    msle = mean_squared_log_error(ytrue, ypred)
    return np.sqrt(msle)

### Setting a Baseline

In [8]:
from sklearn.metrics import mean_squared_log_error

In [9]:
# Baseline: use the previous observation ('valeur-1') as the prediction for
# each month in 30..44 and score it with rmsle.
# Note: the printed label says 'Week', but the loop index is the month (`mois`).
mean_error = []
for mois in range(30,45):
    # Nothing is fitted for the baseline, so only the validation month is
    # selected (the training slice was computed but never used).
    val = immo[immo['mois'] == mois]

    p = val['valeur-1'].values

    error = rmsle(val['valeur'].values, p)
    print('Week %d - Error %.5f' % (mois, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Week 30 - Error 3.20999
Week 31 - Error 2.53821
Week 32 - Error 3.12085
Week 33 - Error 3.20956
Week 34 - Error 2.89642
Week 35 - Error 3.44560
Week 36 - Error 3.22022
Week 37 - Error 2.99002
Week 38 - Error 3.43686
Week 39 - Error 2.89105
Week 40 - Error 2.68034
Week 41 - Error 3.09954
Week 42 - Error 2.64316
Week 43 - Error 1.99355
Week 44 - Error 2.84491
Mean Error = 2.94802


### Creating the Model

In [10]:
# Reminder of the available columns before building the model features.
immo.head(n=5)

Unnamed: 0,id_bien,valeur,mois,valeur-1,valeur-2,valeur_diff-1-2
0,01004_1_4,0.0,3,215000.0,239000.0,-24000.0
1,01004_1_5,130555.0,3,217791.75,276000.0,-58208.25
2,01004_2_2,150000.0,3,0.0,108212.5,-108212.5
3,01004_2_3,95975.0,3,167500.0,142000.0,25500.0
4,01033_2_2,84666.666667,3,189000.0,114575.0,74425.0


In [11]:
from sklearn.ensemble import RandomForestRegressor

# Walk-forward validation of a random forest: for each month in 30..44, fit on
# every earlier month and score the predictions for that month with rmsle.
# The target ('valeur') and the identifier ('id_bien') are excluded from the
# feature matrix.
mean_error = []
for mois in range(30, 45):
    train = immo.loc[immo['mois'] < mois]
    val = immo.loc[immo['mois'] == mois]

    xtr = train.drop(columns=['valeur', 'id_bien'])
    xts = val.drop(columns=['valeur', 'id_bien'])
    ytr = train['valeur'].values
    yts = val['valeur'].values

    mdl = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0)
    mdl.fit(xtr, ytr)
    p = mdl.predict(xts)

    error = rmsle(yts, p)
    mean_error.append(error)
    print('Week %d - Error %.5f' % (mois, error))
print('Mean Error = %.5f' % np.mean(mean_error))



Week 30 - Error 2.02722
Week 31 - Error 1.76774
Week 32 - Error 2.78303
Week 33 - Error 1.92791
Week 34 - Error 2.40824
Week 35 - Error 2.76484
Week 36 - Error 1.92475
Week 37 - Error 2.59465
Week 38 - Error 2.65752
Week 39 - Error 1.74889
Week 40 - Error 2.26068
Week 41 - Error 2.37024
Week 42 - Error 1.52298
Week 43 - Error 1.54798
Week 44 - Error 2.61270
Mean Error = 2.19463


In [None]:
from lightgbm import LGBMRegressor

# Walk-forward validation of a LightGBM regressor trained on log1p(target):
# for each month in 30..44, fit on every earlier month and score the
# back-transformed predictions for that month with rmsle.
mean_error = []
for mois in range(30,45):
    train = immo[immo['mois'] < mois]
    val = immo[immo['mois'] == mois]

    # The target ('valeur') and the identifier ('id_bien') are not features.
    xtr, xts = train.drop(['valeur','id_bien'], axis=1), val.drop(['valeur','id_bien'], axis=1)
    ytr, yts = train['valeur'].values, val['valeur'].values

    mdl = LGBMRegressor(n_estimators=1000, learning_rate=0.01)
    # Fit in log1p space so the training loss matches the log-based metric.
    mdl.fit(xtr, np.log1p(ytr))

    # Predictions in log space are not bounded below by 0; a negative one maps
    # to a negative value after expm1, which mean_squared_log_error rejects.
    # Clip at 0 so the metric is always defined.
    p = np.clip(np.expm1(mdl.predict(xts)), 0, None)

    error = rmsle(yts, p)
    print('Week %d - Error %.5f' % (mois, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))



Week 30 - Error 2.06031


In [356]:
# Remove the commune / property-type descriptor columns, keeping the identifier
# and the per-year columns.
# NOTE(review): `immo` here has a different schema from the frame loaded at the
# top of the notebook; the cell that loads it is not visible — confirm.
immo = immo.drop(
    columns=[
        'code_commune',
        'nom_commune',
        'type_local',
        'code_type_local',
        'nombre_pieces_principales',
    ]
)
immo.head()

Unnamed: 0,id_bien,nb_mutation_2014,nb_mutation_2015,nb_mutation_2016,nb_mutation_2017,surface_2014,surface_2015,surface_2016,surface_2017,valeur_2014,valeur_2015,valeur_2016,valeur_2017,pop_2014,pop_2015,pop_2016
0,10268_1_4,15,11,14,12,88,86,94,85,126139,131788,153442,133541,6099,6081,6092
1,10323_1_3,58,53,46,61,66,69,66,71,85131,70899,77799,69112,14617,14808,14783
2,10323_1_4,34,44,46,54,88,89,84,83,115841,100668,97592,111657,14617,14808,14783
3,10323_1_5,18,14,22,23,119,97,104,111,138056,101857,112113,116260,14617,14808,14783
4,10325_1_4,18,11,14,13,91,95,98,93,161920,171600,174245,182324,4178,4294,4409


In [357]:
# (rows, columns) after dropping the descriptor columns.
immo.shape

(5812, 16)

In [358]:
# Name prefixes of the column families to discard, and a snapshot of the
# current column names to match them against.
cols_to_remove = ["nb_mutation", "surface"]
cols = immo.columns.values

In [359]:
# Drop every column whose name starts with one of the unwanted prefixes.
# Matching against the live column index (instead of an earlier snapshot) and
# dropping in a single call keeps this cell safe to re-run and avoids copying
# the frame once per removed column.
unwanted_prefixes = tuple(cols_to_remove)  # str.startswith accepts a tuple
immo = immo.drop(
    columns=[col for col in immo.columns if col.startswith(unwanted_prefixes)]
)
immo.head()

Unnamed: 0,id_bien,valeur_2014,valeur_2015,valeur_2016,valeur_2017,pop_2014,pop_2015,pop_2016
0,10268_1_4,126139,131788,153442,133541,6099,6081,6092
1,10323_1_3,85131,70899,77799,69112,14617,14808,14783
2,10323_1_4,115841,100668,97592,111657,14617,14808,14783
3,10323_1_5,138056,101857,112113,116260,14617,14808,14783
4,10325_1_4,161920,171600,174245,182324,4178,4294,4409


In [360]:
# (rows, columns) after removing the nb_mutation_* and surface_* columns.
immo.shape

(5812, 8)

### Faire un dataset pour chaque variable

In [361]:
# Snapshot of the remaining column names, and the name prefixes of the
# variable families that each get their own dataset.
cols = immo.columns.values
cols_to_extract = ['valeur', 'pop']

In [362]:
# One sub-frame per variable family: the identifier column plus every column
# whose name starts with the family prefix.
datasets = {
    family: immo[['id_bien'] + [name for name in cols if name.startswith(family)]]
    for family in cols_to_extract
}
    

In [363]:
# Row count of the wide frame times three, derived from the data instead of a
# hard-coded 5812.
# NOTE(review): presumably the expected size of each long series once
# restricted to three years — confirm.
len(immo) * 3

17436

### Transformer chaque dataset en une série temporelle (une ligne = une année par produit)

In [364]:
# Reshape each wide sub-frame to long format: one row per (id_bien, source
# column), with the source column name in 'year' and its value in a column
# named after the family.
datasets = {
    feature_family: wide.melt(id_vars='id_bien', var_name='year', value_name=feature_family)
    for feature_family, wide in datasets.items()
}

In [365]:
# Check the long format: consecutive rows should belong to the same id_bien.
datasets['valeur'].sort_values('id_bien').head(n=5)

Unnamed: 0,id_bien,year,valeur
0,10268_1_4,valeur_2014,126139
5812,10268_1_4,valeur_2015,131788
11624,10268_1_4,valeur_2016,153442
17436,10268_1_4,valeur_2017,133541
1,10323_1_3,valeur_2014,85131


### Filtrer les années pour que chaque série soit disponible sur la même période (une série = une variable)

In [366]:
# Strip the '<family>_' prefix from the 'year' labels, then keep only the rows
# whose year is in the list below.
years = ['2014','2015','2016']
for feature_family in list(datasets):
    series = datasets[feature_family]
    series.loc[:, 'year'] = series['year'].replace(feature_family + '_', '', regex=True)
    datasets[feature_family] = series[series['year'].isin(years)]

In [367]:
# The 'year' column should now hold bare year labels.
datasets['valeur'].head(n=5)

Unnamed: 0,id_bien,year,valeur
0,10268_1_4,2014,126139
1,10323_1_3,2014,85131
2,10323_1_4,2014,115841
3,10323_1_5,2014,138056
4,10325_1_4,2014,161920


### Merger chaque série temporelle en un dataframe

In [418]:
# Shallow copy of the mapping, so entries can be removed from it without
# altering `datasets`.
cp_datasets = dict(datasets)

In [419]:
# Use the last-inserted series as the merge base; popitem also removes it from
# the copy, so it is not merged with itself afterwards.
_base_family, immo_serie = cp_datasets.popitem()

In [420]:
# Inner-join every remaining series onto the base on (id_bien, year).
for dataset in cp_datasets.values():
    immo_serie = immo_serie.merge(dataset, how='inner', on=['id_bien', 'year'])

In [421]:
# Merged frame: one row per (id_bien, year) with one column per variable.
immo_serie.head(n=5)

Unnamed: 0,id_bien,year,pop,valeur
0,10268_1_4,2014,6099,126139
1,10323_1_3,2014,14617,85131
2,10323_1_4,2014,14617,115841
3,10323_1_5,2014,14617,138056
4,10325_1_4,2014,4178,161920


### Feature engineering - ajout des variables observées à t-1 (lag1)

In [523]:
# Work on a copy ordered by year, then by property id, so that rows of a given
# id_bien appear in chronological order.
immo_serie2 = immo_serie.copy().sort_values(by=['year', 'id_bien'])

In [524]:
# Column names for the lag-1 features: '<variable>-1'.
lag1 = [name + "-1" for name in cols_to_extract]

In [525]:
# Previous row's value of each variable within the same id_bien (NaN for the
# first row of each group).
immo_serie2[lag1] = immo_serie2.groupby('id_bien')[cols_to_extract].shift(periods=1)

### Feature engineering - ajout des variables observées à t-2 (lag2)

In [526]:
# Column names for the lag-2 features: '<variable>-2'.
lag2 = [name + "-2" for name in cols_to_extract]

In [527]:
# Value two rows earlier within the same id_bien (NaN for the first two rows
# of each group).
immo_serie2[lag2] = immo_serie2.groupby('id_bien')[cols_to_extract].shift(periods=2)

### Feature engineering - ajout des différences entre t-1 et t-2 (diff1)

In [528]:
# Column names for the lag-difference features: '<variable>_diff-1-2'.
diff1 = [name + "_diff-1-2" for name in cols_to_extract]

In [529]:
# Row-to-row difference of the lag-1 columns within each id_bien, i.e. the
# lag-1 value minus the lag-1 value of the preceding row.
immo_serie2[diff1] = immo_serie2.groupby('id_bien')[lag1].diff(periods=1)

In [530]:
# Discard every row with at least one missing value (the shift/diff features
# are NaN on the earliest rows of each id_bien).
immo_serie2 = immo_serie2.dropna(axis=0, how='any')

In [532]:
# Inspect the engineered features for the first few properties.
immo_serie2.sort_values('id_bien').head(n=5)

Unnamed: 0,id_bien,year,pop,valeur,valeur-1,pop-1,valeur-2,pop-2,valeur_diff-1-2,pop_diff-1-2
11624,10268_1_4,2016,6092,153442,131788.0,6081.0,126139.0,6099.0,5649.0,-18.0
11625,10323_1_3,2016,14783,77799,70899.0,14808.0,85131.0,14617.0,-14232.0,191.0
11626,10323_1_4,2016,14783,97592,100668.0,14808.0,115841.0,14617.0,-15173.0,191.0
11627,10323_1_5,2016,14783,112113,101857.0,14808.0,138056.0,14617.0,-36199.0,191.0
11628,10325_1_4,2016,4409,174245,171600.0,4294.0,161920.0,4178.0,9680.0,116.0


In [533]:
# (rows, columns) of the final feature table.
immo_serie2.shape

(5812, 10)

### Export

In [534]:
# Write the feature table to CSV without the DataFrame index.
immo_serie2.to_csv(path_or_buf='../../data/ml/serie_immo_pop.csv', index=False)