In [66]:
import pandas as pd 
import numpy as np
from sklearn import linear_model
from scipy import stats 

X_train = pd.read_csv('../data/X_train_J01Z4CN.csv') # Adresse relative, on import le set d'entrainement
y_train = pd.read_csv('../data/y_train_OXxrJt1.csv') # y entrainement

# df_short = df[['price', 'size', 'nb_photos']]
# df_short = df_short.dropna()
# y = np.log(df_short['price'])
# X = df_short[['size', 'nb_photos']]

# On sélectionne supprime les variables ayant des données manquantes ou à recoder
X_train = X_train.drop(['approximate_latitude', 'energy_performance_category', 'ghg_value', 'ghg_category',
                       'exposition', 'approximate_longitude', 'nb_bathrooms', 'property_type', 'last_floor',
                       'city', 'postal_code','land_size','energy_performance_value','upper_floors', 'floor'], axis =1)

# On merge sur la variable id_annonce
X_train = X_train.merge(y_train, on='id_annonce')

# On supprime quelques lignes avec des NA
X_train = X_train.dropna()

# On retrouve y et X
y_train = X_train[['id_annonce', 'price']]
X = X_train.drop(['price', 'id_annonce'], axis = 1)

# Modèle de régression linéaire multiple
ols = linear_model.LinearRegression()
result = ols.fit(X, np.log(y_train['price']))
param = result.coef_
preditiction = ols.predict(X)

# Computation des statistiques pour une belle sortie car scikit learn ne le fait pas.
# Ce dernier est à but prédictif et pas forcément causal

XXinv = np.linalg.inv(np.dot(X.T,X))
residus = np.log(y_train['price']) - preditiction
MSE_ols = (np.sum(residus**2))
var_res = (np.sum(residus**2)) / (len(X)-11)
cov = XXinv*var_res 
var_b = np.diag(cov)
tstat = param / np.sqrt(var_b)
pval = [2*(1-stats.t.cdf(np.abs(i), (len(X)-11))) for i in tstat]

df = pd.DataFrame()
df['name'], df['Coef'], df['Err. Stand'], df['tstat'], df['pval'] = [X.columns, param, np.sqrt(var_b), tstat, pval]

# Ridge regression: on centre et on réduit 

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
ridge = make_pipeline(StandardScaler(with_mean = False), linear_model.Ridge())
ridge = linear_model.Ridge(alpha = 1000)
ridge.fit(X, np.log(y_train['price']))

# Lasso regression
lasso = make_pipeline(StandardScaler(with_mean = False), linear_model.Lasso())
lasso = linear_model.Lasso(alpha = 0.2)
lasso.fit(X, np.log(y_train['price']))
df['Coeff Ridge']= ridge.coef_
df['Coeff Lasso']= lasso.coef_
print(df)

# On compare les modèles: ATTENTION MSE Calculé sur l'échantillon qui a servi à entrainer 
# les données

MSE_lasso = np.log(y_train['price']) - lasso.predict(X)
MSE_lasso = (np.sum(MSE_lasso**2))

MSE_ridge = np.log(y_train['price']) - ridge.predict(X)
MSE_ridge = (np.sum(MSE_ridge**2))

print(MSE_lasso, MSE_ridge, MSE_ols)

                    name      Coef    Err. Stand      tstat          pval  \
0                   size  0.000005  7.427366e-07   6.693864  2.207012e-11   
1               nb_rooms  0.026706  2.631553e-03  10.148349  0.000000e+00   
2            nb_bedrooms  0.091222  3.777867e-03  24.146325  0.000000e+00   
3      nb_parking_places  0.120283  8.619013e-03  13.955577  0.000000e+00   
4               nb_boxes -0.096469  1.045402e-02  -9.227936  0.000000e+00   
5              nb_photos  0.018279  7.662334e-04  23.855239  0.000000e+00   
6          has_a_balcony  0.224402  1.099655e-02  20.406550  0.000000e+00   
7            nb_terraces  0.252560  8.867202e-03  28.482483  0.000000e+00   
8           has_a_cellar  0.127190  9.773539e-03  13.013665  0.000000e+00   
9           has_a_garage  0.034915  1.694010e-02   2.061115  3.929961e-02   
10  has_air_conditioning  0.129557  1.973603e-02   6.564505  5.295053e-11   

    Coeff Ridge  Coeff Lasso  
0      0.000005     0.000004  
1      0.0275

In [None]:
# x_test_datachallenge = pd.read_csv('x_test')
# y_pred = lasso.predict('x_test')