# Prédictions avec scikitlearn

In [98]:
from selenium import webdriver
from bs4 import BeautifulSoup as BS
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import re
import time
from serde import serialize, deserialize
from serde.json import to_json, from_json
import pandas as pd
from requests import get 
from dataclasses import dataclass
from pays import Countries

from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from rich import print
from rich.table import Table

import sklearn.metrics

In [62]:
df = pd.read_pickle("my_df.pkl")

In [63]:
df

Unnamed: 0,description,prix,lieux,duree,diff,theme
0,plaisirs neige &#224; sixt-fer-&#224;-cheval,690,metropole,8,Initié,multi-activités
1,haute clar&#233;e &quot;espace trappeur&quot;,860,metropole,7,Découverte,multi-activités
2,aventures hivernales sur le plateau de l&#39;a...,720,metropole,6,Découverte,multi-activités
3,neige et soleil du queyras en famille,730,metropole,7,Découverte,multi-activités
4,plaisirs neige dans les aravis,830,metropole,7,Découverte,multi-activités
...,...,...,...,...,...,...
383,plongée à la réunion,1540,dest_éloignée,10,Initié,multi-activités
384,randonnée dans le parc naturel de topes de col...,1690,dest_éloignée,14,Découverte,rando nature
385,"randonnée jusqu'au mont ryten, vue sur kvalvika",1930,dest_éloignée,15,Découverte,rando nature
386,drapée masai flottant dans les terres du rift,2350,dest_éloignée,8,Découverte,rando nature


In [64]:
df.shape

(388, 6)

In [65]:
X = df.drop(["description", "prix"], axis = 1)
y = df["prix"]

In [66]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y)

In [67]:
results = dict()

In [68]:
one_hot_encoder = Pipeline(
  steps=[
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
  ]
)

In [69]:
preprocessor = ColumnTransformer(
  transformers=[
    ('categorical', one_hot_encoder, ['lieux', 'diff', 'theme']),
  ]
)

## reg lin pure 

In [129]:
p = Pipeline(
  steps=[
    ('preprocessor', preprocessor),
    ('reg', LinearRegression())
  ]
)

p.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('categorical',
                                    Pipeline(steps=[('one_hot',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['lieux', 'diff', 'theme'])])),
  ('reg', LinearRegression())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('categorical',
                                  Pipeline(steps=[('one_hot',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  ['lieux', 'diff', 'theme'])]),
 'reg': LinearRegression(),
 'preprocessor__n_jobs': None,
 'preprocessor__remainder': 'drop',
 'preprocessor__sparse_threshold': 0.3,
 'preprocessor__transformer_weights': None,
 'preprocessor__transformers': [('categorical',
   Pipeline(steps=[('one_hot', OneHotEncoder(handle_unknown='ignore'))]),
   ['lieux', 'diff', 'theme'])],
 'pre

In [130]:
g = GridSearchCV(
    p,
    {
        "reg__fit_intercept": [True, False]
    },
   
)
g.fit(X_tr, y_tr)

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('one_hot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['lieux',
                                                                          'diff',
                                                                          'theme'])])),
                                       ('reg', LinearRegression())]),
             param_grid={'reg__fit_intercept': [True, False]})

In [131]:
results["Lineaire pure"] = g

## rf regressor

In [132]:
p = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("Reg", RandomForestRegressor())
    ]
)
p.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('categorical',
                                    Pipeline(steps=[('one_hot',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['lieux', 'diff', 'theme'])])),
  ('Reg', RandomForestRegressor())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('categorical',
                                  Pipeline(steps=[('one_hot',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  ['lieux', 'diff', 'theme'])]),
 'Reg': RandomForestRegressor(),
 'preprocessor__n_jobs': None,
 'preprocessor__remainder': 'drop',
 'preprocessor__sparse_threshold': 0.3,
 'preprocessor__transformer_weights': None,
 'preprocessor__transformers': [('categorical',
   Pipeline(steps=[('one_hot', OneHotEncoder(handle_unknown='ignore'))]),
   ['lieux', 'diff', 'theme'

In [133]:
g = GridSearchCV(
    p,
    {
        "Reg__n_estimators": range(50, 300, 20),
    },
    n_jobs=-1,
   
)
g.fit(X_tr, y_tr)


GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('one_hot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['lieux',
                                                                          'diff',
                                                                          'theme'])])),
                                       ('Reg', RandomForestRegressor())]),
             n_jobs=-1, param_grid={'Reg__n_estimators': range(50, 300, 20)})

In [134]:
results["random forest"] = g

## reseau de neurone

In [136]:
p = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("Reg", MLPRegressor())
    ]
)
p.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('categorical',
                                    Pipeline(steps=[('one_hot',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['lieux', 'diff', 'theme'])])),
  ('Reg', MLPRegressor())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('categorical',
                                  Pipeline(steps=[('one_hot',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  ['lieux', 'diff', 'theme'])]),
 'Reg': MLPRegressor(),
 'preprocessor__n_jobs': None,
 'preprocessor__remainder': 'drop',
 'preprocessor__sparse_threshold': 0.3,
 'preprocessor__transformer_weights': None,
 'preprocessor__transformers': [('categorical',
   Pipeline(steps=[('one_hot', OneHotEncoder(handle_unknown='ignore'))]),
   ['lieux', 'diff', 'theme'])],
 'preprocesso

In [137]:
g = GridSearchCV(
    p,
    {
        'Reg__hidden_layer_sizes': [(50,), (100,), (150,), (200,), (250,), (300,),],
        "Reg__max_iter": [5000],
        "Reg__activation": ["relu", "logistic"],
    },
    n_jobs=-1,
)
g.fit(X_tr, y_tr)

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('one_hot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['lieux',
                                                                          'diff',
                                                                          'theme'])])),
                                       ('Reg', MLPRegressor())]),
             n_jobs=-1,
             param_grid={'Reg__activation': ['relu', 'logistic'],
                         'Reg__hidden_layer_sizes': [(50,), (100,), (150,),
                                                     (200,), (250,), (300,)],
                         'Reg__max_iter': [5000]})

In [138]:
results["reseau neurone"]=g

## SVM

In [142]:
p = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("Reg", SVR())
    ]
)
p.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('categorical',
                                    Pipeline(steps=[('one_hot',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    ['lieux', 'diff', 'theme'])])),
  ('Reg', SVR())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('categorical',
                                  Pipeline(steps=[('one_hot',
                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                  ['lieux', 'diff', 'theme'])]),
 'Reg': SVR(),
 'preprocessor__n_jobs': None,
 'preprocessor__remainder': 'drop',
 'preprocessor__sparse_threshold': 0.3,
 'preprocessor__transformer_weights': None,
 'preprocessor__transformers': [('categorical',
   Pipeline(steps=[('one_hot', OneHotEncoder(handle_unknown='ignore'))]),
   ['lieux', 'diff', 'theme'])],
 'preprocessor__verbose': False

In [143]:
g = GridSearchCV(
    p,
    {
        "Reg__C": [ 0.1, 1., 10., 100,],
        "Reg__epsilon": [0.01, 0.1, 1., 10.]
    },
    n_jobs=-1,
)
g.fit(X_tr, y_tr)

GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('one_hot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['lieux',
                                                                          'diff',
                                                                          'theme'])])),
                                       ('Reg', SVR())]),
             n_jobs=-1,
             param_grid={'Reg__C': [0.1, 1.0, 10.0, 100],
                         'Reg__epsilon': [0.01, 0.1, 1.0, 10.0]})

In [144]:
results["SVR"] = g

## Bilan 

In [145]:
tbl = Table(
    title="Résumé des résultats de crossvalidation.",
    show_header=True,
)
tbl.add_column("Nom")
tbl.add_column("Score Cross validation")
tbl.add_column("Score entrainement")
tbl.add_column("Choix Hyperparamètres")
for nom, modele in results.items():
    tbl.add_row(
        nom, 
        f"{modele.best_score_:.2f}", 
        f"{modele.score(X_tr, y_tr):.2f}",
        str(modele.best_params_),
    )
    
print(tbl)

In [123]:
models_names = ["Random Forest", "Linear Regression", "ElasticNet"]

pipelines = [
  Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestRegressor())]),
  Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LinearRegression())]),
  Pipeline(steps=[('preprocessor', preprocessor), ('classifier', ElasticNet())])
]

In [124]:
for p, name in zip(pipelines, models_names):
  p.fit(X_tr, y_tr)
  y_pred = p.predict(X_te)
  print("Score ({}) : {:2.1f}".format(
    name,
    p.score(X_te, y_te)
  ))

  