In [82]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn import metrics
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score

#### Définition de nos variables cibles 

In [83]:
data = pd.read_csv("../data/flight_data.csv")

In [84]:
Y = data['Price']
X = data.drop(['Price'], axis=1)

#### Divisez les données en ensembles d’entraînement et de test

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2,random_state=70)

# Set MLflow

In [86]:
mlflow.set_tracking_uri("sqlite:///mlflowtest.db")
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Flight price prediction tracking")

2024/01/01 01:32:52 INFO mlflow.tracking.fluent: Experiment with name 'Flight price prediction tracking' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/514094643223700237', creation_time=1704069172749, experiment_id='514094643223700237', last_update_time=1704069172749, lifecycle_stage='active', name='Flight price prediction tracking', tags={}>

In [87]:
mlflow.end_run()

In [88]:
mlflow.start_run()
mlflow.log_param("test_size", 0.2)
mlflow.log_param("random_state", 42)

42

# Linear Regression

In [89]:
mlflow.end_run()

In [90]:
with mlflow.start_run(run_name='Linear regression model'):
    reg_model=LinearRegression().fit(X_train,y_train)
    y_pred_val =reg_model.predict(X_test)

    mse_val =mean_squared_error(y_test,y_pred_val)
    mae_val=metrics.mean_absolute_error(y_test,y_pred_val)
    rmse_val=np.sqrt(metrics.mean_absolute_error(y_test,y_pred_val))
    r2_val =r2_score(y_test,y_pred_val)
    
    val_metrics = {'r2_score': r2_val, 'mse':mse_val,'rmse':rmse_val,'mae':mae_val}
    mlflow.log_metrics(val_metrics)
    mlflow.sklearn.log_model(reg_model,"Linear regression model")
mlflow.end_run()
    



# Random Forest

In [91]:
with mlflow.start_run(run_name='Random Forest Regressor Model'):
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred_val_rf = rf.predict(X_test)

    mse_val_rf = mean_squared_error(y_test, y_pred_val_rf)
    mae_val_rf = metrics.mean_absolute_error(y_test, y_pred_val_rf)
    rmse_val_rf = np.sqrt(mean_squared_error(y_test, y_pred_val_rf))
    r2_val_rf = r2_score(y_test, y_pred_val_rf)

    val_metrics_rf = {'r2_score': r2_val_rf, 'mse': mse_val_rf, 'rmse': rmse_val_rf, 'mae': mae_val_rf}
    mlflow.log_metrics(val_metrics_rf)
    mlflow.sklearn.log_model(rf, "Random Forest Regressor Model")
mlflow.end_run()



# XG BOOST

In [92]:
with mlflow.start_run(run_name='XGBoost Regressor Model'):
    # Define hyperparameters
    xg_params = {
        'objective': 'reg:squarederror',
        'colsample_bytree': 0.3,
        'learning_rate': 0.1,
        'max_depth': 5,
        'alpha': 10,
        'n_estimators': 10
    }
    xg_reg = xgb.XGBRegressor(**xg_params) 

    xg_reg.fit(X_train, y_train)
    y_pred_val_xgb = xg_reg.predict(X_test)

    mse_val_xgb = mean_squared_error(y_test, y_pred_val_xgb)
    mae_val_xgb = metrics.mean_absolute_error(y_test, y_pred_val_xgb)
    rmse_val_xgb = np.sqrt(mean_squared_error(y_test, y_pred_val_xgb))
    r2_val_xgb = r2_score(y_test, y_pred_val_xgb)

    val_metrics_xgb = {'r2_score': r2_val_xgb, 'mse': mse_val_xgb, 'rmse': rmse_val_xgb, 'mae': mae_val_xgb}
    mlflow.log_metrics(val_metrics_xgb)
    mlflow.log_params(xg_params)
  
    mlflow.xgboost.log_model(xg_reg, "XGBoost Regressor Model")
    
mlflow.end_run()



# Decision Tree

In [93]:
with mlflow.start_run(run_name='Decision Tree Model'):
    dt_model = DecisionTreeRegressor()

    dt_model.fit(X_train, y_train)
    y_pred_val = dt_model.predict(X_test)


    mse_val =mean_squared_error(y_test,y_pred_val)
    mae_val=metrics.mean_absolute_error(y_test,y_pred_val)
    rmse_val=np.sqrt(metrics.mean_absolute_error(y_test,y_pred_val))
    r2_val =r2_score(y_test,y_pred_val)


    val_metrics = {'r2_score': r2_val, 'mse':mse_val,'rmse':rmse_val,'mae':mae_val}
    mlflow.log_metrics(val_metrics)
    mlflow.sklearn.log_model(dt_model,"Decision TreeRegressor model")
mlflow.end_run()



# Performing Hypermater tuning on Random Analysis

In [94]:
test_df = pd.DataFrame({
    "Predicted Price" : rf.predict(X_val),
    "Actual Price" : y_val,
}).reset_index(drop = True)
test_df

Unnamed: 0,Predicted Price,Actual Price
0,4364.225775,4310.67
1,1679.596100,1870.05
2,2431.993200,2511.60
3,1789.369855,1629.81
4,18320.987100,24182.34
...,...,...
453,2473.138265,2140.32
454,2288.075400,2921.10
455,9063.190500,8506.68
456,11337.874275,12538.89


In [95]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, scoring='r2', cv=5, verbose=1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

print("Meilleurs paramètres :", best_params)


Fitting 5 folds for each of 243 candidates, totalling 1215 fits


405 fits failed out of a total of 1215.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
405 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Dell\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Dell\anaconda3\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\Dell\anaconda3\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Dell\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameter

Meilleurs paramètres : {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [96]:
with mlflow.start_run(run_name='Random Forest Regressor Model'):
    rf_params={'max_depth': 100, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
    rf2 = RandomForestRegressor(**rf_params)
    rf2.fit(X_train, y_train)
    y_pred_val_rf = rf2.predict(X_test)

    mse_val_rf = mean_squared_error(y_test, y_pred_val_rf)
    mae_val_rf = metrics.mean_absolute_error(y_test, y_pred_val_rf)
    rmse_val_rf = np.sqrt(mean_squared_error(y_test, y_pred_val_rf))
    r2_val_rf = r2_score(y_test, y_pred_val_rf)

    val_metrics_rf = {'r2_score': r2_val_rf, 'mse': mse_val_rf, 'rmse': rmse_val_rf, 'mae': mae_val_rf}
    mlflow.log_metrics(val_metrics_rf)
    mlflow.sklearn.log_model(rf, "Random Forest Regressor Model")
mlflow.end_run()



In [97]:
test_df = pd.DataFrame({
    "Predicted Price" : rf.predict(X_test),
    "Actual Price" : y_test,
}).reset_index(drop = True)
test_df

Unnamed: 0,Predicted Price,Actual Price
0,11743.978155,10540.53
1,2112.579560,1665.30
2,4649.590400,4657.38
3,7819.027125,8159.97
4,1938.441505,1583.40
...,...,...
453,7809.710545,7753.20
454,11286.475200,11438.70
455,5122.408200,5211.57
456,1441.374935,1984.71


In [98]:
test_df = pd.DataFrame({
    "Predicted Price" : rf2.predict(X_test),
    "Actual Price" : y_test,
}).reset_index(drop = True)
test_df

Unnamed: 0,Predicted Price,Actual Price
0,11505.446972,10540.53
1,2022.549165,1665.30
2,4648.494608,4657.38
3,7604.645382,8159.97
4,2038.272037,1583.40
...,...,...
453,7618.748193,7753.20
454,10940.302100,11438.70
455,5417.364983,5211.57
456,1781.039563,1984.71


##### Hyperparameter tuning didn't improve the model accuracy