# Lien vers le dataset :
https://www.kaggle.com/datasets/govindaramsriram/sleep-time-prediction

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import joblib


# Lecture et consultation

In [14]:
df = pd.read_csv('sleeptime_prediction_dataset.csv')

In [8]:
df.head()

Unnamed: 0,WorkoutTime,ReadingTime,PhoneTime,WorkHours,CaffeineIntake,RelaxationTime,SleepTime
0,1.12,0.52,3.29,7.89,216.08,0.75,3.45
1,2.85,0.49,4.22,5.03,206.18,0.67,4.88
2,2.2,1.81,4.04,9.23,28.73,0.35,3.61
3,1.8,0.5,1.62,7.68,276.77,1.21,4.94
4,0.47,0.54,1.6,4.94,170.54,0.95,5.5


In [9]:
df.isnull().sum()

WorkoutTime       0
ReadingTime       0
PhoneTime         0
WorkHours         0
CaffeineIntake    0
RelaxationTime    0
SleepTime         0
dtype: int64

In [10]:
df.describe()

Unnamed: 0,WorkoutTime,ReadingTime,PhoneTime,WorkHours,CaffeineIntake,RelaxationTime,SleepTime
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1.495915,0.992785,2.985195,6.926945,147.49378,1.010955,4.884375
std,0.876639,0.577303,1.151776,1.723843,84.651139,0.582619,2.028892
min,0.01,0.0,1.0,4.0,0.02,0.0,0.15
25%,0.71,0.5,1.99,5.44,75.98,0.51,3.84
50%,1.52,0.99,2.965,6.91,146.01,1.01,4.6
75%,2.25,1.5,3.96,8.4225,218.9025,1.53,5.47
max,3.0,2.0,5.0,10.0,299.85,2.0,19.81


# Training et sauvegard du meilleur model

In [21]:
# Define features and target variable
X = df.drop('SleepTime', axis=1)
y = df['SleepTime']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a list of models to evaluate
models = [
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('Random Forest', RandomForestRegressor()),
    ('Gradient Boosting', GradientBoostingRegressor()),
    ('Support Vector Machine', SVR()),
    ('Neural Network', MLPRegressor(max_iter=1000))
]

# Dictionary to store model predictions and metrics
model_performance = {}

best_r2 = -np.inf
best_model = None
best_model_name = ''

for name, model in models:
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Store the results
    model_performance[name] = {'MSE': mse, 'R2 Score': r2}
    
    # Check if this model has the best R² score
    if r2 > best_r2:
        best_r2 = r2
        best_model = model
        best_model_name = name
# Compare model performances
print("\nModel Performance Comparison:")
for model_name, metrics in model_performance.items():
    print(f"{model_name}: MSE = {metrics['MSE']:.4f}, R2 Score = {metrics['R2 Score']:.4f}")
# Save the best model to a file
if best_model is not None:
    print(f"\nSaving the best model: {best_model_name}")
    joblib.dump(best_model, 'best_model.joblib')
else:
    print("\nNo model was trained.")


Model Performance Comparison:
Linear Regression: MSE = 4.4978, R2 Score = 0.2055
Decision Tree: MSE = 7.7066, R2 Score = -0.3612
Random Forest: MSE = 5.0390, R2 Score = 0.1100
Gradient Boosting: MSE = 4.9377, R2 Score = 0.1278
Support Vector Machine: MSE = 5.6306, R2 Score = 0.0055
Neural Network: MSE = 4.7148, R2 Score = 0.1672

Saving the best model: Linear Regression
