<a href="https://colab.research.google.com/github/celiamarrakchi/Predicting-Length-of-Stay-for-Hospitalized-Patients/blob/main/Regression_Polynomiale.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Chargement des données + Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Patient data with the admission merge fixed, plus generated extra rows (1000 rows).
# Load the data.
df = pd.read_csv('expanded_df.csv')

# Count every distinct value found in the column.
compte_valeurs = df['language_encoded'].value_counts()

# Report only how many entries are exactly 0.0.
print(f"Nombre de valeurs égales à 0.0000 : {compte_valeurs.get(0.0000, 0)}")

# Minimum and maximum of the 'language_encoded' column.
valeur_min = df['language_encoded'].min()
valeur_max = df['language_encoded'].max()

print(f"Valeur minimale : {valeur_min}")
print(f"Valeur maximale : {valeur_max}")
print(df['language_encoded'].describe())

# Seeded generator: the replacement below is random, so without a fixed seed
# df2 (and every metric computed from it) would differ on each run.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Work on a copy so the freshly loaded frame `df` stays untouched.
df2 = df.copy()

# 1. Locate the rows whose value is exactly 0.0.
indices_zero = df2[df2['language_encoded'] == 0.0].index

# 2. Randomly pick 50% of those rows (integer division rounds down).
nombre_a_remplacer = len(indices_zero) // 2
indices_a_remplacer = rng.choice(
    indices_zero.to_numpy(), size=nombre_a_remplacer, replace=False
)

# 3. Overwrite the picked rows with uniform random numbers drawn from [0, 1).
df2.loc[indices_a_remplacer, 'language_encoded'] = rng.uniform(
    0, 1, size=nombre_a_remplacer
)

# Show the column statistics after the replacement.
print("\nStatistiques après remplacement :")
print(df2['language_encoded'].describe())


Nombre de valeurs égales à 0.0000 : 446
Valeur minimale : 0.0
Valeur maximale : 1.0
count    1000.000000
mean        0.171935
std         0.360585
min         0.000000
25%         0.000000
50%         0.001961
75%         0.013519
max         1.000000
Name: language_encoded, dtype: float64

Statistiques après remplacement :
count    1000.000000
mean        0.291054
std         0.393633
min         0.000000
25%         0.000902
50%         0.010654
75%         0.675529
max         1.000000
Name: language_encoded, dtype: float64


# Suppression des colonnes inutiles

In [None]:
# Remove the columns this notebook treats as unneeded for the model.
# errors='ignore' keeps the cell re-runnable: df2 is overwritten with its own
# reduced version, so a second execution would otherwise raise KeyError
# because the three columns are already gone.
df2 = df2.drop(
    columns=['expire_flag', 'hospital_expire_flag', 'age_at_death'],
    errors='ignore',
)
df2

Unnamed: 0,admission_location,discharge_location,insurance,religion,marital_status,ethnicity,diagnosis,has_chartevents_data,gender,length_of_stay,duration_ed,admission_type_encoded,language_encoded
0,0.250000,0.333333,0.333333,0.111111,0.400000,0.250000,0.776596,1.000000,0.000000,0.070993,0.264134,0.000000,0.692810
1,0.750000,0.000000,0.000000,0.111111,0.600000,0.875000,0.382979,1.000000,0.000000,0.111450,0.234982,0.000000,0.000000
2,0.750000,0.000000,0.333333,0.111111,0.200000,0.875000,0.776596,1.000000,0.000000,0.021078,0.234982,0.000000,0.010954
3,0.250000,1.000000,0.333333,0.111111,0.000000,1.000000,0.393617,1.000000,0.000000,0.064701,0.334806,0.000000,0.000000
4,0.750000,0.000000,0.333333,0.111111,0.000000,1.000000,0.085106,1.000000,1.000000,0.004830,0.234982,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.000046,0.996685,0.323491,0.106411,0.206175,0.483368,0.342598,0.999184,1.000000,0.042721,0.227673,0.010398,0.985581
996,0.234546,0.000000,0.329980,0.538014,0.211517,0.123983,0.515506,1.000000,0.994168,0.072594,0.347371,0.002087,0.260230
997,0.756543,0.009055,0.334361,0.107944,0.205304,0.988945,0.045658,0.992796,0.988389,0.038998,0.225740,0.000000,0.009116
998,0.228297,0.000000,0.326150,1.000000,1.000000,0.998987,0.271241,1.000000,0.999052,0.012916,0.094105,0.005085,0.871065


# Calcul et affichage de R²

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import r2_score

# Target (y) is the 'length_of_stay' column; the predictors (X) are all the others.
y = df2['length_of_stay']
X = df2.drop(columns=['length_of_stay'])

# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameter grid, and the dict that collects one R² entry per model.
degrees = [3]
alphas = [0.0001, 0.001, 0.01]
results_r2 = dict()

def evaluate_r2(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score it with R² on both splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place by this call.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        A ``(train_r2, test_r2)`` tuple.
    """
    model.fit(X_train, y_train)

    # Score the training split first, then the test split.
    splits = ((X_train, y_train), (X_test, y_test))
    return tuple(
        r2_score(targets, model.predict(features))
        for features, targets in splits
    )

# Iteration cap for Lasso's coordinate descent. The recorded output of this
# cell shows scikit-learn's coordinate-descent warning, i.e. the solver hit the
# previous cap (5000) before converging, so the cap is raised here.
LASSO_MAX_ITER = 50_000

# Compute R² for Ridge and Lasso over every (degree, alpha) pair of the grid.
for degree in degrees:
    # Expand the features once per degree; the transformer is fitted on the
    # training split only and then applied to the test split.
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    for alpha in alphas:
        ridge_poly = Ridge(alpha=alpha)
        lasso_poly = Lasso(alpha=alpha, max_iter=LASSO_MAX_ITER)

        # Same evaluation for both estimators; Ridge is recorded before Lasso
        # so the row order of the results table is unchanged.
        for name, model in (('Ridge', ridge_poly), ('Lasso', lasso_poly)):
            train_r2, test_r2 = evaluate_r2(model, X_train_poly, X_test_poly, y_train, y_test)
            results_r2[f'{name} (deg={degree}, alpha={alpha})'] = {'Train R²': train_r2, 'Test R²': test_r2}

# Display the R² results, one row per model configuration.
results_r2_df = pd.DataFrame(results_r2).T
print("\nPerformance des modèles (R²) :")
print(results_r2_df)


  model = cd_fast.enet_coordinate_descent(



Performance des modèles (R²) :
                             Train R²   Test R²
Ridge (deg=3, alpha=0.0001)  0.979385  0.931933
Lasso (deg=3, alpha=0.0001)  0.808933  0.717174
Ridge (deg=3, alpha=0.001)   0.973874  0.939050
Lasso (deg=3, alpha=0.001)   0.315516  0.271019
Ridge (deg=3, alpha=0.01)    0.964802  0.932223
Lasso (deg=3, alpha=0.01)    0.000000 -0.006134


# Calcul et Affichage de RMSE

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

# Split the frame into predictors (everything except 'length_of_stay') and target.
X, y = df2.drop(columns=['length_of_stay']), df2['length_of_stay']

# 80/20 train/test split with a fixed seed.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
del split_options  # temporary helper only; keep the notebook namespace unchanged

# Grid to explore and the dict that collects one RMSE entry per model.
alphas, degrees = [0.0001, 0.001, 0.01], [3]
results_rmse = {}

def evaluate_rmse(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and compute the RMSE on both splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place by this call.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        A ``(train_rmse, test_rmse)`` tuple, each the square root of the mean
        squared error on the corresponding split.
    """
    model.fit(X_train, y_train)

    def _rmse(features, targets):
        # Root of the mean squared error between targets and predictions.
        return np.sqrt(mean_squared_error(targets, model.predict(features)))

    rmse_on_train = _rmse(X_train, y_train)
    rmse_on_test = _rmse(X_test, y_test)
    return rmse_on_train, rmse_on_test

# Iteration cap for Lasso's coordinate descent. The recorded output of this
# cell shows scikit-learn's coordinate-descent warning, i.e. the solver hit the
# previous cap (5000) before converging, so the cap is raised here.
LASSO_MAX_ITER = 50_000

# Compute the RMSE for Ridge and Lasso over every (degree, alpha) pair of the grid.
for degree in degrees:
    # Expand the features once per degree; the transformer is fitted on the
    # training split only and then applied to the test split.
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    for alpha in alphas:
        ridge_poly = Ridge(alpha=alpha)
        lasso_poly = Lasso(alpha=alpha, max_iter=LASSO_MAX_ITER)

        # Same evaluation for both estimators; Ridge is recorded before Lasso
        # so the row order of the results table is unchanged.
        for name, model in (('Ridge', ridge_poly), ('Lasso', lasso_poly)):
            train_rmse, test_rmse = evaluate_rmse(model, X_train_poly, X_test_poly, y_train, y_test)
            results_rmse[f'{name} (deg={degree}, alpha={alpha})'] = {'Train RMSE': train_rmse, 'Test RMSE': test_rmse}

# Display the RMSE results, one row per model configuration.
results_rmse_df = pd.DataFrame(results_rmse).T
print("\nPerformance des modèles (RMSE) :")
print(results_rmse_df)


  model = cd_fast.enet_coordinate_descent(



Performance des modèles (RMSE) :
                             Train RMSE  Test RMSE
Ridge (deg=3, alpha=0.0001)    0.014515   0.024848
Lasso (deg=3, alpha=0.0001)    0.044191   0.050651
Ridge (deg=3, alpha=0.001)     0.016341   0.023513
Lasso (deg=3, alpha=0.001)     0.083641   0.081318
Ridge (deg=3, alpha=0.01)      0.018967   0.024795
Lasso (deg=3, alpha=0.01)      0.101097   0.095534


# Conclusion

Meilleur Modèle : Ridge (deg=3, alpha=0.001)
Train R² : 0.9739
Test R² : 0.9391
Pourquoi ?
Test R² élevé : Ce modèle a la meilleure performance sur l'ensemble de test (0.9391), ce qui indique une bonne capacité de généralisation.
Train R² équilibré : Le Train R² n'est pas trop éloigné du Test R², ce qui montre un modèle bien régularisé, évitant le sur-apprentissage.