In [48]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
# Read the source workbook into a DataFrame (point DATA_FILE at your own file path)
DATA_FILE = 'Rentabilité.xlsx'
df = pd.read_excel(DATA_FILE)

# Function to preprocess data and encode categorical features
def preprocess_data(df):
    """Build the feature matrix and target vector used by the regressors.

    The 'Ressources' column is one-hot encoded (first category dropped) and
    the resulting indicator columns are joined to the rest of the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Ressources', 'J/H Vendus',
        'Coût unitaire' and 'Couts'. The frame is not modified.

    Returns
    -------
    X : pandas.DataFrame
        'J/H Vendus', 'Coût unitaire' and every column whose name starts
        with 'Ressources_', indexed like ``df``.
    y : pandas.Series
        The 'Couts' column, indexed like ``df``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword; try the new name first and fall back for older installs.
    try:
        encoder = OneHotEncoder(sparse_output=False, drop='first')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, drop='first')

    resources_encoded = encoder.fit_transform(df[['Ressources']])

    # Reuse df's index: without it the encoded frame gets a fresh 0..n-1 index
    # and the column-wise concat below misaligns (adding NaN rows) whenever df
    # was filtered, sorted, or otherwise carries a non-default index.
    resources_encoded_df = pd.DataFrame(
        resources_encoded,
        columns=encoder.get_feature_names_out(['Ressources']),
        index=df.index,
    )

    # Swap the raw categorical column for its encoded indicator columns
    df_encoded = pd.concat([df.drop(columns=['Ressources']), resources_encoded_df], axis=1)

    resource_columns = list(df_encoded.columns[df_encoded.columns.str.startswith('Ressources_')])
    X = df_encoded[['J/H Vendus', 'Coût unitaire'] + resource_columns]
    y = df_encoded['Couts']
    return X, y

# Build the feature matrix X and the target y from the loaded frame
X, y = preprocess_data(df)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Linear Regression: fit on the training split, score on the held-out split
model = LinearRegression().fit(X_train, y_train)  # fit() returns the estimator itself
ln_y_pred = model.predict(X_test)
ln_r2 = r2_score(y_true=y_test, y_pred=ln_y_pred)
print("Linear Regression R-squared:", ln_r2)
# Train K-Nearest Neighbors model (5 neighbors)
# KNeighborsRegressor lives in sklearn.neighbors and has to be imported with the
# other estimators at the top of the notebook, otherwise this cell raises NameError.
# NOTE(review): the features are passed unscaled; KNN distances are scale-sensitive,
# so standardizing the inputs may be worth trying.
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)
knn_y_pred = knn_model.predict(X_test)
knn_r2 = r2_score(y_test, knn_y_pred)
print("KNN R-squared:", knn_r2)

# Train Gradient Boosting model with explicit hyperparameters
# Kept under gb_tuned_* names: the default-parameter Gradient Boosting run further
# down assigns gb_model / gb_y_pred / gb_r2, which previously overwrote this model
# and its score before they could be compared or saved.
gb_tuned_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb_tuned_model.fit(X_train, y_train)
gb_tuned_y_pred = gb_tuned_model.predict(X_test)
gb_tuned_r2 = r2_score(y_test, gb_tuned_y_pred)
print("Gradient Boosting Regressor R-squared:", gb_tuned_r2)

# Train Decision Tree model
# Seeded so the reported score is repeatable from run to run (same seed as the split).
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
dt_y_pred = dt_model.predict(X_test)
dt_r2 = r2_score(y_test, dt_y_pred)
print("Decision Tree R-squared:", dt_r2)

# Train Random Forest model
# Seeded so the bootstrap samples and feature draws, and therefore the score,
# are repeatable from run to run (same seed as the split).
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, rf_y_pred)
print("Random Forest R-squared:", rf_r2)

# Train Gradient Boosting model (default hyperparameters)
# This is the estimator that the later cell pickles to 'cost_model.pkl', so it is
# seeded to make both the score and the saved artifact repeatable.
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)
gb_r2 = r2_score(y_test, gb_y_pred)
print("Gradient Boosting R-squared:", gb_r2)

# Predict on the test set with the Gradient Boosting model (gb_model).
# The prediction, the score and the comparison table now all come from the same
# estimator; before, y_pred / r2 were computed from the linear model while the
# printed score and the table used the Gradient Boosting results.
y_pred = gb_model.predict(X_test)

# Evaluate model
r2 = r2_score(y_test, y_pred)
print("R-squared:", r2)

# Compare actual vs predicted values
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions_df.head())




Linear Regression R-squared: 0.8020551957756918
KNN R-squared: 0.7896027172196549
Gradient Boosting Regressor R-squared: 0.9379723973501641
Decision Tree R-squared: 0.9324596507419114
Random Forest R-squared: 0.9043528949650326
Gradient Boosting R-squared: 0.9275831545470008
R-squared: 0.9275831545470008
     Actual     Predicted
19    12000  14968.662638
45     2850   3282.580719
139    3700   3060.691151
30    30000  29133.885912
67     3000   2585.294938


In [49]:
import joblib

def save_model(model, filename):
    """Serialize ``model`` to ``filename`` with ``joblib.dump``.

    Parameters
    ----------
    model : object
        The object to persist (here, a fitted estimator).
    filename : str
        Destination path handed to ``joblib.dump``.

    Returns
    -------
    None
        The value returned by ``joblib.dump`` is discarded.
    """
    joblib.dump(value=model, filename=filename)

# Persist the Gradient Boosting model held in gb_model to disk
save_model(model=gb_model, filename='cost_model.pkl')

In [50]:
# Fit a standalone one-hot encoder on the 'Ressources' column (same settings as in
# preprocess_data: dense output, first category dropped) and persist it.
# NOTE(review): presumably saved so new inputs can be encoded the same way when
# cost_model.pkl is loaded elsewhere — confirm against the consumer.
#
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword; try the new name first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')
resources_encoded = encoder.fit_transform(df[['Ressources']])
joblib.dump(encoder, 'encoder.pkl')



['encoder.pkl']