In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from joblib import dump # for saving the model

In [2]:
data = pd.read_csv('../data/projet_faisabilite.csv')

In [3]:
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

In [4]:
def categorize_score(score):
    if score < 50:
        return 'Faible'
    elif 50 <= score < 75:
        return 'Moyenne'
    else:
        return 'Élevée'

data['Faisabilite'] = data['Score_faisabilite'].apply(categorize_score)
print(data[['Score_faisabilite', 'Faisabilite']].head())

   Score_faisabilite Faisabilite
0                 48      Faible
1                 69     Moyenne
2                 52     Moyenne
3                 92      Élevée
4                 56     Moyenne


In [5]:
data.drop('Score_faisabilite', axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,Budget_initial,Source_financement,Estimation_ROI,Taille_marche,Demande_actuelle,Nombre_concurrents,Avantage_concurrentiel,Impact_social,Impact_environnemental,Conformite_normes,Prevision_rentabilite,Capacite_fonds,Faisabilite
0,50k-100k,Prêt bancaire,1-3 ans,National,Modérée,Aucun,Qualité,Autre,Polluant,Non,<1 an,Difficile,Faible
1,<50k,Fonds propres,<1 an,International,Modérée,Aucun,Prix,Aucun,Polluant,Non,1-3 ans,Difficile,Moyenne
2,<50k,Investisseur,>3 ans,Régional,Forte,Aucun,Innovation,Autre,Écologique,Non,>3 ans,Difficile,Moyenne
3,<50k,Fonds propres,<1 an,Local,Modérée,Aucun,Innovation,Aucun,Écologique,Non,>3 ans,Facile,Élevée
4,>100k,Fonds propres,>3 ans,Local,Forte,>5,Aucun,Création emplois,Neutre,Oui,>3 ans,Facile,Moyenne


In [7]:
encoder = LabelEncoder()
for col in data.drop(['Faisabilite'] , axis=1).columns:
    if data[col].dtype == 'object':
        data[col] = encoder.fit_transform(data[col])

In [8]:
X= data.drop('Faisabilite', axis=1)
y= data['Faisabilite']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
model=RandomForestClassifier(
    n_estimators=10000,
    max_depth=10)
model.fit(X_train,y_train)

In [11]:
y_pred = model.predict(X_test)
print("RandomForestClassifier")
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

RandomForestClassifier
0.3520880220055014
              precision    recall  f1-score   support

      Faible       0.20      0.03      0.06      1087
     Moyenne       0.35      0.39      0.37      1448
      Élevée       0.36      0.55      0.44      1464

    accuracy                           0.35      3999
   macro avg       0.31      0.32      0.29      3999
weighted avg       0.32      0.35      0.31      3999



In [None]:
#dump(model, './faisabilite/faisabilite_model.pkl')
#print("Model saved")

Model saved
