In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report , accuracy_score
from sklearn.model_selection import cross_val_score
from joblib import dump , load

In [17]:
# Load the raw profile dataset. The path is relative, so it is resolved against
# the kernel's current working directory.
data = pd.read_csv(filepath_or_buffer='../data/profile_data.csv')

In [18]:
# Translate the English column names of the raw dataset to the French names
# used by every later cell. Keeping the whole mapping in one dictionary makes
# the schema readable at a glance; rename() ignores keys that are not present
# in the frame, so unmatched entries are harmless.
column_translations = {
    'EducationSector': 'Education',
    'IndividualProject': 'Projet_Individuel',
    'Gender': 'Genre',
    'City': 'EnVille',
    'Influenced': 'Influence',
    'Perseverance': 'DegPerseverance',
    'SelfMotivation': 'DegMotivation',
    'DesireToTakeInitiative': 'DegInitiative',
    'Competitiveness': 'DegCompetition',
    'SelfReliance': 'DegAutonomie',
    'StrongNeedToAchieve': 'DegBesoinReussite',
    'SelfConfidence': 'DegConfiance',
    'GoodPhysicalHealth': 'DegSante',
    'MentalDisorder': 'TroubleMental',
    'KeyTraits': 'TraitsCles',
}
data.rename(columns=column_translations, inplace=True)

# 'ReasonsForLack' is discarded before any further processing.
data.drop(columns='ReasonsForLack', inplace=True)


In [19]:
# Sanity check: (rows, columns) of the frame after renaming and dropping 'ReasonsForLack'.
data.shape

(219, 16)

In [20]:
# English -> French value labels, grouped by the kind of value they translate.
yes_no_labels = {'Yes': 'Oui', 'No': 'Non'}
gender_labels = {'Male': 'Homme', 'Female': 'Femme'}
# Education-sector labels (plus the generic 'Others' entry).
sector_labels = {
    'Engineering Sciences': "Sciences de l'ingénierie",
    'Others': 'Autres',
    'Economic Sciences, Business Studies, Commerce and Law': 'Sciences économiques, études commerciales, commerce et droit',
    'Art, Music or Design': 'Art, musique ou design',
    'Humanities and Social Sciences': 'Sciences humaines et sociales',
    'Medicine, Health Sciences': 'Médecine, sciences de la santé',
    'Teaching Degree (e.g., B.Ed)': "Diplôme d'enseignement",
    'Mathematics or Natural Sciences': 'Mathématiques ou sciences naturelles',
    'Language and Cultural Studies': 'Études linguistiques et culturelles',
}

# Merge the groups into the single flat mapping used for the replacement.
replacement_dict = {**yes_no_labels, **gender_labels, **sector_labels}

# A flat dict passed to DataFrame.replace is applied to matching cell values
# in every column, so one call translates the whole frame.
data.replace(to_replace=replacement_dict, inplace=True)
data.head()


Unnamed: 0,Education,Projet_Individuel,Age,Genre,EnVille,Influence,DegPerseverance,DegInitiative,DegCompetition,DegAutonomie,DegBesoinReussite,DegConfiance,DegSante,TroubleMental,TraitsCles,y
0,Sciences de l'ingénierie,Non,19,Homme,Oui,Non,2,2,3,3,2,2,3,Oui,Passion,1
1,Sciences de l'ingénierie,Oui,22,Homme,Non,Oui,3,3,3,4,4,3,4,Oui,Vision,0
2,Sciences de l'ingénierie,Non,18,Homme,Oui,Non,3,4,3,3,3,4,4,Non,Passion,0
3,Sciences de l'ingénierie,Oui,20,Homme,Oui,Oui,3,3,3,3,4,3,3,Non,Resilience,0
4,Sciences de l'ingénierie,Oui,19,Homme,Oui,Oui,2,3,3,3,4,3,2,Oui,Vision,1


In [21]:
# Remove exact duplicate rows (keeping the first occurrence), then any row that
# still contains a missing value. The trailing shape shows how many rows survive.
data.drop_duplicates(keep='first', inplace=True)
data.dropna(axis=0, how='any', inplace=True)
data.shape

(219, 16)

In [22]:
# Persist the distinct (still human-readable) values of the two free-choice
# categorical columns; this runs before the columns are label-encoded.
education_values = data['Education'].unique()
traits_values = data['TraitsCles'].unique()

dump(education_values, './profile/education_valeurs.pkl')
dump(traits_values, './profile/traits_valeurs.pkl')

['./profile/traits_valeurs.pkl']

In [23]:
# One LabelEncoder per categorical text column. Each fitted encoder is saved
# under ./profile/<column>.pkl so the same string -> integer mapping can be
# reloaded when predicting on new samples.
encoders = {
    'Education': LabelEncoder(),
    'TraitsCles': LabelEncoder(),
    'Genre': LabelEncoder(),
    'EnVille': LabelEncoder(),
    'Projet_Individuel': LabelEncoder(),
    'Influence': LabelEncoder(),
    'TroubleMental': LabelEncoder()
}

# Guard against re-running this cell on an already-encoded frame. Because the
# loop below overwrites `data[col]` in place, a second run would fit the
# encoders on integer codes and overwrite the saved .pkl files with mappings
# that can no longer transform the original string labels. The check happens
# before anything is mutated or written, so a failed run leaves both the frame
# and the files on disk untouched.
already_encoded = [
    col for col in encoders if pd.api.types.is_numeric_dtype(data[col])
]
if already_encoded:
    raise RuntimeError(
        f"Columns {already_encoded} are already numeric; re-run the notebook "
        "from the data-loading cell before fitting and saving the encoders."
    )

for col, encoder in encoders.items():
    # fit_transform learns the label set and replaces the text by integer codes.
    data[col] = encoder.fit_transform(data[col])
    dump(encoder, f"./profile/{col}.pkl")

print("Encoders saved successfully!")
data.head()


Encoders saved successfully!


Unnamed: 0,Education,Projet_Individuel,Age,Genre,EnVille,Influence,DegPerseverance,DegInitiative,DegCompetition,DegAutonomie,DegBesoinReussite,DegConfiance,DegSante,TroubleMental,TraitsCles,y
0,5,0,19,1,1,0,2,2,3,3,2,2,3,1,0,1
1,5,1,22,1,0,1,3,3,3,4,4,3,4,1,3,0
2,5,0,18,1,1,0,3,4,3,3,3,4,4,0,0,0
3,5,1,20,1,1,1,3,3,3,3,4,3,3,0,2,0
4,5,1,19,1,1,1,2,3,3,3,4,3,2,1,3,1


In [24]:
# Target vector: the 'y' column; feature matrix: every other column.
y = data['y']
X = data.drop(columns=['y'])

In [25]:
# Persist the feature column labels (and their order) used for training.
dump(value=X.columns, filename='./profile/profile_colonnes.pkl')

['./profile/profile_colonnes.pkl']

In [26]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on `y`; with a dataset this small,
# consider `stratify=y` (this would change the reported scores).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Random forest with a fixed seed; the author's note records ~59% accuracy for
# this configuration.
# Alternative noted by the author (not run here, and KNeighborsClassifier is
# not imported): KNeighborsClassifier(n_neighbors=3), recorded as 63% accuracy
# with a 30% test split and random_state=42.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)

In [28]:
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)

# 10-fold cross-validation on the full dataset; cross_val_score fits clones of
# the estimator, so the already-fitted `model` is left unchanged.
cv_scores = cross_val_score(model, X, y, cv=10)

print("Random Forest Classifier")
print(holdout_accuracy)
print(holdout_report)
print(cv_scores.mean())

Random Forest Classifier
0.5909090909090909
              precision    recall  f1-score   support

           0       0.61      0.85      0.71        26
           1       0.50      0.22      0.31        18

    accuracy                           0.59        44
   macro avg       0.56      0.53      0.51        44
weighted avg       0.57      0.59      0.55        44

0.5670995670995671


In [29]:
# Persist the fitted classifier next to the encoders so it can be reloaded for
# inference.
dump(value=model, filename='./profile/profile_model.pkl')
print("Model saved")

Model saved


*Testing the saved model on two hand-written sample profiles*

In [30]:
# Smoke test of the persisted artifacts: reload the classifier, encode two
# hand-written profiles with the saved encoders, and predict their class.
model = load('./profile/profile_model.pkl')
print("Model loaded")

# NOTE(review): this rebinds `data`, replacing the training frame built in the
# earlier cells. Re-run the notebook from the top before re-executing any
# training cell after this one.
data = pd.DataFrame([
    {
        "Education": "Sciences de l'ingénierie",
        "Projet_Individuel": "Non",
        "Age": 19,
        "Genre": "Femme",
        "EnVille": "Oui",
        "Influence": "Non",
        "DegPerseverance": 2,
        "DegInitiative": 2,
        "DegCompetition": 3,
        "DegAutonomie": 3,
        "DegBesoinReussite": 2,
        "DegConfiance": 2,
        "DegSante": 3,
        "TroubleMental": "Oui",
        "TraitsCles": "Passion",
    },
    {
        "Education": "Sciences de l'ingénierie",
        "Projet_Individuel": "Oui",
        "Age": 22,
        "Genre": "Homme",
        "EnVille": "Non",
        "Influence": "Oui",
        "DegPerseverance": 3,
        "DegInitiative": 3,
        "DegCompetition": 3,
        "DegAutonomie": 4,
        "DegBesoinReussite": 4,
        "DegConfiance": 3,
        "DegSante": 4,
        "TroubleMental": "Oui",
        "TraitsCles": "Vision",
    }
])

# Text columns that were label-encoded during training; each one is converted
# with the encoder saved under ./profile/<column>.pkl.
columns_to_encode = ['Education', 'Projet_Individuel', 'Genre', 'EnVille',
                     'Influence', 'TroubleMental', 'TraitsCles']

for col in columns_to_encode:
    encoder = load(f"./profile/{col}.pkl")
    # transform() raises on a label the encoder never saw during fitting.
    data[col] = encoder.transform(data[col])

# Align the sample frame with the exact feature columns (and order) saved at
# training time instead of relying on the key order of the literals above.
# A missing feature surfaces here as a KeyError naming the column.
feature_columns = load('./profile/profile_colonnes.pkl')
data = data[list(feature_columns)]

prediction = model.predict(data)
print(prediction)


Model loaded
[1 0]
