In [70]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [48]:
Dataframe =  pd.read_csv("student-scores.csv")
Dataframe.shape

(2000, 17)

In [49]:
Dataframe.duplicated().sum()

0

In [50]:
Dataframe.isna().sum()


id                            0
first_name                    0
last_name                     0
email                         0
gender                        0
part_time_job                 0
absence_days                  0
extracurricular_activities    0
weekly_self_study_hours       0
career_aspiration             0
math_score                    0
history_score                 0
physics_score                 0
chemistry_score               0
biology_score                 0
english_score                 0
geography_score               0
dtype: int64

In [51]:
cat_cols = Dataframe.select_dtypes(include='object').columns
cat_cols

Index(['first_name', 'last_name', 'email', 'gender', 'career_aspiration'], dtype='object')

In [52]:
Dataframe=Dataframe.drop(columns=['first_name','last_name','email','id'])
Dataframe

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,male,False,2,False,30,Construction Engineer,83,77,84,73,75,84,82
1996,male,False,2,False,20,Software Engineer,89,65,73,80,87,67,73
1997,female,False,5,False,14,Software Engineer,97,85,63,93,68,94,78
1998,female,True,10,True,5,Business Owner,51,96,72,89,95,88,75


In [53]:
Dataframe['total_score'] = Dataframe['math_score'] + Dataframe['history_score'] + Dataframe['physics_score'] + Dataframe['chemistry_score'] + Dataframe['biology_score'] + Dataframe['english_score'] + Dataframe['geography_score']
Dataframe['average_score'] = Dataframe['total_score'] / 7
Dataframe.head()

Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87,574,82.0
1,female,False,2,False,47,Doctor,90,86,96,100,90,88,90,640,91.428571
2,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94,605,86.428571
3,female,False,5,False,3,Artist,71,74,88,80,89,63,86,551,78.714286
4,male,False,5,False,10,Unknown,84,77,65,65,80,74,76,521,74.428571


In [54]:
Dataframe['career_aspiration'].value_counts()


career_aspiration
Software Engineer        315
Business Owner           309
Unknown                  223
Banker                   169
Lawyer                   138
Accountant               126
Doctor                   119
Real Estate Developer     83
Stock Investor            73
Construction Engineer     68
Artist                    67
Game Developer            63
Government Officer        61
Teacher                   59
Designer                  56
Scientist                 39
Writer                    32
Name: count, dtype: int64

In [55]:
len(Dataframe['career_aspiration'].unique())

17

In [56]:
cols = ['gender','part_time_job','extracurricular_activities', 'career_aspiration']
for col in cols:
    Dataframe[col] = LabelEncoder().fit_transform(Dataframe[col])

In [57]:
Dataframe['career_aspiration'].value_counts()

career_aspiration
12    315
3     309
15    223
2     169
9     138
0     126
6     119
10     83
13     73
4      68
1      67
7      63
8      61
14     59
5      56
11     39
16     32
Name: count, dtype: int64

In [58]:
score_columns = ['absence_days','weekly_self_study_hours','math_score', 'history_score', 'physics_score', 'chemistry_score', 'biology_score', 'english_score', 'geography_score','total_score','average_score']
scaler = StandardScaler()
Dataframe[score_columns] = scaler.fit_transform(Dataframe[score_columns])
Dataframe.head()


Unnamed: 0,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score,total_score,average_score
0,1,0,-0.253175,0,0.762334,9,-0.790525,0.052463,0.930377,1.331147,-1.208673,-0.106245,0.525321,0.168795,0.168795
1,0,0,-0.633604,0,2.411605,6,0.49525,0.445147,1.169682,1.565986,0.759435,0.559086,0.783168,1.729633,1.729633
2,0,0,2.029397,1,-0.392155,8,-0.185454,1.309054,1.089913,1.252867,-1.062888,-0.355744,1.126964,0.901916,0.901916
3,0,0,0.507682,0,-1.216791,1,-0.941792,-0.497296,0.531536,0.000391,0.686542,-1.520075,0.439372,-0.375133,-0.375133
4,1,0,0.507682,0,-0.639546,15,0.041447,-0.261685,-1.303134,-1.173804,0.030506,-0.605244,-0.420119,-1.084605,-1.084605


In [59]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 98)

X = Dataframe.drop("career_aspiration", axis=1)
y = Dataframe['career_aspiration']

X_resampled ,  y_resampled = smote.fit_resample(X,y)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size= 0.2, random_state=98)

In [61]:
scaler=StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [62]:
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.8543417366946778

In [71]:
mse = mean_squared_error(y_test,y_pred)
mse

8.180205415499533

In [63]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.89      0.90        61
           1       0.85      0.90      0.88        70
           2       0.75      0.77      0.76        66
           3       0.92      0.90      0.91        67
           4       0.76      0.99      0.86        71
           5       0.96      0.79      0.87        68
           6       0.89      0.98      0.93        48
           7       0.84      0.94      0.89        70
           8       0.87      0.96      0.92        56
           9       0.79      0.84      0.81        67
          10       0.90      0.86      0.88        65
          11       0.92      0.96      0.94        70
          12       0.58      0.32      0.42        68
          13       0.93      0.80      0.86        54
          14       0.92      0.97      0.94        70
          15       0.71      0.66      0.68        41
          16       0.93      0.97      0.95        59

    accuracy              

In [64]:
print(confusion_matrix(y_test,y_pred))

[[54  0  2  0  0  0  0  0  0  1  0  0  3  0  0  1  0]
 [ 0 63  0  1  0  0  0  3  0  0  3  0  0  0  0  0  0]
 [ 0  0 51  0  3  0  0  0  1  5  0  0  4  0  1  1  0]
 [ 0  1  0 60  0  0  0  3  0  0  3  0  0  0  0  0  0]
 [ 0  0  0  0 70  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  1  1  1  0 54  0  2  4  0  0  0  1  0  2  2  0]
 [ 0  0  0  0  0  0 47  0  0  0  0  1  0  0  0  0  0]
 [ 0  4  0  0  0  0  0 66  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  2 54  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  1  0  0 56  0  0  3  2  0  3  2]
 [ 0  0  0  3  0  1  0  2  2  0 56  0  0  0  1  0  0]
 [ 0  0  0  0  2  0  0  0  0  1  0 67  0  0  0  0  0]
 [ 3  0 11  0 14  1  4  0  0  7  0  3 22  1  0  1  1]
 [ 1  0  1  0  1  0  1  1  0  0  0  0  2 43  1  3  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  0  0  0 68  0  1]
 [ 1  5  1  0  1  0  0  0  0  1  0  2  2  0  1 27  0]
 [ 0  0  1  0  1  0  0  0  0  0  0  0  0  0  0  0 57]]


In [65]:
import pickle

# pickle.dump(classifier, open('model.pkl','wb'))       #model.pkl is not required to be overridden again

In [66]:
model = pickle.load(open("model.pkl", 'rb'))


In [67]:
professions = [
    "Accountant",
    "Artist",
    "Banker",
    "Business Owner",
    "Construction Engineer",
    "Designer",
    "Doctor",
    "Game Developer",
    "Government Officer",
    "Lawyer",
    "Real Estate Developer",
    "Scientist",
    "Software Engineer",
    "Stock Investor",
    "Teacher",
    "Unknown",
    "Writer"
]

In [68]:
def recommendation(gender, part_time_job, absence_days, extracurricular_activities,
       weekly_self_study_hour, math_score,history_score, physics_score, chemistry_score, biology_score,
       english_score, geography_score, total_score, average_score):
    gender_encoded = 1 if gender.lower() =='male' else 0
    part_time_job_encoded = 1 if part_time_job else 0
    extracurricular_activities_encoded = 1 if extracurricular_activities else 0
    
    feature_array = np.array([[gender_encoded, part_time_job_encoded, absence_days, extracurricular_activities_encoded,
                               weekly_self_study_hour, math_score, history_score, physics_score, chemistry_score,
                               biology_score, english_score, geography_score, total_score, average_score]])
    
    probabilities = model.predict_proba(feature_array)
    top_classes_idx = np.argsort(-probabilities[0])[:5]
    top_classes_names_probs = [(professions[idx], probabilities[0][idx]) for idx in top_classes_idx]
    return top_classes_names_probs

In [69]:
gender = 'female'  
part_time_job = True
absence_days = 2
extracurricular_activities = False  
weekly_self_study_hour = 4
math_score = 67
history_score = 50
physics_score = 67
chemistry_score = 56
biology_score = 40
english_score = 80
geography_score = 0
total_score = 583
average_score = 83.2456

final_recommendation = recommendation(gender, part_time_job, absence_days, extracurricular_activities,
                                      weekly_self_study_hour, math_score, history_score, physics_score,
                                      chemistry_score, biology_score, english_score, geography_score,
                                      total_score, average_score)

# Print the top recommendations
print("Top Recommendations")
print("-" * 50)
for name, prob in final_recommendation:
    print(f"{name} with probability {prob*100}")

Top Recommendations
--------------------------------------------------
Doctor with probability 76.0
Writer with probability 4.0
Unknown with probability 4.0
Software Engineer with probability 3.0
Lawyer with probability 3.0
