# Online Courses Recommendation Model

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

In [2]:
df = pd.read_excel("Online Course Preferences (Responses).xlsx")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 13 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   Highest Qualification                  102 non-null    object
 1   Field/Background                       102 non-null    object
 2   Reason for taking an online course     102 non-null    object
 3   Preferred Mode of Course               102 non-null    object
 4   Preferred Course Duration (in hours)   102 non-null    object
 5   Preferred Price (in Rs)                102 non-null    object
 6   Preferred Course Level                 102 non-null    object
 7   Preferred Course Type                  102 non-null    object
 8   Preferred Certification Type           102 non-null    object
 9   Preferred Mentorship/ 1:1 Interaction  102 non-null    object
 10  Preferred Validity of Course Content   102 non-null    object
 11  Preferred Course Tr

In [4]:
df.columns

Index(['Highest Qualification', 'Field/Background',
       'Reason for taking an online course', 'Preferred Mode of Course',
       'Preferred Course Duration (in hours)', 'Preferred Price (in Rs)',
       'Preferred Course Level', 'Preferred Course Type',
       'Preferred Certification Type', 'Preferred Mentorship/ 1:1 Interaction',
       'Preferred Validity of Course Content',
       'Preferred Course Training Type', 'Which type of courses do you like?'],
      dtype='object')

In [5]:
df['Preferred Price (in Rs)'].unique()

array(['500 to 1500', '3000 to 5000', '< 500', '1500 to 3000',
       '5000 to 10000', '10000+'], dtype=object)

In [6]:
from sklearn.utils import resample
from sklearn.utils import shuffle

df_s1=df[df['Preferred Price (in Rs)']=='500 to 1500']
df_s2=df[df['Preferred Price (in Rs)']=='3000 to 5000']
df_s3=df[df['Preferred Price (in Rs)']=='< 500']
df_s4=df[df['Preferred Price (in Rs)']=='1500 to 3000']
df_s5=df[df['Preferred Price (in Rs)']=='5000 to 10000']
df_s6=df[df['Preferred Price (in Rs)']=='10000+']
df_s1_upsampled=resample(df_s1,replace=True, n_samples=50)
df_s2_upsampled=resample(df_s2,replace=True, n_samples=50)
df_s3_upsampled=resample(df_s3,replace=True, n_samples=50)
df_s4_upsampled=resample(df_s4,replace=True, n_samples=50)
df_s5_upsampled=resample(df_s5,replace=True, n_samples=50)
df_s6_upsampled=resample(df_s6,replace=True, n_samples=50)

df_s_combined=pd.concat([df_s1_upsampled,df_s2_upsampled,df_s3_upsampled,df_s4_upsampled,df_s5_upsampled,df_s6_upsampled])

df_new=shuffle(df_s_combined)

df_new.shape

(300, 13)

In [9]:
X = df_new.drop(columns=['Highest Qualification', 'Field/Background','Preferred Price (in Rs)'])

In [10]:
y=df_new['Preferred Price (in Rs)']

In [11]:
X

Unnamed: 0,Reason for taking an online course,Preferred Mode of Course,Preferred Course Duration (in hours),Preferred Course Level,Preferred Course Type,Preferred Certification Type,Preferred Mentorship/ 1:1 Interaction,Preferred Validity of Course Content,Preferred Course Training Type,Which type of courses do you like?
17,Placement,Offline,6 to 15 hours,Beginner,Course + Internship,Certified Professional Course,Yes,Lifetime,Live + Recorded,Paid course with free demo sessions
44,"Certificate, Interest in subject, Need for skill",Online,26 to 35 hours,Intermediate,Course + Capstone Project,Certified Professional Course,Yes,Limited Time,Live + Recorded,Paid course with free demo sessions
100,"Internship, Interest in subject, Need for skill",Online,6 to 15 hours,Beginner,Course + Capstone Project + Internship,Certified Professional Course,Yes,Lifetime,Live + Recorded,Paid course with free demo sessions
17,Placement,Offline,6 to 15 hours,Beginner,Course + Internship,Certified Professional Course,Yes,Lifetime,Live + Recorded,Paid course with free demo sessions
30,Need for skill,Online,26 to 35 hours,Intermediate,Course + Internship,Certified Professional Course,Yes,Lifetime,Live + Recorded,Paid course with free demo sessions
...,...,...,...,...,...,...,...,...,...,...
26,"Interest in subject, Need for skill",Online,26 to 35 hours,Beginner,Only Course,Course Completion Certification,No,Limited Time,Recorded,Paid course with free demo sessions
99,"CV Point, Interest in subject, Need for skill",Online,35+ hours,Intermediate,Course + Capstone Project,Certified Professional Course,No,Lifetime,Recorded,Paid course with free demo sessions
51,"Certificate, CV Point, Internship, Need for skill",Online,6 to 15 hours,Beginner,Course + Capstone Project + Internship,Certified Professional Course,Yes,Lifetime,Live,Paid course with free demo sessions
17,Placement,Offline,6 to 15 hours,Beginner,Course + Internship,Certified Professional Course,Yes,Lifetime,Live + Recorded,Paid course with free demo sessions


In [12]:
y

17           10000+
44      500 to 1500
100    1500 to 3000
17           10000+
30      500 to 1500
           ...     
26           10000+
99            < 500
51      500 to 1500
17           10000+
72            < 500
Name: Preferred Price (in Rs), Length: 300, dtype: object

In [13]:
X_1 = pd.get_dummies(X)
X_1.shape

(300, 69)

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=0.20, random_state = 10)

X_train.shape,X_test.shape,y_train.shape,y_test.shape

((240, 69), (60, 69), (240,), (60,))

## ML Model 1: Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score,roc_curve,roc_auc_score
from sklearn.metrics import classification_report

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
lr_acc_score = accuracy_score(y_test, lr_predict)
print("Confusion matrix:")
print("\n")
print(lr_conf_matrix)
print("-------------------------------------------")
print("\n")
print("Accuracy of Logistic Regression:",lr_acc_score*100,'\n')
print("-------------------------------------------")
print("\n")
print(classification_report(y_test,lr_predict))

Confusion matrix:


[[13  0  0  0  0  0]
 [ 0  9  0  0  0  0]
 [ 0  0  9  0  0  0]
 [ 0  1  2  6  0  0]
 [ 0  0  0  0 10  0]
 [ 0  1  1  1  0  7]]
-------------------------------------------


Accuracy of Logistic Regression: 90.0 

-------------------------------------------


               precision    recall  f1-score   support

       10000+       1.00      1.00      1.00        13
 1500 to 3000       0.82      1.00      0.90         9
 3000 to 5000       0.75      1.00      0.86         9
  500 to 1500       0.86      0.67      0.75         9
5000 to 10000       1.00      1.00      1.00        10
        < 500       1.00      0.70      0.82        10

     accuracy                           0.90        60
    macro avg       0.90      0.89      0.89        60
 weighted avg       0.91      0.90      0.90        60



## ML Model 2: Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier

tree_mob=DecisionTreeClassifier()

tree_mob=tree_mob.fit(X_train,y_train)

In [17]:
#Prediction of values using Decision Tree

y_pred=tree_mob.predict(X_test)

y_pred

array(['10000+', '5000 to 10000', '5000 to 10000', '1500 to 3000',
       '3000 to 5000', '5000 to 10000', '500 to 1500', '< 500',
       '1500 to 3000', '< 500', '10000+', '10000+', '3000 to 5000',
       '1500 to 3000', '3000 to 5000', '< 500', '1500 to 3000', '10000+',
       '3000 to 5000', '500 to 1500', '1500 to 3000', '5000 to 10000',
       '< 500', '5000 to 10000', '< 500', '10000+', '5000 to 10000',
       '1500 to 3000', '10000+', '< 500', '1500 to 3000', '5000 to 10000',
       '500 to 1500', '< 500', '5000 to 10000', '10000+', '500 to 1500',
       '500 to 1500', '10000+', '3000 to 5000', '3000 to 5000',
       '3000 to 5000', '< 500', '1500 to 3000', '5000 to 10000', '10000+',
       '500 to 1500', '1500 to 3000', '1500 to 3000', '< 500', '10000+',
       '10000+', '5000 to 10000', '3000 to 5000', '500 to 1500', '< 500',
       '10000+', '10000+', '3000 to 5000', '1500 to 3000'], dtype=object)

In [18]:
tree_mob_report=classification_report(y_test,y_pred)
tree_mob_acc_score = accuracy_score(y_test, y_pred)

print(' The classification report of Decision Tree:\n',tree_mob_report)
print("-------------------------------------------")
print("\n")
print(' The accuracy of the model is: ',tree_mob_acc_score*100,'\n')

 The classification report of Decision Tree:
                precision    recall  f1-score   support

       10000+       1.00      1.00      1.00        13
 1500 to 3000       0.82      1.00      0.90         9
 3000 to 5000       1.00      1.00      1.00         9
  500 to 1500       1.00      0.78      0.88         9
5000 to 10000       1.00      1.00      1.00        10
        < 500       0.90      0.90      0.90        10

     accuracy                           0.95        60
    macro avg       0.95      0.95      0.95        60
 weighted avg       0.96      0.95      0.95        60

-------------------------------------------


 The accuracy of the model is:  95.0 



## ML Model 3: Random Forest

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix,roc_auc_score

rf=RandomForestClassifier(random_state=10)

rf_gs=GridSearchCV(rf,{'n_estimators':range(105,110),'criterion':['gini'],'max_depth':range(3,12)})

rf_gs.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=10),
             param_grid={'criterion': ['gini'], 'max_depth': range(3, 12),
                         'n_estimators': range(105, 110)})

In [20]:
rf_report=classification_report(y_test,rf_gs.predict(X_test))

rf_cm=confusion_matrix(y_test,rf_gs.predict(X_test))

rf_acc_score = accuracy_score(y_test, rf_gs.predict(X_test))

print(' RF- New Classification report:\n',rf_report)
print(' RF - New Confusion Matrix:\n', rf_cm)
print("-------------------------------------------")
print("\n")
print("Accuracy of Random Forest Classifier after Grid Search CV:",rf_acc_score*100,'\n')
print("-------------------------------------------")

 RF- New Classification report:
                precision    recall  f1-score   support

       10000+       1.00      1.00      1.00        13
 1500 to 3000       0.82      1.00      0.90         9
 3000 to 5000       0.90      1.00      0.95         9
  500 to 1500       1.00      0.67      0.80         9
5000 to 10000       1.00      1.00      1.00        10
        < 500       0.90      0.90      0.90        10

     accuracy                           0.93        60
    macro avg       0.94      0.93      0.92        60
 weighted avg       0.94      0.93      0.93        60

 RF - New Confusion Matrix:
 [[13  0  0  0  0  0]
 [ 0  9  0  0  0  0]
 [ 0  0  9  0  0  0]
 [ 0  1  1  6  0  1]
 [ 0  0  0  0 10  0]
 [ 0  1  0  0  0  9]]
-------------------------------------------


Accuracy of Random Forest Classifier after Grid Search CV: 93.33333333333333 

-------------------------------------------


In [21]:
# Sort the values of importance

imp_feature=pd.DataFrame({'Feature':X_train.columns, 'Importance':rf_gs.best_estimator_.feature_importances_*100})

feature_rank=imp_feature.sort_values('Importance',ascending=False)
feature_rank

Unnamed: 0,Feature,Importance
50,Preferred Course Level_Advanced,7.629537
39,Reason for taking an online course_Placement,7.435414
56,Preferred Course Type_Guided Project,5.134786
52,Preferred Course Level_Intermediate,4.828358
51,Preferred Course Level_Beginner,4.705970
...,...,...
10,Reason for taking an online course_Certificate...,0.049442
0,Reason for taking an online course_CV Point,0.048543
12,Reason for taking an online course_Certificate...,0.000000
4,"Reason for taking an online course_CV Point, N...",0.000000


In [22]:
# Features to focus more

feature_rank[feature_rank['Importance']>2.5]

Unnamed: 0,Feature,Importance
50,Preferred Course Level_Advanced,7.629537
39,Reason for taking an online course_Placement,7.435414
56,Preferred Course Type_Guided Project,5.134786
52,Preferred Course Level_Intermediate,4.828358
51,Preferred Course Level_Beginner,4.70597
47,Preferred Course Duration (in hours)_35+ hours,4.683983
43,Preferred Mode of Course_Offline,4.070663
54,Preferred Course Type_Course + Capstone Projec...,3.942793
45,Preferred Course Duration (in hours)_16 to 25 ...,3.808643
48,Preferred Course Duration (in hours)_6 to 15 h...,3.229868


## Thank You :)