In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from utils_2 import Utils
#from utils import k_fold_cross
from smote_2 import Smote
from logistic_regression_3 import LogisticRegression
from naive_bayes import NaiveBayes
from pca import PCA

In [2]:
## Reading the data
# Load the raw survey file from the working directory and print the column
# summary (names, non-null counts, dtypes).
csv_path = 'students_adaptability_level_online_education.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Gender               1205 non-null   object
 1   Age                  1205 non-null   object
 2   Education Level      1205 non-null   object
 3   Institution Type     1205 non-null   object
 4   IT Student           1205 non-null   object
 5   Location             1205 non-null   object
 6   Load-shedding        1205 non-null   object
 7   Financial Condition  1205 non-null   object
 8   Internet Type        1205 non-null   object
 9   Network Type         1205 non-null   object
 10  Class Duration       1205 non-null   object
 11  Self Lms             1205 non-null   object
 12  Device               1205 non-null   object
 13  Adaptivity Level     1205 non-null   object
dtypes: object(14)
memory usage: 131.9+ KB


In [3]:
# Checking the features
# Per-column summary statistics, transposed so each feature is one row.
feature_summary = df.describe()
feature_summary.T

Unnamed: 0,count,unique,top,freq
Gender,1205,2,Boy,663
Age,1205,6,21-25,374
Education Level,1205,3,School,530
Institution Type,1205,2,Non Government,823
IT Student,1205,2,No,901
Location,1205,2,Yes,935
Load-shedding,1205,2,Low,1004
Financial Condition,1205,3,Mid,878
Internet Type,1205,2,Mobile Data,695
Network Type,1205,3,4G,775


In [4]:
## Preprocessing the categorical data
# Utils comes from the project-local utils_2 module. The frame returned by
# labeling() replaces `df`; judging by the mapping displayed below it swaps
# every category string for an integer code — TODO confirm against utils_2.
preprocesser = Utils()
df = preprocesser.labeling(df)
# Display the per-column category -> code lookup held on the Utils instance.
# NOTE(review): in the saved output the codes do not follow the natural order
# of ordinal columns such as Age (21-25 -> 0, 16-20 -> 1, 11-15 -> 2, ...),
# which matters for any model that treats the codes as magnitudes.
preprocesser.mapping

[{'Gender': {'Boy': 0, 'Girl': 1}},
 {'Age': {'21-25': 0,
   '16-20': 1,
   '11-15': 2,
   '26-30': 3,
   '6-10': 4,
   '1-5': 5}},
 {'Education Level': {'University': 0, 'College': 1, 'School': 2}},
 {'Institution Type': {'Non Government': 0, 'Government': 1}},
 {'IT Student': {'No': 0, 'Yes': 1}},
 {'Location': {'Yes': 0, 'No': 1}},
 {'Load-shedding': {'Low': 0, 'High': 1}},
 {'Financial Condition': {'Mid': 0, 'Poor': 1, 'Rich': 2}},
 {'Internet Type': {'Wifi': 0, 'Mobile Data': 1}},
 {'Network Type': {'4G': 0, '3G': 1, '2G': 2}},
 {'Class Duration': {'3-6': 0, '1-3': 1, '0': 2}},
 {'Self Lms': {'No': 0, 'Yes': 1}},
 {'Device': {'Tab': 0, 'Mobile': 1, 'Computer': 2}},
 {'Adaptivity Level': {'Moderate': 0, 'Low': 1, 'High': 2}}]

In [5]:
## Handling imbalances in data
# Generate synthetic rows with the project-local SMOTE implementation.
# NOTE(review): 200 and 5 are presumably the oversampling amount and the
# number of neighbours — confirm against smote_2.Smote.
smote = Smote(df, 200, 5)
generated_data = smote.smote()

# The generated array lacks the response column, so add one filled with 2.0
# (the code the label mapping assigns to 'High') as the last column.
high_labels = np.ones((generated_data.shape[0], 1)) * 2
generated_data = np.append(generated_data, high_labels, axis=1)

# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat is the
# supported equivalent. The index is deliberately left as-is here, exactly as
# append did, so downstream cells see the same frame as before.
synthetic_df = pd.DataFrame(generated_data, columns=df.columns.tolist())
df = pd.concat([df, synthetic_df])
df.shape


  df = df.append(pd.DataFrame(generated_data, columns=df.columns.tolist()))


(1405, 14)

In [16]:
# Split the frame into a feature matrix (every column except the last) and a
# single-column response array (the last column, kept two-dimensional).
n_feature_columns = df.shape[1] - 1
X = df.iloc[:, :n_feature_columns].values
Y = df.iloc[:, n_feature_columns:].values

In [7]:
## Applying one hot encoding to response
one_hot_df = preprocesser.one_hot_encoding(df, 'Adaptivity Level')
one_hot_df.rename(columns={'Adaptivity Level_0.0': 'Adaptivity Level_Moderate', 
                           'Adaptivity Level_1.0': 'Adaptivity Level_Low', 
                           'Adaptivity Level_2.0': 'Adaptivity Level_High',}, inplace=True)

In [8]:
# Renumber the rows 0..n-1 in place, discarding the old index labels.
# NOTE(review): presumably needed because the rows added by the oversampling
# cell carry their own index labels — verify. Later cells join frames
# column-wise with pd.concat(axis=1), which aligns rows on the index.
df.reset_index(drop=True, inplace=True)

In [9]:
# Renumber the rows, then fix the column layout: the 13 feature columns first,
# followed by the three response indicators in High / Low / Moderate order.
one_hot_df.reset_index(drop=True, inplace=True)
feature_columns = one_hot_df.columns.tolist()[:13]
response_columns = ['Adaptivity Level_High', 'Adaptivity Level_Low', 'Adaptivity Level_Moderate']
one_hot_df = one_hot_df.reindex(columns=feature_columns + response_columns)
one_hot_df.head()

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Device,Adaptivity Level_High,Adaptivity Level_Low,Adaptivity Level_Moderate
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0,0,1
2,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0,0,1
3,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0,0,1
4,1.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,0.0,1.0,0,1,0


In [10]:
# Logistic Regression
# Baseline run using only the first two constructor arguments.
# NOTE(review): 0.03 and 10000 are presumably the learning rate and the
# iteration count — confirm against logistic_regression_3.
weight = np.zeros((13, 1))  # one starting weight per feature column
logistic_regression = LogisticRegression(0.03, 10000)
baseline_score = logistic_regression.k_fold_cross(one_hot_df, weight, 10)
print(baseline_score)

0.37857142857142856
0.35
0.34285714285714286
0.2357142857142857
0.38571428571428573
0.34285714285714286
0.30714285714285716
0.35714285714285715
0.37857142857142856
0.3
0.3378571428571428


In [11]:
# Same data and fold count as the baseline run, with three extra constructor
# arguments.
# NOTE(review): (True, 2, 0.1) presumably switch on L2 regularisation with
# strength 0.1 — confirm against logistic_regression_3.
logistic_regression = LogisticRegression(0.03, 10000, True, 2, 0.1)
# Start from a fresh zero vector instead of reusing the array handed to an
# earlier run: if k_fold_cross updates its weight argument in place, the
# reused array would carry trained values into this experiment and the
# comparison between configurations would not be like-for-like.
weight = np.zeros((13, 1))
print(logistic_regression.k_fold_cross(one_hot_df, weight, 10))

0.35714285714285715
0.34285714285714286
0.2785714285714286
0.37142857142857144
0.35
0.3357142857142857
0.34285714285714286
0.36428571428571427
0.2857142857142857
0.32857142857142857
0.33571428571428574


In [13]:
# Same data and fold count as the baseline run, with three extra constructor
# arguments.
# NOTE(review): (True, 1, 0.1) presumably switch on L1 regularisation with
# strength 0.1 — confirm against logistic_regression_3.
logistic_regression = LogisticRegression(0.03, 10000, True, 1, 0.1)
# Start from a fresh zero vector instead of reusing the array handed to an
# earlier run: if k_fold_cross updates its weight argument in place, the
# reused array would carry trained values into this experiment and the
# comparison between configurations would not be like-for-like.
weight = np.zeros((13, 1))
print(logistic_regression.k_fold_cross(one_hot_df, weight, 10))

0.3357142857142857
0.38571428571428573
0.32857142857142857
0.35714285714285715
0.29285714285714287
0.2714285714285714
0.37142857142857144
0.35
0.35
0.2857142857142857
0.33285714285714285


In [17]:
# applying pca
# PCA is the project-local class from pca.py; it is constructed from the
# feature matrix X (every column of df except the response).
pca = PCA(X)

In [18]:
# Evaluate pve for 8 components; the saved output shows roughly 0.878.
# NOTE(review): pve presumably stands for "proportion of variance explained",
# and the "(13, 13)" line in the output looks like a debug print from inside
# the PCA class — confirm both in pca.py.
pca.pve(8)

(13, 13)


0.8780500611160481

In [22]:
# examine(8) returns the matrix used below to project X. Because the result of
# the projection is later given 8 column names, it is presumably 13x8 (one
# column per retained component) — TODO confirm in pca.py.
pca_x = pca.examine(8)

# Project the raw feature matrix onto the retained components.
# NOTE(review): X is not mean-centred here (an all-zero row maps to all-zero
# scores); confirm this matches how PCA computed its components.
new_features = X @ pca_x

(13, 13)


In [24]:
# Label the eight projected columns PC1..PC8 and attach the three one-hot
# response indicators (the last three columns of one_hot_df) side by side.
component_names = [f"PC{i}" for i in range(1, 9)]
new_features = pd.DataFrame(new_features, columns=component_names)
response_indicators = one_hot_df.iloc[:, -3:]
pca_df = pd.concat([new_features, response_indicators], axis=1)
pca_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,Adaptivity Level_High,Adaptivity Level_Low,Adaptivity Level_Moderate
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
1,-0.016382,-0.264376,0.393696,0.68926,-0.684856,-0.196187,-1.282159,-0.733923,0,0,1
2,1.380476,-0.285977,0.386642,0.567124,0.407751,-0.358297,-1.299652,0.403229,0,0,1
3,2.781571,-0.630096,-0.137826,0.402572,-0.255571,0.069385,-1.310062,-0.483087,0,0,1
4,2.082154,-0.626845,-0.904242,1.731236,-0.41115,-0.003939,-2.128188,-0.373822,0,1,0


In [25]:
# Logistic regression on the PCA frame: eight component columns, so the
# starting weight vector has eight zero entries.
weight = np.zeros((8, 1))
logistic_regression = LogisticRegression(0.03, 10000, True, 1, 0.1)
pca_score = logistic_regression.k_fold_cross(pca_df, weight, 10)
print(pca_score)

0.42142857142857143
0.30714285714285716
0.42857142857142855
0.37142857142857144
0.37142857142857144
0.36428571428571427
0.4
0.34285714285714286
0.37857142857142856
0.37857142857142856
0.37642857142857145


In [27]:
## Naive Bayes
# NaiveBayes is the project-local classifier from naive_bayes.py.
nb = NaiveBayes()
# Run k_fold_cross on the label-encoded, oversampled frame. The second argument
# (10) is presumably the number of folds — confirm in naive_bayes.py.
nb.k_fold_cross(df, 10)


yvector (1405, 1)


0.36928571428571433

In [None]:
# Naive Bayes on the PCA features: pair the eight component columns with the
# single integer-coded response column (the last column of df) instead of the
# one-hot indicators used for logistic regression. This rebinds pca_df.
pca_df = pd.concat([new_features, df.iloc[:, -1:]], axis=1)
# The original cell printed the bound method object (`print(nb.k_fold_cross)`)
# and never ran the evaluation; call it with the same fold argument as the
# Naive Bayes run on the full feature set.
# NOTE(review): the PCA scores are continuous — confirm that NaiveBayes
# handles non-categorical inputs.
nb.k_fold_cross(pca_df, 10)