In this project the dataset in the following link is used:

https://www.kaggle.com/marlonferrari/elearning-student-reactions

Our aim is to establish a classification model based on this dataset to predict the approvement situation of a student by his/her time online and different post features.


In [20]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [21]:
import pandas as pd 
online = pd.read_csv("online_classroom_data.csv",sep = ',') 
# Preview the first 5 lines of the loaded data 
online["sk1_classroom"] = online["sk1_classroom"].str.replace(",",".").astype(float)
online["sk2_classroom"] = online["sk2_classroom"].str.replace(",",".").astype(float)
online["sk3_classroom"] = online["sk3_classroom"].str.replace(",",".").astype(float)
online["sk4_classroom"] = online["sk4_classroom"].str.replace(",",".").astype(float)
online["sk5_classroom"] = online["sk5_classroom"].str.replace(",",".").astype(float)
online.head()

Unnamed: 0.1,Unnamed: 0,total_posts,helpful_post,nice_code_post,collaborative_post,confused_post,creative_post,bad_post,amazing_post,timeonline,sk1_classroom,sk2_classroom,sk5_classroom,sk3_classroom,sk4_classroom,Approved
0,0,1.0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,1600.0,2.1,2.4,3.5,3.6,1.7,0
1,1,1.0,0.0,0.0,1.0,0.0,2.0,0.0,3.0,592.0,0.3,0.3,0.0,0.1,0.2,0
2,2,2.0,4.0,3.0,9.0,0.0,16.0,1.0,8.0,1110.0,8.0,5.0,5.0,7.0,5.0,1
3,3,5.0,1.0,3.0,9.0,2.0,11.0,0.0,8.0,8651.0,6.0,5.0,4.0,6.0,4.0,1
4,4,14.0,6.0,15.0,28.0,0.0,50.0,0.0,45.0,34172.0,8.7,9.0,6.5,10.0,8.8,1


In [22]:
online_dropped=online.drop(['sk1_classroom','sk2_classroom','sk3_classroom','sk4_classroom','sk5_classroom'],axis=1)
online_dropped2=online_dropped.drop(['Unnamed: 0','total_posts', 'Approved'],axis=1)
online_dropped2.head()

Unnamed: 0,helpful_post,nice_code_post,collaborative_post,confused_post,creative_post,bad_post,amazing_post,timeonline
0,0.0,0.0,0.0,0.0,6.0,0.0,1.0,1600.0
1,0.0,0.0,1.0,0.0,2.0,0.0,3.0,592.0
2,4.0,3.0,9.0,0.0,16.0,1.0,8.0,1110.0
3,1.0,3.0,9.0,2.0,11.0,0.0,8.0,8651.0
4,6.0,15.0,28.0,0.0,50.0,0.0,45.0,34172.0


In [23]:
data_X=online_dropped2.copy()
data_labels=online["Approved"]
from sklearn.model_selection import train_test_split
data_X_train, data_X_test, data_labels_train, data_labels_test = train_test_split(data_X, data_labels, test_size=0.2, random_state=101)
data_X_train.head()

Unnamed: 0,helpful_post,nice_code_post,collaborative_post,confused_post,creative_post,bad_post,amazing_post,timeonline
21,8.0,14.0,33.0,1.0,35.0,0.0,26.0,20765.0
66,53.0,90.0,89.0,3.0,150.0,0.0,155.0,39314.0
1,0.0,0.0,1.0,0.0,2.0,0.0,3.0,592.0
26,6.0,13.0,15.0,0.0,26.0,0.0,34.0,27669.0
70,8.0,9.0,19.0,2.0,22.0,0.0,17.0,6826.0


In [24]:
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(data_X_train, data_labels_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [25]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, data_X_train, data_labels_train, cv=3, scoring="accuracy")

array([0.89473684, 0.78947368, 0.72222222])

In [26]:
#some_digit=data_X_train[10]
sample=np.array([0.0,0.0,1.0,0.0,2.0,0.0,3.0,592.0])
sample.reshape(-1,1)
sample2=np.array([53.0,90.0,89.0,3.0,150.0,0.0,155.0,39314.0])
sample2.reshape(-1,1)
sgd_clf.predict([sample])

array([1], dtype=int64)

In [27]:
data_labels_train.head()

21    1
66    1
1     0
26    1
70    1
Name: Approved, dtype: int64

Here, we see that the timeonline dominates the other features. We can think of scaling the features.

In [28]:
from sklearn.preprocessing import MinMaxScaler
sc_X=MinMaxScaler()
sc_y=MinMaxScaler()
data_X = sc_X.fit_transform(data_X)
data_labels=pd.DataFrame(data_labels)
data_labels = sc_y.fit_transform(data_labels)
data_X_train, data_X_test, data_labels_train, data_labels_test = train_test_split(data_X, data_labels, test_size=0.2, random_state=101)
data_X_train=pd.DataFrame(data_X_train)
data_X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.150943,0.155556,0.370787,0.142857,0.233333,0.0,0.167742,0.47613
1,1.0,1.0,1.0,0.428571,1.0,0.0,1.0,0.901449
2,0.0,0.0,0.011236,0.0,0.013333,0.0,0.019355,0.013574
3,0.113208,0.144444,0.168539,0.0,0.173333,0.0,0.219355,0.634435
4,0.150943,0.1,0.213483,0.285714,0.146667,0.0,0.109677,0.156517


In [29]:
data_labels_train=data_labels_train.ravel()
sgd_clf.fit(data_X_train, data_labels_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [30]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, data_X_train, data_labels_train, cv=3, scoring="accuracy")

array([0.68421053, 0.78947368, 0.77777778])

In [31]:
sample=np.array([0.000000,0.000000,0.011236,0.000000,0.013333,0.0,0.019355,0.013574])
sample.reshape(-1,1)
sample2=np.array([0.113208,0.144444,0.168539,0.000000,0.173333,0.0,0.219355,0.634435])
sample2.reshape(-1,1)
print(sgd_clf.predict([sample]))
print(sgd_clf.predict([sample2]))

[0.]
[1.]


In [32]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, data_X_train, data_labels_train, cv=3, scoring="accuracy")

array([0.68421053, 0.78947368, 0.77777778])

In [33]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(sgd_clf.predict(data_X_test), data_labels_test)
mse = mean_squared_error(sgd_clf.predict(data_X_test), data_labels_test)
rmse = np.sqrt(mse)

print('Mean Absolute Error (MAE): %.2f' % mae)
print('Mean Squared Error (MSE): %.2f' % mse)
print('Root Mean Squared Error (RMSE): %.2f' % rmse)

Mean Absolute Error (MAE): 0.07
Mean Squared Error (MSE): 0.07
Root Mean Squared Error (RMSE): 0.26


Here, we computed the error of test data. It gives a reasonable amount of error as we expected. Afterwards, we will do the model a bit more complex by adding a shuffle technique into it. Now, we apply a random forest algorithm to do the classification.

In [45]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=3, random_state=41)
forest_clf.fit(data_X_train, data_labels_train)
mae = mean_absolute_error(forest_clf.predict(data_X_test), data_labels_test.ravel())
mse = mean_squared_error(forest_clf.predict(data_X_test), data_labels_test.ravel())
rmse = np.sqrt(mse)
data_labels_test_rf_predicted=forest_clf.predict(data_X_test)
print('Mean Absolute Error (MAE): %.2f' % mae)
print('Mean Squared Error (MSE): %.2f' % mse)
print('Root Mean Squared Error (RMSE): %.2f' % rmse)

Mean Absolute Error (MAE): 0.07
Mean Squared Error (MSE): 0.07
Root Mean Squared Error (RMSE): 0.26


  This is separate from the ipykernel package so we can avoid doing imports until


Here, we are suspicious about the error. We wonder if there is a leakage in the features. We conclude that it is obvious if you are interested in the class to get good comments and some online interaction you can almost accept yourself approved.

In [46]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(data_labels_test, data_labels_test_rf_predicted))
from sklearn.metrics import precision_score, recall_score
print('Precision score: %.2f' % precision_score(data_labels_test, data_labels_test_rf_predicted))
print('Recall score: %.2f' % recall_score(data_labels_test, data_labels_test_rf_predicted))
from sklearn.metrics import f1_score
print('F1 score: %.2f' % f1_score(data_labels_test, data_labels_test_rf_predicted))

[[ 3  0]
 [ 1 11]]
Precision score: 1.00
Recall score: 0.92
F1 score: 0.96


Now, we explore the index properties of the dataframe to do a shuffle index.

In [47]:
data_labels_train=pd.DataFrame(data_labels_train)
data_labels_train.index
data_labels_train.index.values

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55], dtype=int64)

In [53]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, random_state=37, shuffle=True)

for train_index, test_index in skfolds.split(data_X_train, data_labels_train):
    
    clone_clf = clone(sgd_clf)
    X_train_folds = data_X_train.iloc[train_index.astype(int)]
    y_train_folds = data_labels_train.iloc[train_index.astype(int)]
    X_test_fold = data_X_train.iloc[test_index.astype(int)]
    y_test_fold = data_labels_train.iloc[test_index.astype(int)]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred.ravel() == y_test_fold.values.ravel())
    print(n_correct / len(y_pred))

0.8421052631578947
0.8421052631578947
0.8333333333333334


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [54]:
data_labels_test_predicted=clone_clf.predict(data_X_test)
mae = mean_absolute_error(data_labels_test_predicted, data_labels_test)
mse = mean_squared_error(data_labels_test_predicted, data_labels_test)
rmse = np.sqrt(mse)
print('Mean Absolute Error (MAE): %.2f' % mae)
print('Mean Squared Error (MSE): %.2f' % mse)
print('Root Mean Squared Error (RMSE): %.2f' % rmse)

Mean Absolute Error (MAE): 0.00
Mean Squared Error (MSE): 0.00
Root Mean Squared Error (RMSE): 0.00


In [55]:
from sklearn.metrics import precision_score, recall_score
print('Precision score: %.2f' % precision_score(data_labels_test, data_labels_test_rf_predicted))
print('Recall score: %.2f' % recall_score(data_labels_test, data_labels_test_rf_predicted))
from sklearn.metrics import f1_score
print('F1 score: %.2f' % f1_score(data_labels_test, data_labels_test_rf_predicted))

Precision score: 1.00
Recall score: 0.92
F1 score: 0.96


Here, we have choices like SVC however the recent model gives the estimation of approved or not with %85 percent accuracy. Our data is very straightforward. If someone has nice posts then he/she is approved in the class.