# Step 1 - Loading the Required Libraries and Modules 

In [91]:
from sklearn.preprocessing import LabelEncoder
import dask
import dask.array as da
from dask_ml.datasets import make_classification
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from dask_ml.wrappers import Incremental



# Step 2 - Read data. Creating train, test

In [92]:
df = pd.read_csv('../result_data/final_v3.csv') 


In [93]:
le = LabelEncoder()
df['chuyennganh2'] = le.fit_transform(df['chuyennganh2'])

In [94]:
# Dữ liệu train gồm các khóa 8, 9, 10, 11, 12, 13
data_train = df[df["khoahoc"] < 14]
data_train1 = df[df["khoahoc"]==8]
data_train2 = df[df["khoahoc"]==9]
data_train3 = df[df["khoahoc"]==10]
data_train4 = df[df["khoahoc"]==11]
data_train5 = df[df["khoahoc"]==12]
data_train6 = df[df["khoahoc"]==13]

#  Dữ liệu test là khóa 14
data_test = df[df["khoahoc"]>=14]

print("Số mẫu train1:", data_train1.shape[0])
print("Số mẫu train2:", data_train2.shape[0])
print("Số mẫu train3:", data_train3.shape[0])
print("Số mẫu train4:", data_train4.shape[0])
print("Số mẫu train5:", data_train5.shape[0])
print("Số mẫu train6:", data_train6.shape[0])

print("Số mẫu test:", data_test.shape[0])
print("Tỉ lệ tập test: ", data_test.shape[0]/df.shape[0])

Số mẫu train1: 6888
Số mẫu train2: 7402
Số mẫu train3: 7671
Số mẫu train4: 7616
Số mẫu train5: 7528
Số mẫu train6: 9396
Số mẫu test: 9684
Tỉ lệ tập test:  0.17235917059713446


In [95]:
# Khu vực
khuvuc = [col for col in df.columns if col.startswith("khuvuc")]

# Khoa
khoa = [col for col in df.columns if col.startswith("khoa")]

# Hệ đào tạo
hedt = [col for col in df.columns if col.startswith("hedt")]

# Thuộc tính đầu vào
# input = ["gioitinh", "xl_tt", "xl_av", "khoahoc"] + khuvuc + khoa + hedt + ["dtbtl", "drltl"]
input = ["gioitinh", "xl_av", "khoahoc", "chuyennganh2", "xl_tt", "hocky_sx" , "sotchk", "dtbtl", "drltl", 'dtbhk_truoc', 'drlhk_truoc', 'sotctl', 'somon_khongdat_hktruoc' ] + khuvuc + khoa + hedt 
# Output
output = "xeploai"

In [96]:
for i in range(1, 7):
    exec(f'X_train_{i} = data_train{i}[input]')
    exec(f'y_train_{i} = data_train{i}[output]')


In [97]:
X_test = data_test[input]
y_test = data_test[output]

In [98]:
classes = da.unique(y_train_1).compute()
classes

array([0, 1, 2, 3, 4, 5, 6], dtype=int64)

# Step3 - Building, Predicting, and Evaluating the models

## SGDClassifier

In [99]:
est = SGDClassifier(loss='log', penalty='l2', tol=1e-3)
inc = Incremental(est, scoring='accuracy')


In [100]:
inc.fit(X_train_1, y_train_1, classes=classes)




In [101]:
inc.score(X_test, y_test)

0.22263527467988434

In [102]:
inc.partial_fit(X_train_2, y_train_2, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.3878562577447336


In [103]:
inc.partial_fit(X_train_3, y_train_3, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.22439074762494837


In [104]:
inc.partial_fit(X_train_4, y_train_4, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.16935150764147047


In [105]:
inc.partial_fit(X_train_5, y_train_5, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.4047914085088806


In [106]:
inc.partial_fit(X_train_6, y_train_6, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.12980173482032217


In [107]:
predict_train = inc.predict(X_train_6)
predict_test = inc.predict(X_test)

In [108]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_train_6,predict_train))
print(classification_report(y_train_6,predict_train))

[[ 431    9  339   10    2    0    2]
 [  58    2  265   11    1    0    1]
 [  61    1  596   29    8    1    0]
 [  77    2 1304   69   59   10    8]
 [  95    1 2365   54  117   72   80]
 [ 113    0 2144   15   69  151  272]
 [  45    0  304    1    1   25  116]]
              precision    recall  f1-score   support

           0       0.49      0.54      0.52       793
           1       0.13      0.01      0.01       338
           2       0.08      0.86      0.15       696
           3       0.37      0.05      0.08      1529
           4       0.46      0.04      0.08      2784
           5       0.58      0.05      0.10      2764
           6       0.24      0.24      0.24       492

    accuracy                           0.16      9396
   macro avg       0.34      0.25      0.17      9396
weighted avg       0.43      0.16      0.13      9396



In [109]:
X_train = data_train[input]
y_train = data_train[output]

In [110]:
for _ in range(10):
    inc.partial_fit(X_train, y_train, classes=classes)
    print('Score:', inc.score(X_test, y_test))

Score: 0.44785212722015694
Score: 0.33767038413878564
Score: 0.1873192895497728
Score: 0.3135068153655514
Score: 0.4774886410574143
Score: 0.21850475010326312
Score: 0.34675753820735233
Score: 0.4354605534902933
Score: 0.40850888062783974
Score: 0.3537794299876084


## MLP

In [111]:
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

In [112]:
mlp = MLPClassifier(hidden_layer_sizes=(32,32), activation='relu', solver='adam', max_iter=500)


In [113]:
mlp.fit(X_train_1,y_train_1)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.1289364230540701
HK3 : 0.30797321972002434
HK4 : 0.26744904261890057
HK5 : 0.28824273072060685
HK6 : 0.3378119001919386
HK7 : 0.33890374331550804


In [114]:
mlp.partial_fit(X_train_2, y_train_2)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.19786096256684493
HK3 : 0.411442483262325
HK4 : 0.31871525633106856
HK5 : 0.3116308470290771
HK6 : 0.362763915547025
HK7 : 0.3549465240641711


In [115]:
mlp.partial_fit(X_train_3, y_train_3)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.1592394533571004
HK3 : 0.443700547778454
HK4 : 0.3057442865966646
HK5 : 0.3192161820480405
HK6 : 0.39923224568138194
HK7 : 0.41109625668449196


In [116]:
mlp.partial_fit(X_train_4, y_train_4)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.18835412953060013
HK3 : 0.46865489957395007
HK4 : 0.3292155651636813
HK5 : 0.2724399494310999
HK6 : 0.3237364043506078
HK7 : 0.31885026737967914


In [117]:
mlp.partial_fit(X_train_5, y_train_5)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.3939393939393939
HK3 : 0.47717589774802194
HK4 : 0.3323038912909203
HK5 : 0.33438685208596713
HK6 : 0.41394753678822777
HK7 : 0.4391711229946524


In [118]:
mlp.partial_fit(X_train_6, y_train_6)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.44266191325014853
HK3 : 0.5027388922702374
HK4 : 0.5071031500926498
HK5 : 0.572692793931732
HK6 : 0.5150351887396033
HK7 : 0.44385026737967914


In [119]:
for _ in range(10):
    mlp.partial_fit(X_train, y_train)
    for i in range(2, 8):
        X_test = data_test[input][data_test["hocky_sx"]==i]
        y_test = data_test[output][data_test["hocky_sx"]==i]
        print("HK"+str(i), ":", mlp.score(X_test, y_test))
    print("============================================================")

HK2 : 0.3790849673202614
HK3 : 0.5076080340839927
HK4 : 0.4595429277331686
HK5 : 0.4469026548672566
HK6 : 0.4779270633397313
HK7 : 0.464572192513369
HK2 : 0.39334521687462864
HK3 : 0.505173463177115
HK4 : 0.4688079061148857
HK5 : 0.4393173198482933
HK6 : 0.45873320537428025
HK7 : 0.4699197860962567
HK2 : 0.39037433155080214
HK3 : 0.49482653682288497
HK4 : 0.46757257566399013
HK5 : 0.4525916561314791
HK6 : 0.46833013435700577
HK7 : 0.4659090909090909
HK2 : 0.42424242424242425
HK3 : 0.47839318320146074
HK4 : 0.5151327980234712
HK5 : 0.5625790139064475
HK6 : 0.4888035828534869
HK7 : 0.4318181818181818
HK2 : 0.37373737373737376
HK3 : 0.5100426049908704
HK4 : 0.5003088326127239
HK5 : 0.538558786346397
HK6 : 0.4964811260396673
HK7 : 0.4338235294117647
HK2 : 0.4177064765300059
HK3 : 0.519780888618381
HK4 : 0.4737492279184682
HK5 : 0.4879898862199747
HK6 : 0.490722968650032
HK7 : 0.4679144385026738
HK2 : 0.37967914438502676
HK3 : 0.5106512477175897
HK4 : 0.5114268066707844
HK5 : 0.527180783817