# Step 1 - Loading the Required Libraries and Modules 

In [163]:
from sklearn.preprocessing import LabelEncoder
import dask
import dask.array as da
from dask_ml.datasets import make_classification
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from dask_ml.wrappers import Incremental



# Step 2 - Read data. Creating train, test

In [164]:
df = pd.read_csv('../result_data/final_v2.csv') 


In [165]:
le = LabelEncoder()
df['chuyennganh2'] = le.fit_transform(df['chuyennganh2'])

In [166]:
# Dữ liệu train gồm các khóa 8, 9, 10, 11, 12, 13
data_train = df[df["khoahoc"] < 14]
data_train1 = df[df["khoahoc"]==8]
data_train2 = df[df["khoahoc"]==9]
data_train3 = df[df["khoahoc"]==10]
data_train4 = df[df["khoahoc"]==11]
data_train5 = df[df["khoahoc"]==12]
data_train6 = df[df["khoahoc"]==13]

#  Dữ liệu test là khóa 14
data_test = df[df["khoahoc"]>=14]

print("Số mẫu train1:", data_train1.shape[0])
print("Số mẫu train2:", data_train2.shape[0])
print("Số mẫu train3:", data_train3.shape[0])
print("Số mẫu train4:", data_train4.shape[0])
print("Số mẫu train5:", data_train5.shape[0])
print("Số mẫu train6:", data_train6.shape[0])

print("Số mẫu test:", data_test.shape[0])
print("Tỉ lệ tập test: ", data_test.shape[0]/df.shape[0])

Số mẫu train1: 6888
Số mẫu train2: 7402
Số mẫu train3: 7671
Số mẫu train4: 7616
Số mẫu train5: 7528
Số mẫu train6: 9396
Số mẫu test: 9684
Tỉ lệ tập test:  0.17235917059713446


In [167]:
# Khu vực
khuvuc = [col for col in df.columns if col.startswith("khuvuc")]

# Khoa
khoa = [col for col in df.columns if col.startswith("khoa")]

# Hệ đào tạo
hedt = [col for col in df.columns if col.startswith("hedt")]

# Thuộc tính đầu vào
# input = ["gioitinh", "xl_tt", "xl_av", "khoahoc"] + khuvuc + khoa + hedt + ["dtbtl", "drltl"]
input = ["gioitinh", "xl_av", "khoahoc", "chuyennganh2", "xl_tt", "hocky_sx" , "sotchk", "dtbtl", "drltl", 'dtbhk_truoc', 'drlhk_truoc', 'sotctl', 'somon_khongdat_hktruoc' ] + khuvuc + khoa + hedt 
# Output
output = "xeploai"

In [168]:
for i in range(1, 7):
    exec(f'X_train_{i} = data_train{i}[input]')
    exec(f'y_train_{i} = data_train{i}[output]')


In [169]:
X_test = data_test[input]
y_test = data_test[output]

In [170]:
classes = da.unique(y_train_1).compute()
classes

array([0, 1, 2, 3], dtype=int64)

# Step3 - Building, Predicting, and Evaluating the models

## SGDClassifier

In [171]:
est = SGDClassifier(loss='log', penalty='l2', tol=1e-3)
inc = Incremental(est, scoring='accuracy')


In [172]:
inc.fit(X_train_1, y_train_1, classes=classes)




In [173]:
inc.score(X_test, y_test)

0.40861214374225524

In [174]:
inc.partial_fit(X_train_2, y_train_2, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.45652622883106153


In [175]:
inc.partial_fit(X_train_3, y_train_3, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.4687112763320942


In [176]:
inc.partial_fit(X_train_4, y_train_4, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.38806278397356464


In [177]:
inc.partial_fit(X_train_5, y_train_5, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.508364312267658


In [178]:
inc.partial_fit(X_train_6, y_train_6, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.5765179677819083


In [179]:
predict_train = inc.predict(X_train_6)
predict_test = inc.predict(X_test)

In [180]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_train_6,predict_train))
print(classification_report(y_train_6,predict_train))

[[ 318  295  221  297]
 [ 137  527 1200  361]
 [  40  120 1267 1357]
 [  13   17  413 2813]]
              precision    recall  f1-score   support

           0       0.63      0.28      0.39      1131
           1       0.55      0.24      0.33      2225
           2       0.41      0.46      0.43      2784
           3       0.58      0.86      0.70      3256

    accuracy                           0.52      9396
   macro avg       0.54      0.46      0.46      9396
weighted avg       0.53      0.52      0.49      9396



In [181]:
X_train = data_train[input]
y_train = data_train[output]

In [182]:
for _ in range(10):
    inc.partial_fit(X_train, y_train, classes=classes)
    print('Score:', inc.score(X_test, y_test))

Score: 0.4882280049566295
Score: 0.49979347377116895
Score: 0.5054729450640232
Score: 0.209727385377943
Score: 0.45745559686080134
Score: 0.3768071045022718
Score: 0.4232755059892606
Score: 0.4771788517141677
Score: 0.330545229244114
Score: 0.2823213548120611


## MLP

In [184]:
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

In [186]:
mlp = MLPClassifier(hidden_layer_sizes=(32,32), activation='relu', solver='adam', max_iter=500)


In [188]:
mlp.fit(X_train_1,y_train_1)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.13725490196078433
HK3 : 0.10529519172245892
HK4 : 0.08400247066090179
HK5 : 0.06447534766118837
HK6 : 0.10684580934101087
HK7 : 0.11163101604278075


In [189]:
mlp.partial_fit(X_train_2, y_train_2, classes=classes)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.1414141414141414
HK3 : 0.16068167985392576
HK4 : 0.09635577516985794
HK5 : 0.08217446270543616
HK6 : 0.1599488163787588
HK7 : 0.17179144385026737


In [190]:
mlp.partial_fit(X_train_3, y_train_3, classes=classes)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.1568627450980392
HK3 : 0.22519780888618382
HK4 : 0.13279802347127856
HK5 : 0.11883691529709228
HK6 : 0.21369161868202174
HK7 : 0.22994652406417113


In [191]:
mlp.partial_fit(X_train_4, y_train_4, classes=classes)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.21152703505644682
HK3 : 0.3171028606208156
HK4 : 0.18468190240889437
HK5 : 0.15802781289506954
HK6 : 0.2476007677543186
HK7 : 0.27406417112299464


In [192]:
mlp.partial_fit(X_train_5, y_train_5, classes=classes)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.3018419489007724
HK3 : 0.3901399878271455
HK4 : 0.24397776405188387
HK5 : 0.20733249051833122
HK6 : 0.3582853486884197
HK7 : 0.45187165775401067


In [193]:
mlp.partial_fit(X_train_6, y_train_6, classes=classes)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.49732620320855614
HK3 : 0.6165550821667681
HK4 : 0.5330450895614577
HK5 : 0.6156763590391909
HK6 : 0.5674984005118362
HK7 : 0.5521390374331551


In [195]:
for _ in range(10):
    mlp.partial_fit(X_train, y_train)
    for i in range(2, 8):
        X_test = data_test[input][data_test["hocky_sx"]==i]
        y_test = data_test[output][data_test["hocky_sx"]==i]
        print("HK"+str(i), ":", mlp.score(X_test, y_test))
    print("============================================================")

HK2 : 0.5050505050505051
HK3 : 0.5976871576384662
HK4 : 0.48857319332921556
HK5 : 0.5088495575221239
HK6 : 0.5406269993602048
HK7 : 0.570855614973262
HK2 : 0.5442661913250149
HK3 : 0.6214242239805234
HK4 : 0.5083384805435454
HK5 : 0.5006321112515802
HK6 : 0.5303902751119641
HK7 : 0.5748663101604278
HK2 : 0.5121806298276886
HK3 : 0.5867315885575167
HK4 : 0.4416306361951822
HK5 : 0.42098609355246525
HK6 : 0.5182341650671785
HK7 : 0.5681818181818182
HK2 : 0.5543672014260249
HK3 : 0.62507608034084
HK4 : 0.5311920938851142
HK5 : 0.5663716814159292
HK6 : 0.5495841330774153
HK7 : 0.5655080213903744
HK2 : 0.5995246583481878
HK3 : 0.6244674376141205
HK4 : 0.5410747374922792
HK5 : 0.5587863463969659
HK6 : 0.5412667946257198
HK7 : 0.554144385026738
HK2 : 0.5739750445632799
HK3 : 0.628119293974437
HK4 : 0.5194564546016059
HK5 : 0.5436156763590392
HK6 : 0.5387076135636596
HK7 : 0.5648395721925134
HK2 : 0.5508021390374331
HK3 : 0.6262933657942787
HK4 : 0.5281037677578753
HK5 : 0.5594184576485461
HK6