# Step 1 - Loading the Required Libraries and Modules 

In [250]:
from sklearn.preprocessing import LabelEncoder
import dask
import dask.array as da
from dask_ml.datasets import make_classification
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from dask_ml.wrappers import Incremental



# Step 2 - Read data. Creating train, test

In [251]:
df = pd.read_csv('../result_data/final_v2.csv') 


In [252]:
le = LabelEncoder()
df['chuyennganh2'] = le.fit_transform(df['chuyennganh2'])

In [253]:
# Dữ liệu train gồm các khóa 8, 9, 10, 11, 12, 13
data_train = df[df["khoahoc"] < 14]
data_train1 = df[df["khoahoc"]==8]
data_train2 = df[df["khoahoc"]==9]
data_train3 = df[df["khoahoc"]==10]
data_train4 = df[df["khoahoc"]==11]
data_train5 = df[df["khoahoc"]==12]
data_train6 = df[df["khoahoc"]==13]

#  Dữ liệu test là khóa 14
data_test = df[df["khoahoc"]>=14]

print("Số mẫu train1:", data_train1.shape[0])
print("Số mẫu train2:", data_train2.shape[0])
print("Số mẫu train3:", data_train3.shape[0])
print("Số mẫu train4:", data_train4.shape[0])
print("Số mẫu train5:", data_train5.shape[0])
print("Số mẫu train6:", data_train6.shape[0])

print("Số mẫu test:", data_test.shape[0])
print("Tỉ lệ tập test: ", data_test.shape[0]/df.shape[0])

Số mẫu train1: 6888
Số mẫu train2: 7402
Số mẫu train3: 7671
Số mẫu train4: 7616
Số mẫu train5: 7528
Số mẫu train6: 9396
Số mẫu test: 9684
Tỉ lệ tập test:  0.17235917059713446


In [254]:
# Khu vực
khuvuc = [col for col in df.columns if col.startswith("khuvuc")]

# Khoa
khoa = [col for col in df.columns if col.startswith("khoa")]

# Hệ đào tạo
hedt = [col for col in df.columns if col.startswith("hedt")]

# Thuộc tính đầu vào
# input = ["gioitinh", "xl_tt", "xl_av", "khoahoc"] + khuvuc + khoa + hedt + ["dtbtl", "drltl"]
input = ["gioitinh", "xl_av", "khoahoc", "chuyennganh2", "xl_tt", "hocky_sx" , "sotchk", "dtbtl", "drltl", 'dtbhk_truoc', 'drlhk_truoc', 'sotctl', 'somon_khongdat_hktruoc' ] + khuvuc + khoa + hedt 
# Output
output = "xeploai"

In [255]:
for i in range(1, 7):
    exec(f'X_train_{i} = data_train{i}[input]')
    exec(f'y_train_{i} = data_train{i}[output]')


In [256]:
X_test = data_test[input]
y_test = data_test[output]

In [257]:
classes = da.unique(y_train_1).compute()
classes

array([0, 1, 2, 3], dtype=int64)

# Step3 - Building, Predicting, and Evaluating the models

## SGDClassifier

In [258]:
est = SGDClassifier(loss='log', penalty='l2', tol=1e-3)
inc = Incremental(est, scoring='accuracy')


In [259]:
inc.fit(X_train_1, y_train_1, classes=classes)




In [260]:
inc.score(X_test, y_test)

0.4728418009087154

In [261]:
inc.partial_fit(X_train_2, y_train_2, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.2385377942998761


In [262]:
inc.partial_fit(X_train_3, y_train_3, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.35749690210656754


In [263]:
inc.partial_fit(X_train_4, y_train_4, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.5204460966542751


In [264]:
inc.partial_fit(X_train_5, y_train_5, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.46902106567534074


In [265]:
inc.partial_fit(X_train_6, y_train_6, classes=classes)
print('Score:', inc.score(X_test, y_test))

Score: 0.35883932259396945


In [266]:
predict_train = inc.predict(X_train_6)
predict_test = inc.predict(X_test)

In [267]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_train_6,predict_train))
print(classification_report(y_train_6,predict_train))

[[ 239   27  847   18]
 [  76   13 2133    3]
 [  14    1 2719   50]
 [   2    1 2933  320]]
              precision    recall  f1-score   support

           0       0.72      0.21      0.33      1131
           1       0.31      0.01      0.01      2225
           2       0.31      0.98      0.48      2784
           3       0.82      0.10      0.18      3256

    accuracy                           0.35      9396
   macro avg       0.54      0.32      0.25      9396
weighted avg       0.54      0.35      0.24      9396



In [268]:
X_train = data_train[input]
y_train = data_train[output]

In [269]:
for _ in range(10):
    inc.partial_fit(X_train, y_train, classes=classes)
    print('Score:', inc.score(X_test, y_test))

Score: 0.345518380834366
Score: 0.2839735646427096
Score: 0.3524370095002065
Score: 0.494940107393639
Score: 0.4554935976869062
Score: 0.46788517141676994
Score: 0.3812474184221396
Score: 0.3344692275919042
Score: 0.5396530359355638
Score: 0.5326311441553078


## MLP

In [270]:
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score

In [271]:
mlp = MLPClassifier(hidden_layer_sizes=(32,32), activation='relu', solver='adam', max_iter=500)


In [272]:
mlp.fit(X_train_1,y_train_1)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.13725490196078433
HK3 : 0.09190505173463177
HK4 : 0.09697344039530574
HK5 : 0.08533501896333755
HK6 : 0.15355086372360843
HK7 : 0.16711229946524064


In [273]:
mlp.partial_fit(X_train_2, y_train_2)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.16042780748663102
HK3 : 0.17589774802191113
HK4 : 0.13218035824583077
HK5 : 0.10998735777496839
HK6 : 0.19001919385796545
HK7 : 0.19585561497326204


In [274]:
mlp.partial_fit(X_train_3, y_train_3)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.1830065359477124
HK3 : 0.23128423615337795
HK4 : 0.15873996294008647
HK5 : 0.1302149178255373
HK6 : 0.21369161868202174
HK7 : 0.2266042780748663


In [275]:
mlp.partial_fit(X_train_4, y_train_4)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.26024955436720143
HK3 : 0.3682288496652465
HK4 : 0.1865348980852378
HK5 : 0.1504424778761062
HK6 : 0.2565579014715291
HK7 : 0.28342245989304815


In [276]:
mlp.partial_fit(X_train_5, y_train_5)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.3701723113487819
HK3 : 0.5057821059038344
HK4 : 0.2291537986411365
HK5 : 0.21871049304677623
HK6 : 0.38259756877799106
HK7 : 0.42580213903743314


In [277]:
mlp.partial_fit(X_train_6, y_train_6)
for i in range(2, 8):
    X_test = data_test[input][data_test["hocky_sx"]==i]
    y_test = data_test[output][data_test["hocky_sx"]==i]
    print("HK"+str(i), ":", mlp.score(X_test, y_test))

HK2 : 0.47771836007130125
HK3 : 0.6135118685331711
HK4 : 0.4817788758492897
HK5 : 0.6011378002528445
HK6 : 0.582213691618682
HK7 : 0.5621657754010695


In [278]:
for _ in range(10):
    mlp.partial_fit(X_train, y_train)
    for i in range(2, 8):
        X_test = data_test[input][data_test["hocky_sx"]==i]
        y_test = data_test[output][data_test["hocky_sx"]==i]
        print("HK"+str(i), ":", mlp.score(X_test, y_test))
    print("============================================================")

HK2 : 0.37373737373737376
HK3 : 0.5015216068167986
HK4 : 0.47313156269302037
HK5 : 0.42920353982300885
HK6 : 0.4779270633397313
HK7 : 0.5661764705882353
HK2 : 0.40938799762329176
HK3 : 0.5502130249543518
HK4 : 0.4694255713403335
HK5 : 0.4835651074589128
HK6 : 0.5111964171465131
HK7 : 0.5661764705882353
HK2 : 0.4682115270350565
HK3 : 0.5684723067559343
HK4 : 0.47004323656578134
HK5 : 0.511378002528445
HK6 : 0.54510556621881
HK7 : 0.56951871657754
HK2 : 0.45573380867498514
HK3 : 0.5769933049300061
HK4 : 0.45521927115503397
HK5 : 0.4797724399494311
HK6 : 0.5348688419705694
HK7 : 0.5548128342245989
HK2 : 0.4979203802733215
HK3 : 0.5842970176506391
HK4 : 0.4657195799876467
HK5 : 0.47345132743362833
HK6 : 0.5252719129878439
HK7 : 0.571524064171123
HK2 : 0.4860368389780155
HK3 : 0.5642118076688983
HK4 : 0.4336009882643607
HK5 : 0.43173198482932995
HK6 : 0.5111964171465131
HK7 : 0.554144385026738
HK2 : 0.46048722519310753
HK3 : 0.595861229458308
HK4 : 0.4589252625077208
HK5 : 0.486093552465233