# 라이브러리 임포트

In [32]:
import pandas as pd
import pymysql
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sqlalchemy import create_engine
import statistics

# excel file to MySQL

In [33]:
data = pd.read_excel("./db_score_3_labels.xlsx")

conn = pymysql.connect(host='localhost', user='root', password=",./l;'p[]a", db='university')
curs = conn.cursor(pymysql.cursors.DictCursor)

db_connection_str = "mysql+pymysql://root:,./l;'p[]a@localhost/university"
db_connection = create_engine(db_connection_str)
data.to_sql(name='db_score', con=db_connection, if_exists='replace', index=False)

curs.close()
conn.close()

In [34]:
# accuracy, precision, recall, f1_score 반환 함수
def classification_performance_eval(y_test, y_predict):
    tp, tn, fp, fn = 0, 0, 0, 0
    
    for y, yp in zip(y_test, y_predict):
        if y == 1 and yp == 1:
            tp += 1
        elif y == 1 and yp == -1:
            fn += 1
        elif y == -1 and yp == 1:
            fp += 1
        else:
            tn += 1
            
    print("tp:",tp, "tn:",tn, "fp:",fp, "fn:",fn)    

    # zero division 예외 처리
    accuracy = 0 if (tp+tn+fp+fn)==0 else (tp+tn)/(tp+tn+fp+fn)
    precision = 0 if (tp+fp)==0 else(tp)/(tp+fp)
    recall = 0 if (tp+fp)==0 else (tp)/(tp+fp)
    f1_score = 0 if (precision+recall)==0 else 2*precision*recall / (precision+recall)
        
    return accuracy, precision, recall, f1_score

# 이진 분류 (grade B or not)

In [35]:
conn = pymysql.connect(host='localhost', user='root', password=",./l;'p[]a", db='university')
curs = conn.cursor(pymysql.cursors.DictCursor)

sql = "select * from db_score"
curs.execute(sql)

data = curs.fetchall()

curs.close()
conn.close()

X = [(t['homework'], t['discussion'], t['final']) for t in data]
X = np.array(X)

# grade가 B이면 1 아니면 -1 (이진 분류)
y = [1 if (t['grade'] == 'B') else -1 for t in data]
y = np.array(y)

# train_test_split SVM

In [36]:
# train_test_split SVM

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)
svm = SVC(kernel='rbf', C=1e4).fit(X_train, y_train)
y_predict = svm.predict(X_test)
print(y_predict)
    
acc, prec, rec, f1 = classification_performance_eval(y_test, y_predict)

print('accuray:', acc)
print('precision:', prec)
print('recall:', rec)
print('f1_score:', f1)

[ 1  1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1
  1 -1  1 -1 -1 -1 -1]
tp: 5 tn: 19 fp: 3 fn: 4
accuray: 0.7741935483870968
precision: 0.625
recall: 0.625
f1_score: 0.625


# train_test_split Logistic Regression

In [37]:
# train_test_split Logistic Regression

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.13, random_state=42)

# 데이터 정규화
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

lr = LogisticRegression(random_state=42, class_weight='balanced', C=1e2).fit(X_train, y_train)

y_predict = lr.predict(X_test)
print('y_predict:', y_predict)

acc, prec, rec, f1 = classification_performance_eval(y_test, y_predict)
print('accuray:', acc)
print('precision:', prec)
print('recall:', rec)
print('f1_score:', f1)

y_predict: [ 1  1  1 -1  1 -1 -1  1  1  1 -1  1]
tp: 4 tn: 3 fp: 4 fn: 1
accuray: 0.5833333333333334
precision: 0.5
recall: 0.5
f1_score: 0.5


# K-fold SVM

In [38]:
# K-fold SVM
kf = KFold(n_splits=5, random_state=42, shuffle=True)
accuracy = []
precision = []
recall = []
f1_score_ = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    svm = SVC(kernel='rbf', C=1e6).fit(X_train, y_train)
    y_predict = svm.predict(X_test)


    acc, prec, rec, f1 = classification_performance_eval(y_test, y_predict)
    accuracy.append(acc)
    precision.append(prec)
    recall.append(rec)
    f1_score_.append(f1)
        
print('accuray:', statistics.mean(accuracy))
print('precision:', statistics.mean(precision))
print('recall:', statistics.mean(recall))
print('f1_score:', statistics.mean(f1_score_))

tp: 4 tn: 10 fp: 3 fn: 2
tp: 4 tn: 8 fp: 4 fn: 3
tp: 1 tn: 13 fp: 2 fn: 2
tp: 2 tn: 8 fp: 0 fn: 8
tp: 3 tn: 7 fp: 6 fn: 2
accuray: 0.6514619883040935
precision: 0.5476190476190476
recall: 0.5476190476190476
f1_score: 0.5476190476190476


# K-fold Logistic Regression

In [39]:
# K-fold Logistic Regression

kf = KFold(n_splits=4, random_state=42, shuffle=True)
accuracy = []
precision = []
recall = []
f1_score_ = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)

    lr = LogisticRegression(random_state=42, class_weight='balanced', C=1).fit(X_train, y_train)
    y_predict= lr.predict(X_test)

    acc, prec, rec, f1 = classification_performance_eval(y_test, y_predict)
    accuracy.append(acc)
    precision.append(prec)
    recall.append(rec)
    f1_score_.append(f1)
    
print('accuray:', statistics.mean(accuracy))
print('precision:', statistics.mean(precision))
print('recall:', statistics.mean(recall))
print('f1_score:', statistics.mean(f1_score_))

tp: 4 tn: 8 fp: 9 fn: 2
tp: 6 tn: 6 fp: 9 fn: 2
tp: 2 tn: 7 fp: 7 fn: 7
tp: 5 tn: 5 fp: 10 fn: 3
accuray: 0.46739130434782605
precision: 0.3158119658119658
recall: 0.3158119658119658
f1_score: 0.3158119658119658


# 다중 클래스 분류(grade A, B, C)

In [40]:
conn = pymysql.connect(host='localhost', user='root', password=",./l;'p[]a", db='university')
curs = conn.cursor(pymysql.cursors.DictCursor)

sql = "select * from db_score"
curs.execute(sql)

data = curs.fetchall()

curs.close()
conn.close()

X = [(t['homework'], t['discussion'], t['final']) for t in data]
X = np.array(X)

y = [0 if (t['grade'] == 'A') else 1 if (t['grade'] == 'B') else 2 for t in data]
y = np.array(y)

# train_test_split SVM

In [41]:
# train_test_split SVM

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

svm = SVC(kernel='rbf', C=5).fit(X_train, y_train)
y_predict = svm.predict(X_test)
print(y_predict)
    
print('accuray:', accuracy_score(y_test, y_predict))
print('precision:', precision_score(y_test, y_predict, average=None))
print('recall:', recall_score(y_test, y_predict, average=None))
print('f1_score:', f1_score(y_test, y_predict, average=None))

[0 1 1 2 0 0 1 1 0 2 2 1 2 1 0 0 0 0 1 2 0 1 1 1 1 0 1 2 1 2 0]
accuray: 0.7419354838709677
precision: [0.90909091 0.53846154 0.85714286]
recall: [0.90909091 0.77777778 0.54545455]
f1_score: [0.90909091 0.63636364 0.66666667]


# train_test_split Logistic Regression

In [42]:
# train_test_split Logistic Regression

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

lr = LogisticRegression(random_state=42, class_weight='balanced', C=1e10).fit(X_train, y_train)

y_predict = lr.predict(X_test)
print('y_predict:', y_predict)

print('accuray:', accuracy_score(y_test, y_predict))
print('precision:', precision_score(y_test, y_predict, average=None))
print('recall:', recall_score(y_test, y_predict, average=None))
print('f1_score:', f1_score(y_test, y_predict, average=None))

y_predict: [1 1 1 2 0 0 0 2 0 2 2 1 2 1 0 0 0 1 1 2 0 1 2 1 0 0 1 2 1 2 0]
accuray: 0.7419354838709677
precision: [0.81818182 0.54545455 0.88888889]
recall: [0.81818182 0.66666667 0.72727273]
f1_score: [0.81818182 0.6        0.8       ]


# k-fold SVM

In [43]:
# k-fold SVM

kf = KFold(n_splits=5, random_state=42, shuffle=True)
accuracy = []
precision_A, precision_B, precision_C = [], [], []
recall_A, recall_B, recall_C = [], [], []
f1_score_A, f1_score_B, f1_score_C = [], [], []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    svm = SVC(kernel='rbf', C=1e10).fit(X_train, y_train)
    y_predict = svm.predict(X_test)

    prec = precision_score(y_test, y_predict, average=None, zero_division=0)
    rec = recall_score(y_test, y_predict, average=None, zero_division=0)
    f1 = f1_score(y_test, y_predict, average=None, zero_division=0)
    
    accuracy.append(accuracy_score(y_test, y_predict))
    precision_A.append(prec[0])
    precision_B.append(prec[1])
    precision_C.append(prec[2])
    recall_A.append(rec[0])
    recall_B.append(rec[1])
    recall_C.append(rec[2])
    f1_score_A.append(f1[0])
    f1_score_B.append(f1[1])
    f1_score_C.append(f1[2])
    
print('accuray:', statistics.mean(accuracy))
print('precision:', statistics.mean(precision_A), statistics.mean(precision_B), statistics.mean(precision_C) )
print('recall:', statistics.mean(recall_A), statistics.mean(recall_B), statistics.mean(recall_C))
print('f1_score:', statistics.mean(f1_score_A), statistics.mean(f1_score_B), statistics.mean(f1_score_C))

accuray: 0.543859649122807
precision: 0.5433333333333333 0.3904761904761905 0.6709956709956709
recall: 0.5457142857142857 0.3057142857142857 0.689047619047619
f1_score: 0.5167099567099568 0.32945054945054947 0.648180591338486


# k-fold Logistic Regression

In [44]:
# k-fold Logistic Regression

kf = KFold(n_splits=5, random_state=42, shuffle=True)
accuracy = []
precision_A, precision_B, precision_C = [], [], []
recall_A, recall_B, recall_C = [], [], []
f1_score_A, f1_score_B, f1_score_C = [], [], []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)

    lr = LogisticRegression(random_state=42, class_weight='balanced', C=1).fit(X_train, y_train)
    y_predict= lr.predict(X_test)

    prec = precision_score(y_test, y_predict, average=None, zero_division=0)
    rec = recall_score(y_test, y_predict, average=None, zero_division=0)
    f1 = f1_score(y_test, y_predict, average=None, zero_division=0)
    
    accuracy.append(accuracy_score(y_test, y_predict))
    precision_A.append(prec[0])
    precision_B.append(prec[1])
    precision_C.append(prec[2])
    recall_A.append(rec[0])
    recall_B.append(rec[1])
    recall_C.append(rec[2])
    f1_score_A.append(f1[0])
    f1_score_B.append(f1[1])
    f1_score_C.append(f1[2])
    
print('accuray:', statistics.mean(accuracy))
print('precision:', statistics.mean(precision_A), statistics.mean(precision_B), statistics.mean(precision_C) )
print('recall:', statistics.mean(recall_A), statistics.mean(recall_B), statistics.mean(recall_C))
print('f1_score:', statistics.mean(f1_score_A), statistics.mean(f1_score_B), statistics.mean(f1_score_C))

accuray: 0.6502923976608187
precision: 0.6742857142857143 0.6133333333333333 0.7452380952380953
recall: 0.5842857142857143 0.4676190476190476 0.8678571428571429
f1_score: 0.61 0.47883449883449886 0.7919413919413919
