# Machine Learning methods (RF, SVM, NB, LR)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np


def ML(method, X_train, X_test_K1, X_test_K2, X_test_K3, X_test_L, y_train, y_test_K1, y_test_K2, y_test_K3, y_test_L):
    """Fit one classifier on each data set in turn and report cross-data-set accuracy.

    The classifier selected by ``method`` is fitted four times: on
    ``(X_train, y_train)``, then on the K2, K3 and L test splits themselves.
    After every fit it is scored on all four test splits, the accuracies are
    printed, and they are stored in one row of the result matrix.

    NOTE(review): for rows 1-3 the model is fitted on the very split it is
    then scored on (K2, K3, L), so those diagonal entries are training
    accuracy, not held-out accuracy. Whether ``X_train`` is disjoint from
    ``X_test_K1`` cannot be told from this function -- confirm at the caller.

    Parameters
    ----------
    method : str
        One of ``'RF'`` (random forest, 100 trees), ``'SVM'`` (RBF kernel,
        C=1), ``'NB'`` (multinomial naive Bayes) or ``'LR'`` (logistic
        regression).
    X_train, y_train
        Features / labels used for the first fit (labelled "Kaggle1").
    X_test_K1, X_test_K2, X_test_K3, X_test_L
        Feature matrices of the four evaluation sets. Presumably anything
        scikit-learn estimators accept; ``'NB'`` presumably also needs
        non-negative features -- TODO confirm.
    y_test_K1, y_test_K2, y_test_K3, y_test_L
        Labels matching the four evaluation sets.

    Returns
    -------
    numpy.ndarray
        4x4 accuracy matrix; row = training set, column = evaluation set,
        both ordered (Kaggle1, Kaggle2, Kaggle3, LIAR). Earlier versions
        computed this matrix but discarded it and returned ``None``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Factories are lazy so that an unsupported name is rejected up front with
    # a clear message instead of surfacing later as an unbound ``clf``.
    classifier_factories = {
        'RF':  lambda: RandomForestClassifier(n_estimators=100),
        'SVM': lambda: SVC(kernel='rbf', C=1),  # Gaussian kernel
        'NB':  lambda: MultinomialNB(),
        'LR':  lambda: LogisticRegression(max_iter=10000000),
    }
    if method not in classifier_factories:
        raise ValueError(
            "Unknown method %r; expected one of %s"
            % (method, sorted(classifier_factories))
        )
    clf = classifier_factories[method]()

    print("========== Method:", method,"==========")

    # Evaluation sets, in the column order of the accuracy matrix.
    eval_sets = (
        ("Kaggle1", X_test_K1, y_test_K1),
        ("Kaggle2", X_test_K2, y_test_K2),
        ("Kaggle3", X_test_K3, y_test_K3),
        ("LIAR",    X_test_L,  y_test_L),
    )
    # Training sets, in the row order of the accuracy matrix. Only the first
    # row uses the dedicated training data; the others re-use test splits.
    train_sets = (
        ("Kaggle1", X_train,   y_train),
        ("Kaggle2", X_test_K2, y_test_K2),
        ("Kaggle3", X_test_K3, y_test_K3),
        ("LIAR",    X_test_L,  y_test_L),
    )

    # Accuracy matrix. row: training (K1, K2, K3, L) / col: testing (K1, K2, K3, L)
    acc = np.zeros((len(train_sets), len(eval_sets)))

    for row, (train_name, X_fit, y_fit) in enumerate(train_sets):
        # Re-fit the same estimator object on the current training set.
        clf.fit(X_fit, y_fit)

        # Record accuracy on every evaluation set.
        acc[row, :] = [
            metrics.accuracy_score(y_eval, clf.predict(X_eval))
            for _, X_eval, y_eval in eval_sets
        ]

        # Model accuracy: how often is the classifier correct?
        print("----- Trained on " + train_name + " -----")
        for col, (eval_name, _, _) in enumerate(eval_sets):
            print("Accuracy on " + eval_name + ":", acc[row, col])
        print("")

    return acc


In [None]:
# ======================================= #
#    2. BOW features (NB, LR, SVM, RF)    #
# ======================================= #

# Load the pickled bag-of-words data and evaluate every classifier on it.
import os
import pickle

# All file names below are resolved relative to this machine-specific folder.
os.chdir('C:/Users/Chanhwa/OneDrive - University of North Carolina at Chapel Hill/2021/FA21/COMP755/Final Project/data/bow')

# Section banner, emitted as the same five lines in a single print call.
print('\n'.join([
    '------------------------------------',
    '------------------------------------',
    '           BOW features             ',
    '------------------------------------',
    '------------------------------------',
]))

# Each file holds three consecutive pickled objects, read in order
# (presumably features, labels and the vocabulary -- confirm against the
# script that wrote them). `wordlist` is overwritten by every file.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('train_BOW.p', 'rb') as fh:
    X_train, y_train, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_K1_BOW.p', 'rb') as fh:
    X_test_K1, y_test_K1, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_K2_BOW.p', 'rb') as fh:
    X_test_K2, y_test_K2, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_K3_BOW.p', 'rb') as fh:
    X_test_K3, y_test_K3, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_L_BOW.p', 'rb') as fh:
    X_test_L, y_test_L, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

# Run every classifier (random forest, SVM, naive Bayes, logistic regression).
for clf_name in ('RF', 'SVM', 'NB', 'LR'):
    ML(clf_name, X_train, X_test_K1, X_test_K2, X_test_K3, X_test_L,
       y_train, y_test_K1, y_test_K2, y_test_K3, y_test_L)

------------------------------------
------------------------------------
           BOW features             
------------------------------------
------------------------------------
----- Trained on Kaggle1 -----
Accuracy on Kaggle1: 0.9781725040369731
Accuracy on Kaggle2: 0.4390625
Accuracy on Kaggle3: 0.7470690945372911
Accuracy on LIAR: 0.4415084977049994

----- Trained on Kaggle2 -----
Accuracy on Kaggle1: 0.5270337992093101
Accuracy on Kaggle2: 0.99658203125
Accuracy on Kaggle3: 0.5173359940134697
Accuracy on LIAR: 0.92792457511475

----- Trained on Kaggle3 -----
Accuracy on Kaggle1: 0.5292610947157415
Accuracy on Kaggle2: 0.5337890625
Accuracy on Kaggle3: 0.999002244948865
Accuracy on LIAR: 0.5385187941942687

----- Trained on LIAR -----
Accuracy on Kaggle1: 0.5310986135085473
Accuracy on Kaggle2: 0.83544921875
Accuracy on Kaggle3: 0.44200548765278125
Accuracy on LIAR: 0.9980151345986851

----- Trained on Kaggle1 -----
Accuracy on Kaggle1: 0.9686508157469792
Accuracy on Kaggle

In [None]:
# ======================================= #
#    3. TF-IDF features (NB, LR, SVM, RF) #
# ======================================= #

# Load the pickled TF-IDF data and evaluate every classifier on it.
import os
import pickle

# All file names below are resolved relative to this machine-specific folder.
os.chdir('C:/Users/Chanhwa/OneDrive - University of North Carolina at Chapel Hill/2021/FA21/COMP755/Final Project/data/tfidf')

# Section banner, emitted as the same five lines in a single print call.
print('\n'.join([
    '------------------------------------',
    '------------------------------------',
    '         TF-IDF features            ',
    '------------------------------------',
    '------------------------------------',
]))

# Each file holds three consecutive pickled objects, read in order
# (presumably features, labels and the vocabulary -- confirm against the
# script that wrote them). `wordlist` is overwritten by every file.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('train_TF_IDF.p', 'rb') as fh:
    X_train, y_train, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_K1_TF_IDF.p', 'rb') as fh:
    X_test_K1, y_test_K1, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_K2_TF_IDF.p', 'rb') as fh:
    X_test_K2, y_test_K2, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_K3_TF_IDF.p', 'rb') as fh:
    X_test_K3, y_test_K3, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_L_TF_IDF.p', 'rb') as fh:
    X_test_L, y_test_L, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)


# Run every classifier (random forest, SVM, naive Bayes, logistic regression).
for clf_name in ('RF', 'SVM', 'NB', 'LR'):
    ML(clf_name, X_train, X_test_K1, X_test_K2, X_test_K3, X_test_L,
       y_train, y_test_K1, y_test_K2, y_test_K3, y_test_L)

------------------------------------
------------------------------------
         TF-IDF features            
------------------------------------
------------------------------------
----- Trained on Kaggle1 -----
Accuracy on Kaggle1: 0.9770588562837574
Accuracy on Kaggle2: 0.4408203125
Accuracy on Kaggle3: 0.7465702170117237
Accuracy on LIAR: 0.4429971467559856

----- Trained on Kaggle2 -----
Accuracy on Kaggle1: 0.5089370232195557
Accuracy on Kaggle2: 0.99658203125
Accuracy on Kaggle3: 0.5520079820404091
Accuracy on LIAR: 0.9260637638010173

----- Trained on Kaggle3 -----
Accuracy on Kaggle1: 0.5452419399743861
Accuracy on Kaggle2: 0.558984375
Accuracy on Kaggle3: 0.999002244948865
Accuracy on LIAR: 0.5632055576231236

----- Trained on LIAR -----
Accuracy on Kaggle1: 0.51422685004733
Accuracy on Kaggle2: 0.8408203125
Accuracy on Kaggle3: 0.5457720129708157
Accuracy on LIAR: 0.9980151345986851

----- Trained on Kaggle1 -----
Accuracy on Kaggle1: 0.9760565733058634
Accuracy on Kaggle

In [None]:
# ======================================= #
#    4. Bigram features (NB, LR, SVM, RF) #
# ======================================= #

# Load the pickled bigram data and evaluate every classifier on it.
import os
import pickle

# All file names below are resolved relative to this machine-specific folder.
os.chdir('C:/Users/Chanhwa/OneDrive - University of North Carolina at Chapel Hill/2021/FA21/COMP755/Final Project/data/bigram')

# Section banner, emitted as the same five lines in a single print call.
print('\n'.join([
    '------------------------------------',
    '------------------------------------',
    '         bigram features            ',
    '------------------------------------',
    '------------------------------------',
]))

# Each file holds three consecutive pickled objects, read in order
# (presumably features, labels and the vocabulary -- confirm against the
# script that wrote them). `wordlist` is overwritten by every file.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('train_BIGRAM.p', 'rb') as fh:
    X_train, y_train, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_K1_BIGRAM.p', 'rb') as fh:
    X_test_K1, y_test_K1, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_K2_BIGRAM.p', 'rb') as fh:
    X_test_K2, y_test_K2, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_K3_BIGRAM.p', 'rb') as fh:
    X_test_K3, y_test_K3, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

with open('test_L_BIGRAM.p', 'rb') as fh:
    X_test_L, y_test_L, wordlist = pickle.load(fh), pickle.load(fh), pickle.load(fh)

# Run every classifier (random forest, SVM, naive Bayes, logistic regression).
for clf_name in ('RF', 'SVM', 'NB', 'LR'):
    ML(clf_name, X_train, X_test_K1, X_test_K2, X_test_K3, X_test_L,
       y_train, y_test_K1, y_test_K2, y_test_K3, y_test_L)

------------------------------------
------------------------------------
         bigram features            
------------------------------------
------------------------------------
----- Trained on Kaggle1 -----
Accuracy on Kaggle1: 0.9698201458878557
Accuracy on Kaggle2: 0.43837890625
Accuracy on Kaggle3: 0.7405836867049139
Accuracy on LIAR: 0.4413844436174172

----- Trained on Kaggle2 -----
Accuracy on Kaggle1: 0.4935130018375188
Accuracy on Kaggle2: 0.8580078125
Accuracy on Kaggle3: 0.4479920179595909
Accuracy on LIAR: 0.7985361617665302

----- Trained on Kaggle3 -----
Accuracy on Kaggle1: 0.5400077955342725
Accuracy on Kaggle2: 0.52421875
Accuracy on Kaggle3: 0.9980044898977302
Accuracy on LIAR: 0.5279741967497829

----- Trained on LIAR -----
Accuracy on Kaggle1: 0.5392839244946823
Accuracy on Kaggle2: 0.73564453125
Accuracy on Kaggle3: 0.519830381641307
Accuracy on LIAR: 0.8621759086961915

----- Trained on Kaggle1 -----
Accuracy on Kaggle1: 0.9680383094827106
Accuracy on Kagg