In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import cohen_kappa_score
from nltk.tokenize import WordPunctTokenizer

In [None]:
# Read cooked_all_sep.csv into a DataFrame; its "event_result" column
# supplies the text corpus used to build the vocabulary.
data = pd.read_csv(filepath_or_buffer="cooked_all_sep.csv")

In [None]:
# Extract the "event_result" column as a plain Python list of documents.
event_result_column = data["event_result"]
corpus = event_result_column.tolist()

In [None]:
# Bag-of-n-grams features: tokens come from NLTK's WordPunctTokenizer and
# both unigrams and bigrams are counted.
word_punct_tokenize = WordPunctTokenizer().tokenize
vectorizer = CountVectorizer(
    tokenizer=word_punct_tokenize,
    ngram_range=(1, 2),
)

# Learn the vocabulary from `corpus`. The fitted vectorizer is reused by the
# cross-validation loop; the document-term matrix returned here is not stored
# and, as the cell's last expression, is only displayed.
vectorizer.fit_transform(corpus)

In [None]:
# Evaluate a logistic-regression classifier on ten pre-split folds
# (train{i}.csv / test{i}.csv), collecting accuracy and Cohen's kappa per
# fold and printing the mean of each at the end.
accuracy = list()
kappa = list()

for i in range(10):
    train = pd.read_csv(f"train{i}.csv")
    test = pd.read_csv(f"test{i}.csv")

    # Keep the n-gram count matrices sparse. LogisticRegression accepts scipy
    # sparse input, whereas .toarray() would allocate a dense
    # n_docs x n_vocab array for every fold.
    X_train = vectorizer.transform(train["event_result"])
    X_test = vectorizer.transform(test["event_result"])

    Y_train = train["Primary code"].to_numpy()
    Y_test = test["Primary code"].to_numpy()

    clf = LogisticRegression(tol=1e-5,C=1,max_iter=150)
    clf.fit(X_train, Y_train)

    # Predict once and derive both metrics from the same predictions instead
    # of running inference twice (clf.score() predicts internally).
    predictions = clf.predict(X_test)
    accuracy.append(np.mean(predictions == Y_test))
    # Cohen's kappa is symmetric in its two label arrays, so argument order
    # does not affect the score.
    kappa.append(cohen_kappa_score(predictions, Y_test))
    print("Fold Complete")
print(f"Accuracy {sum(accuracy)/len(accuracy)}")
print(f"Kappa {sum(kappa)/len(kappa)}")