In [None]:
import sklearn_crfsuite
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score
from nltk.tokenize import WordPunctTokenizer

In [None]:
# Read cooked_all_sep.csv into a DataFrame; its "event_result" column supplies
# the corpus used to fit the n-gram vocabulary vectorizer.
data = pd.read_csv("cooked_all_sep.csv")

In [None]:
# Learn the unigram + bigram vocabulary over every "event_result" text,
# tokenising with NLTK's WordPunctTokenizer. The fitted `vectorizer` is read
# later by word2features to enumerate the known n-grams.
corpus = data["event_result"].tolist()
vectorizer = CountVectorizer(
    tokenizer=WordPunctTokenizer().tokenize,
    ngram_range=(1, 2),
)
vectorizer.fit_transform(corpus)

In [None]:
def reformat(section):
    """Split a DataFrame into one (text, label) sequence per team.

    Parameters
    ----------
    section : pandas.DataFrame
        Must provide the columns "team_id", "event_result" and "Primary code".

    Returns
    -------
    list of list of tuple
        One inner list per distinct "team_id", in order of first appearance.
        Each inner list holds ``(event_result, Primary code)`` pairs for that
        team's rows, in row order.
    """
    sequences = []
    for team in section["team_id"].unique():
        team_rows = section[section["team_id"] == team]
        pairs = zip(team_rows["event_result"], team_rows["Primary code"])
        sequences.append(list(pairs))
    return sequences

In [None]:
def sent2labels(sent):
    """Return the labels of a sequence as strings.

    Parameters
    ----------
    sent : iterable of (sentence, label) pairs

    Returns
    -------
    list of str
        ``str(label)`` for every pair, in order.
    """
    labels = []
    for _sentence, label in sent:
        labels.append(str(label))
    return labels

In [None]:
def word2features(sent, i):
    """Build the CRF feature dict for the i-th sentence of a sequence.

    Parameters
    ----------
    sent : list of (text, label) tuples
        One sequence, as produced by ``reformat``.
    i : int
        Position of the sentence to featurise.

    Returns
    -------
    dict
        One key per n-gram of the fitted module-level ``vectorizer``
        vocabulary (0 = absent, 1 = present in this sentence), any n-gram of
        the sentence that is not in that vocabulary (value 1), and, when
        ``i > 0``, the key ``"-1label"`` holding the label of the previous
        sentence.
    """
    text = sent[i][0]

    # uni+bigram feature for current sentence
    # Enumerate the vocabulary in feature-index order straight from
    # `vocabulary_`. This yields the same names, in the same order, as
    # `get_feature_names()`, which was removed in scikit-learn 1.2.
    vocabulary = vectorizer.vocabulary_
    features = dict.fromkeys(sorted(vocabulary, key=vocabulary.get), 0)

    # Use the vectorizer's own analyzer (lowercasing + tokenizer + n-grams)
    # instead of fitting a throw-away CountVectorizer per sentence. The set of
    # n-grams is identical, but a sentence without any token no longer raises
    # "empty vocabulary" and no model is refit on every call.
    analyze = vectorizer.build_analyzer()
    for gram in sorted(set(analyze(text))):
        features[gram] = 1

    # previous sentence label
    # NOTE(review): this is the label stored in the data itself, so features
    # built for test sequences also carry the true previous label - confirm
    # this is intended.
    if i > 0:
        features["-1label"] = sent[i - 1][1]

    return features

def sent2features(sent):
    """Return the feature dicts for every position of a sequence.

    Calls ``word2features(sent, position)`` for each index of ``sent`` and
    collects the results in order.
    """
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

In [None]:
# Number of pre-split folds, stored on disk as train{k}.csv / test{k}.csv.
N_FOLDS = 10

# Per-fold metrics; averaged (unweighted) once all folds have run.
accuracy = list()
kappa = list()

# The fold index gets its own name and is never reused by inner loops, so it
# stays valid for the whole iteration.
for fold in range(N_FOLDS):
    train = pd.read_csv(f"train{fold}.csv")
    test = pd.read_csv(f"test{fold}.csv")

    # One (text, label) sequence per team.
    re_train = reformat(train)
    re_test = reformat(test)

    X_train = [sent2features(s) for s in re_train]
    Y_train = [sent2labels(s) for s in re_train]

    X_test = [sent2features(s) for s in re_test]
    Y_test = [sent2labels(s) for s in re_test]

    # c1 / c2 are the L1 / L2 regularisation coefficients of the CRF trainer.
    clf = sklearn_crfsuite.CRF(c1=0.35, c2=0, max_iterations=150)
    clf.fit(X_train, Y_train)

    accuracy.append(clf.score(X_test, Y_test))

    # Flatten the per-sequence label lists so kappa is computed over
    # individual sentences rather than over sequences.
    pred = [label for sequence in clf.predict(X_test) for label in sequence]
    true = [label for sequence in Y_test for label in sequence]

    kappa.append(cohen_kappa_score(pred, true))

    print("Fold Complete")

print(f"Accuracy {sum(accuracy)/len(accuracy)}")
print(f"Kappa {sum(kappa)/len(kappa)}")