In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

In [2]:
data = pd.read_csv("cooked_all_sep.csv")

In [3]:
data

Unnamed: 0.1,Unnamed: 0,task,team_id,Person-ID (mellon-ID),event_result,Primary code
0,0,apartment,179706_162850_162936_182908,162850,Hi,0
1,1,apartment,179706_162850_162936_182908,182908,hey,0
2,2,apartment,179706_162850_162936_182908,162936,Hey,0
3,3,apartment,179706_162850_162936_182908,179706,hola,0
4,4,apartment,179706_162850_162936_182908,162850,Is everyone here?,7
...,...,...,...,...,...,...
6170,6170,Professor,201620_174788_194314_175043,201620,have^,2
6171,6171,Professor,201620_174788_194314_175043,194314,BCA it is,1
6172,6172,Professor,201620_174788_194314_175043,174788,the count down just stops at 00:00 : ),1
6173,6173,Professor,201620_174788_194314_175043,194314,Oh haha,2


In [4]:
label0 = ' '.join(data[data['Primary code'] == 0]['event_result'])
label1 = ' '.join(data[data['Primary code'] == 1]['event_result'])
label2 = ' '.join(data[data['Primary code'] == 2]['event_result'])
label3 = ' '.join(data[data['Primary code'] == 3]['event_result'])
label4 = ' '.join(data[data['Primary code'] == 4]['event_result'])
label5 = ' '.join(data[data['Primary code'] == 5]['event_result'])
label6 = ' '.join(data[data['Primary code'] == 6]['event_result'])
label7 = ' '.join(data[data['Primary code'] == 7]['event_result'])

In [5]:
tfidf = TfidfVectorizer(sublinear_tf=True, stop_words='english')

In [6]:
tfidf.fit([label0,label1,label2,label3,label4,label5,label6,label7])

TfidfVectorizer(stop_words='english', sublinear_tf=True)

In [7]:
X = data["event_result"]

In [8]:
X

0                                           Hi
1                                          hey
2                                          Hey
3                                         hola
4                            Is everyone here?
                         ...                  
6170                                     have^
6171                                 BCA it is
6172    the count down just stops at 00:00 : )
6173                                   Oh haha
6174                                  wow haha
Name: event_result, Length: 6175, dtype: object

In [9]:
Y = data["Primary code"]

In [10]:
Y

0       0
1       0
2       0
3       0
4       7
       ..
6170    2
6171    1
6172    1
6173    2
6174    0
Name: Primary code, Length: 6175, dtype: int64

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)

# NB

In [12]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB(alpha=1e-10)
clf.fit(tfidf.transform(X_train).toarray(),Y_train)

MultinomialNB(alpha=1e-10)

In [13]:
from sklearn.metrics import classification_report

y_pred = clf.predict(tfidf.transform(X_test).toarray())
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.62      0.71       196
           1       0.60      0.72      0.65       478
           2       0.50      0.48      0.49       189
           3       0.47      0.47      0.47       151
           4       0.34      0.29      0.31        91
           5       0.00      0.00      0.00         5
           6       0.53      0.40      0.46        70
           7       0.39      0.31      0.34        55

    accuracy                           0.56      1235
   macro avg       0.46      0.41      0.43      1235
weighted avg       0.57      0.56      0.56      1235



In [14]:
accuracy = list()
kappa = list()

for i in range(10):
    train = pd.read_csv(f"train{i}.csv")
    test = pd.read_csv(f"test{i}.csv")

    
    
    X_train = tfidf.transform(train["event_result"]).toarray()
    X_test = tfidf.transform(test["event_result"]).toarray()

    Y_train = train["Primary code"].to_numpy()
    Y_test = test["Primary code"].to_numpy()
    
    clf = MultinomialNB(alpha=1e-10)
    clf.fit(X_train, Y_train)
    
    accuracy.append(clf.score(X_test,Y_test))
    kappa.append(cohen_kappa_score(clf.predict(X_test),Y_test))
    print("Fold Complete")
print(f"Accuracy {sum(accuracy)/len(accuracy)}")
print(f"Kappa {sum(kappa)/len(kappa)}")

Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Accuracy 0.5464099679469728
Kappa 0.40695936805350186


# Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
clf = RandomForestClassifier(max_depth=70,min_samples_split=5)
clf.fit(tfidf.transform(X_train).toarray(),Y_train)

RandomForestClassifier(max_depth=70, min_samples_split=5)

In [20]:
y_pred = clf.predict(tfidf.transform(X_test).toarray())
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      0.44      0.59       194
           1       0.48      0.96      0.64       452
           2       0.79      0.32      0.45       197
           3       0.72      0.54      0.62       145
           4       0.50      0.01      0.02       103
           5       0.00      0.00      0.00         6
           6       0.77      0.38      0.50        72
           7       0.90      0.14      0.24        66

    accuracy                           0.57      1235
   macro avg       0.63      0.35      0.38      1235
weighted avg       0.66      0.57      0.52      1235



In [21]:
accuracy = list()
kappa = list()

for i in range(10):
    train = pd.read_csv(f"train{i}.csv")
    test = pd.read_csv(f"test{i}.csv")

    X_train = tfidf.transform(train["event_result"]).toarray()
    X_test = tfidf.transform(test["event_result"]).toarray()

    Y_train = train["Primary code"].to_numpy()
    Y_test = test["Primary code"].to_numpy()
    
    clf = RandomForestClassifier(max_depth=70,min_samples_split=5)
    clf.fit(X_train, Y_train)
    
    accuracy.append(clf.score(X_test,Y_test))
    kappa.append(cohen_kappa_score(clf.predict(X_test),Y_test))
    print("Fold Complete")
print(f"Accuracy {sum(accuracy)/len(accuracy)}")
print(f"Kappa {sum(kappa)/len(kappa)}")

Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Accuracy 0.5541905919460661
Kappa 0.3593890026633581


# Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
clf = DecisionTreeClassifier(min_samples_split=10)
clf.fit(tfidf.transform(X_train).toarray(),Y_train)
y_pred = clf.predict(tfidf.transform(X_test).toarray())
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.54      0.59       176
           1       0.67      0.68      0.67       499
           2       0.54      0.38      0.45       197
           3       0.31      0.65      0.42       140
           4       0.28      0.24      0.26        88
           5       0.00      0.00      0.00         4
           6       0.65      0.49      0.56        72
           7       0.47      0.12      0.19        59

    accuracy                           0.54      1235
   macro avg       0.45      0.39      0.39      1235
weighted avg       0.56      0.54      0.53      1235



In [24]:
accuracy = list()
kappa = list()

for i in range(10):
    train = pd.read_csv(f"train{i}.csv")
    test = pd.read_csv(f"test{i}.csv")

    X_train = tfidf.transform(train["event_result"]).toarray()
    X_test = tfidf.transform(test["event_result"]).toarray()

    Y_train = train["Primary code"].to_numpy()
    Y_test = test["Primary code"].to_numpy()
    
    clf = DecisionTreeClassifier(min_samples_split=10)
    clf.fit(X_train, Y_train)
    
    accuracy.append(clf.score(X_test,Y_test))
    kappa.append(cohen_kappa_score(clf.predict(X_test),Y_test))
    print("Fold Complete")
print(f"Accuracy {sum(accuracy)/len(accuracy)}")
print(f"Kappa {sum(kappa)/len(kappa)}")

Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Accuracy 0.5328525262790527
Kappa 0.40123578631138407


# LR

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
clf = LogisticRegression(tol=1e-5,C=1,max_iter=150)
clf.fit(tfidf.transform(X_train).toarray(),Y_train)
y_pred = clf.predict(tfidf.transform(X_test).toarray())
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.59      0.69       191
           1       0.53      0.90      0.66       462
           2       0.55      0.37      0.44       209
           3       0.69      0.48      0.56       143
           4       0.53      0.09      0.15       104
           5       0.00      0.00      0.00         2
           6       0.62      0.37      0.46        68
           7       0.65      0.20      0.30        56

    accuracy                           0.58      1235
   macro avg       0.55      0.37      0.41      1235
weighted avg       0.61      0.58      0.55      1235



  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
accuracy = list()
kappa = list()

for i in range(10):
    train = pd.read_csv(f"train{i}.csv")
    test = pd.read_csv(f"test{i}.csv")

    X_train = tfidf.transform(train["event_result"]).toarray()
    X_test = tfidf.transform(test["event_result"]).toarray()

    Y_train = train["Primary code"].to_numpy()
    Y_test = test["Primary code"].to_numpy()
    
    clf = LogisticRegression(tol=1e-5,C=1,max_iter=150)
    clf.fit(X_train, Y_train)
    
    accuracy.append(clf.score(X_test,Y_test))
    kappa.append(cohen_kappa_score(clf.predict(X_test),Y_test))
    print("Fold Complete")
print(f"Accuracy {sum(accuracy)/len(accuracy)}")
print(f"Kappa {sum(kappa)/len(kappa)}")

Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Accuracy 0.5889028728773134
Kappa 0.4278328670347773


# SVM

In [28]:
from sklearn.svm import LinearSVC

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
clf = LinearSVC(tol=1e-5,C=0.1,max_iter=2000)
clf.fit(tfidf.transform(X_train).toarray(),Y_train)
y_pred = clf.predict(tfidf.transform(X_test).toarray())
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.64      0.72       190
           1       0.55      0.92      0.69       458
           2       0.63      0.45      0.52       211
           3       0.67      0.56      0.61       139
           4       0.60      0.03      0.06        98
           5       0.00      0.00      0.00         3
           6       0.63      0.35      0.45        69
           7       0.29      0.03      0.05        67

    accuracy                           0.60      1235
   macro avg       0.52      0.37      0.39      1235
weighted avg       0.61      0.60      0.56      1235



  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
accuracy = list()
kappa = list()

for i in range(10):
    train = pd.read_csv(f"train{i}.csv")
    test = pd.read_csv(f"test{i}.csv")

    X_train = tfidf.transform(train["event_result"]).toarray()
    X_test = tfidf.transform(test["event_result"]).toarray()

    Y_train = train["Primary code"].to_numpy()
    Y_test = test["Primary code"].to_numpy()
    
    clf = LinearSVC(tol=1e-5,C=0.1,max_iter=2000)
    clf.fit(X_train, Y_train)
    
    accuracy.append(clf.score(X_test,Y_test))
    kappa.append(cohen_kappa_score(clf.predict(X_test),Y_test))
    print("Fold Complete")
print(f"Accuracy {sum(accuracy)/len(accuracy)}")
print(f"Kappa {sum(kappa)/len(kappa)}")

Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Accuracy 0.589376928833882
Kappa 0.42837814826881554


# Passive Agressive Classifier

In [31]:
from sklearn.linear_model import PassiveAggressiveClassifier

In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
clf = PassiveAggressiveClassifier()
clf.fit(tfidf.transform(X_train).toarray(),Y_train)
y_pred = clf.predict(tfidf.transform(X_test).toarray())
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.62      0.70       184
           1       0.54      0.70      0.61       478
           2       0.56      0.36      0.44       192
           3       0.73      0.45      0.56       150
           4       0.27      0.21      0.24       109
           5       0.00      0.00      0.00         6
           6       0.60      0.43      0.50        68
           7       0.18      0.42      0.25        48

    accuracy                           0.53      1235
   macro avg       0.46      0.40      0.41      1235
weighted avg       0.57      0.53      0.53      1235



In [33]:
accuracy = list()
kappa = list()

for i in range(10):
    train = pd.read_csv(f"train{i}.csv")
    test = pd.read_csv(f"test{i}.csv")

    X_train = tfidf.transform(train["event_result"]).toarray()
    X_test = tfidf.transform(test["event_result"]).toarray()

    Y_train = train["Primary code"].to_numpy()
    Y_test = test["Primary code"].to_numpy()
    
    clf = PassiveAggressiveClassifier()
    clf.fit(X_train, Y_train)
    
    accuracy.append(clf.score(X_test,Y_test))
    kappa.append(cohen_kappa_score(clf.predict(X_test),Y_test))
    print("Fold Complete")
print(f"Accuracy {sum(accuracy)/len(accuracy)}")
print(f"Kappa {sum(kappa)/len(kappa)}")

Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Fold Complete
Accuracy 0.5438781984727215
Kappa 0.40886635429413926
