In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import cohen_kappa_score
from nltk.tokenize import WordPunctTokenizer
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer


Preprocessing

In [3]:
train_path = './train.csv'
train_data = pd.read_csv(train_path)

test_path = './test.csv'
test_data = pd.read_csv(test_path)

train_data.event_result = train_data.event_result.astype(str)
test_data.event_result = test_data.event_result.astype(str)

train_data = train_data[train_data['Primary code '] != 'EM']
train_data = train_data[train_data['Primary code '] != 'TD']

test_data = test_data[test_data['Primary code '] != 'EM']
test_data = test_data[test_data['Primary code '] != 'TD']


In [4]:
train_text = train_data["event_result"].to_list()
test_text = test_data["event_result"].to_list()

In [5]:
train_text = [x.encode("ascii", "ignore").decode() for x in train_text]
test_text = [x.encode("ascii", "ignore").decode() for x in test_text]

Count vec

In [6]:
vectorizer = CountVectorizer(ngram_range=(1,2),tokenizer = WordPunctTokenizer().tokenize, max_features=10000)
x_train = vectorizer.fit_transform(train_text)
x_valid = vectorizer.transform(test_text)

In [7]:
le = preprocessing.LabelEncoder()
le.fit(train_data['Primary code '])
train_labels = le.transform(train_data['Primary code '])
valid_labels = le.transform(test_data['Primary code '])

In [8]:
train_labels

array([0, 0, 5, ..., 7, 4, 7])

Gaussian Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import tree

In [10]:
gnb = GaussianNB()
gnb.fit(x_train.toarray(), train_labels)

y_pred = gnb.predict(x_valid.toarray())
print(classification_report(valid_labels, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.48      0.50        73
           1       0.23      0.60      0.33        65
           2       0.02      0.25      0.04         8
           3       0.24      0.34      0.28        94
           4       0.59      0.33      0.42       197
           5       0.67      0.55      0.60       187
           6       0.42      0.27      0.33       151
           7       0.66      0.60      0.63       458

    accuracy                           0.48      1233
   macro avg       0.42      0.43      0.39      1233
weighted avg       0.55      0.48      0.50      1233



SVM

In [11]:
lin_svm = svm.LinearSVC()
lin_svm.fit(x_train.toarray(), train_labels)

y_pred = lin_svm.predict(x_valid.toarray())
print(classification_report(valid_labels, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.58      0.57        73
           1       0.55      0.57      0.56        65
           2       0.00      0.00      0.00         8
           3       0.42      0.33      0.37        94
           4       0.60      0.55      0.58       197
           5       0.70      0.77      0.73       187
           6       0.66      0.69      0.68       151
           7       0.72      0.74      0.73       458

    accuracy                           0.66      1233
   macro avg       0.53      0.53      0.53      1233
weighted avg       0.65      0.66      0.65      1233





Random Forest

In [12]:
rf = RandomForestClassifier()
rf.fit(x_train.toarray(), train_labels)

y_pred = rf.predict(x_valid.toarray())
print(classification_report(valid_labels, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.48      0.58        73
           1       0.62      0.52      0.57        65
           2       0.00      0.00      0.00         8
           3       0.81      0.14      0.24        94
           4       0.63      0.56      0.60       197
           5       0.68      0.76      0.72       187
           6       0.76      0.62      0.69       151
           7       0.66      0.88      0.76       458

    accuracy                           0.67      1233
   macro avg       0.61      0.50      0.52      1233
weighted avg       0.68      0.67      0.65      1233



  _warn_prf(average, modifier, msg_start, len(result))


Passive Aggressive

In [13]:
pac = PassiveAggressiveClassifier()
pac.fit(x_train.toarray(), train_labels)

y_pred = pac.predict(x_valid.toarray())
print(classification_report(valid_labels, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.52      0.52        73
           1       0.56      0.52      0.54        65
           2       0.00      0.00      0.00         8
           3       0.40      0.41      0.41        94
           4       0.58      0.57      0.57       197
           5       0.69      0.76      0.73       187
           6       0.66      0.68      0.67       151
           7       0.72      0.69      0.71       458

    accuracy                           0.64      1233
   macro avg       0.52      0.52      0.52      1233
weighted avg       0.64      0.64      0.64      1233



Decision Tree

In [14]:


dt = tree.DecisionTreeClassifier()
dt.fit(x_train.toarray(), train_labels)

y_pred = dt.predict(x_valid.toarray())
print(classification_report(valid_labels, y_pred))


              precision    recall  f1-score   support

           0       0.46      0.40      0.43        73
           1       0.45      0.51      0.48        65
           2       0.00      0.00      0.00         8
           3       0.44      0.38      0.41        94
           4       0.58      0.52      0.55       197
           5       0.63      0.68      0.65       187
           6       0.71      0.67      0.69       151
           7       0.68      0.74      0.71       458

    accuracy                           0.62      1233
   macro avg       0.49      0.49      0.49      1233
weighted avg       0.61      0.62      0.62      1233

