In [1]:
def accuracy_ratio(y, predictions):
    """Score predictions, giving partial credit for near misses.

    Each (label, prediction) pair earns 2 points for an exact match, 1 point
    when the two values differ by exactly 1, and 0 points otherwise. The sum
    is divided by the maximum attainable score, ``2 * len(y)``.

    Args:
        y: Sized iterable of true labels (numeric).
        predictions: Iterable of predicted labels, paired with ``y`` by
            position. Pairs are formed with ``zip``, so surplus items on
            either side are ignored, while the denominator always uses
            ``len(y)``.

    Returns:
        The score as a float between 0 and 1. Returns 0.0 when ``y`` is
        empty instead of raising ``ZeroDivisionError``.
    """
    max_points = len(y) * 2
    if max_points == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    total_points = 0
    for actual, predicted in zip(y, predictions):
        if actual == predicted:
            total_points += 2
        elif actual == predicted + 1 or actual == predicted - 1:
            # Off by one level in either direction earns half credit.
            total_points += 1

    return total_points / max_points

In [2]:
def tokenize_remove_punctuation(input):
    """Lowercase ``input``, tokenize it, and drop punctuation-only tokens.

    Args:
        input: The text to tokenize (a string).

    Returns:
        A list of lowercase tokens from ``word_tokenize`` with every token
        that consists solely of ``string.punctuation`` characters removed.
        Tokens that merely contain punctuation (e.g. ``covid-19``) are kept.
    """
    tokens = word_tokenize(input.lower())
    punctuation_chars = set(string.punctuation)
    # Check character by character: a substring test against
    # string.punctuation lets multi-character punctuation tokens such as
    # "..." or doubled quote marks slip through.
    return [token for token in tokens if not punctuation_chars.issuperset(token)]

In [3]:
import pandas as pd
import numpy as np

# import word embedding packages
from nltk.tokenize import word_tokenize, RegexpTokenizer
import gensim
from gensim.models import Word2Vec
import gensim.downloader
import string
# Load pretrained word vectors through gensim's downloader API. The model name
# suggests 50-dimensional GloVe vectors (Wikipedia + Gigaword) — TODO confirm.
# NOTE(review): this may trigger a network download if the model is not cached locally.
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

In [4]:
# Read the training scenarios and key every row by its scenario_id.
df = pd.read_csv('data/Challenge1_Training_Scenarios.csv').set_index('scenario_id')

In [5]:
# Features are the scenario texts; targets are the danger levels.
X, y = df['scenario'], df['danger_level']

# Positional split: the first 404 rows train, the remainder is held out
# (described as an 80/20 split in the original note).
train_X, test_X = X.iloc[:404], X.iloc[404:]
train_y, test_y = y.iloc[:404], y.iloc[404:]

In [6]:
# Build TF-IDF features over 1- to 3-word n-grams, with English stop words removed.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

vect = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words=stopwords.words('english'),
)
# The vectorizer is fitted on the training split only; the held-out split is
# transformed with that fitted state.
train_counts = vect.fit_transform(train_X)
test_counts = vect.transform(test_X)

In [7]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes baseline on the TF-IDF features.
clf_mnb = MultinomialNB().fit(train_counts, train_y)

pred = clf_mnb.predict(test_counts)
print(accuracy_ratio(test_y, pred))

0.4


In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline on the TF-IDF features.
clf_lr = LogisticRegression().fit(train_counts, train_y)

# Predictions are rounded to the nearest integer value before scoring.
pred = np.rint(clf_lr.predict(test_counts))
print(accuracy_ratio(test_y, pred))

0.435


In [9]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours baseline (library defaults) on the TF-IDF features.
clf_kn = KNeighborsClassifier().fit(train_counts, train_y)

pred = clf_kn.predict(test_counts)
print(accuracy_ratio(test_y, pred))

0.37


In [10]:
from sklearn.svm import SVC

# Support-vector classifier baseline (library defaults) on the TF-IDF features.
clf_svc = SVC().fit(train_counts, train_y)

pred = clf_svc.predict(test_counts)
print(accuracy_ratio(test_y, pred))

0.335


In [11]:
# VISUALIZE DATA STUFF
# Error inspection: build a table of (scenario text, true label, prediction).
# No column names are given, so the columns are labelled 0, 1 and 2.
# NOTE(review): `pred` is whatever the most recently executed model cell assigned —
# presumably the SVC predictions; confirm before interpreting.
visualize = pd.DataFrame(list(zip(list(test_X), test_y, list(pred))))
# Signed error (true label minus prediction). list() discards test_y's index so the
# values are assigned by position to this frame's fresh 0..n-1 index.
visualize['diff'] = list(np.subtract(test_y, pred))
visualize = visualize.sort_values(by = 'diff')
# Removes the row display limit for the whole session, not just this cell.
pd.set_option("display.max_rows", None)
# NOTE(review): this selection (scenarios with diff == 3) is not the last expression
# in the cell and is not assigned, so its result is discarded.
visualize[0].loc[visualize['diff'] == 3]
# Show the scenario text of the row labelled 47 (labels are unaffected by the sort).
visualize[0][47]

'The front desk nurse at a hospital in central Texas checks in a man who has a confirmed case of COVID-19. She keeps 6 feet of distance from him and wears a surgical mask. '

In [12]:
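# 2nd approach: ideas to try
# - word2vec embeddings to capture words such as "inside" / "outside"
# - linear regression matrix to get confidence levels for specific words
# - stemming
# - lemmatization
# - stop-word handling for "in" / "out" etc. (TODO: clarify whether these words should be removed or kept)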

In [14]:
# Look up the embedding of every hand-picked keyword; the resulting list holds
# one vector per keyword, in the order written below.
valued_negative_words = [
    glove_vectors[keyword]
    for keyword in (
        'inside', 'crowd', 'touch', 'alcohol', 'kids', 'travel',
        'airplane', 'elderly', 'illness', 'group', 'sick', 'coronavirus',
    )
]

In [15]:
# transform to embeddings for each entry
def _embed_scenario(text):
    """Turn one scenario text into a list of per-token embeddings.

    The text is tokenized with ``tokenize_remove_punctuation``; every
    'covid-19' / 'covid' token is rewritten to 'coronavirus' (all occurrences,
    not just the first); then each token is replaced by its vector from
    ``glove_vectors``, or by ``None`` when the token is not in the vocabulary.

    Args:
        text: The scenario description (a string).

    Returns:
        A list with one entry per token: an embedding vector or ``None``.
    """
    tokens = [
        'coronavirus' if token in ('covid-19', 'covid') else token
        for token in tokenize_remove_punctuation(text)
    ]
    return [glove_vectors[token] if token in glove_vectors else None for token in tokens]


# Series.apply keeps the original index, so any later label-based access to
# these Series still works. Train and test share one code path, and nothing
# here depends on hard-coded index labels.
train_embedded_X = train_X.apply(_embed_scenario)
test_embedded_X = test_X.apply(_embed_scenario)

In [43]:
# vector of closest distances to each valued word approach
def _min_squared_distances(token_vectors, anchor_vectors):
    """Compute, per anchor, the smallest squared distance to any token.

    Args:
        token_vectors: Iterable of embedding vectors for one scenario;
            ``None`` entries (tokens without an embedding) are skipped.
        anchor_vectors: Sequence of keyword embedding vectors.

    Returns:
        A float array with one value per anchor: the minimum squared
        Euclidean distance between that anchor and any token vector. A value
        stays ``inf`` when the scenario has no token with an embedding.
    """
    min_distances = np.full(len(anchor_vectors), float('inf'))
    for token_vector in token_vectors:
        if token_vector is None:
            continue
        for k, anchor in enumerate(anchor_vectors):
            distance = np.sum(np.square(anchor - token_vector))
            if distance < min_distances[k]:
                min_distances[k] = distance
    return min_distances


# Iterating the Series positionally removes the dependence on the hard-coded
# index labels (1 and 405) and lets train and test share one code path.
# NOTE: this cell overwrites the per-token embeddings with feature vectors, so
# it must not be re-run without first re-running the embedding cell.
train_embedded_X = [
    _min_squared_distances(token_vectors, valued_negative_words)
    for token_vectors in train_embedded_X
]

test_embedded_X = [
    _min_squared_distances(token_vectors, valued_negative_words)
    for token_vectors in test_embedded_X
]

In [44]:
# Multinomial naive Bayes on the keyword-distance features.
embedded_clf_mnb = MultinomialNB().fit(train_embedded_X, train_y)
pred = embedded_clf_mnb.predict(test_embedded_X)
print(accuracy_ratio(test_y, pred))

0.24


In [45]:
# simple lr on embeddings
# The default iteration budget was exhausted on these unscaled distance
# features (the recorded output shows the solver's iteration-limit warning),
# so give the solver more iterations. If the warning persists, scale the
# features as the warning recommends.
embedded_clf_lr = LogisticRegression(max_iter=1000)
embedded_clf_lr.fit(train_embedded_X, train_y)
pred = embedded_clf_lr.predict(test_embedded_X)
print(accuracy_ratio(test_y, pred))

0.285


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
# Multinomial naive Bayes fitted on the embedding-derived features.
embedded_clf_mnb = MultinomialNB().fit(train_embedded_X, train_y)
pred = embedded_clf_mnb.predict(test_embedded_X)
print(accuracy_ratio(test_y, pred))

0.24


In [47]:
# simple lr on embeddings
# The recorded run stopped at the solver's default iteration limit without
# converging, so allow more iterations. If the warning persists, scale the
# features as the warning recommends.
embedded_clf_lr = LogisticRegression(max_iter=1000)
embedded_clf_lr.fit(train_embedded_X, train_y)
pred = embedded_clf_lr.predict(test_embedded_X)
print(accuracy_ratio(test_y, pred))

0.285


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
# Support-vector classifier (library defaults) on the embedding-derived features.
embedded_clf_svc = SVC().fit(train_embedded_X, train_y)
pred = embedded_clf_svc.predict(test_embedded_X)
print(accuracy_ratio(test_y, pred))

0.295


In [None]:
# 3rd approach