In [1]:
def accuracy_ratio(y, predictions):
    zipped = zip(y, predictions)
    total_points = 0
    for curr in zipped:
        if curr[0] == curr[1]:
            total_points += 2
        elif curr[0] == curr[1]+1 or curr[0] == curr[1]-1:
            total_points += 1
        else:
            total_points += 0

    return (total_points/(len(y)*2))

In [2]:
def view_predictions(X, y, pred):
    visualize = pd.DataFrame(list(zip(list(X), y, list(pred))))
    visualize['diff'] = list(np.subtract(y, pred))
    visualize = visualize.sort_values(by = 'diff')
    pd.set_option("display.max_rows", None)
    print(visualize.head())

In [3]:
import pandas as pd
import numpy as np

# import word embedding packages
from nltk.tokenize import word_tokenize, RegexpTokenizer
import gensim
from gensim.models import Word2Vec
import gensim.downloader
import string
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

In [4]:
# data loading
df = pd.read_csv('data/Challenge1_Training_Scenarios.csv')
df.set_index('scenario_id', inplace=True)

In [5]:
# grab X and y values and split 80/20 train/test
X = df['scenario']
y = df['danger_level']

train_X = X[:404]
train_y = y[:404]

test_X = X[404:]
test_y = y[404:]

In [6]:
# 1st approach

In [7]:
# create bows
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

vect = TfidfVectorizer(stop_words=stopwords.words('english'), ngram_range=(1,3))
train_counts = vect.fit_transform(train_X)
test_counts = vect.transform(test_X)

In [8]:
from sklearn.naive_bayes import MultinomialNB
bow_mnb = MultinomialNB()
bow_mnb.fit(train_counts, train_y)

pred_bow_mnb = bow_mnb.predict(test_counts)
print(accuracy_ratio(test_y, pred_bow_mnb))

0.4


In [9]:
from sklearn.linear_model import LogisticRegression
bow_lr = LogisticRegression()
bow_lr.fit(train_counts, train_y)

pred_bow_lr = np.rint(bow_lr.predict(test_counts))
print(accuracy_ratio(test_y, pred_bow_lr))

0.435


In [10]:
from sklearn.neighbors import KNeighborsClassifier
bow_kn = KNeighborsClassifier()
bow_kn.fit(train_counts, train_y)

pred_bow_kn = bow_kn.predict(test_counts)
print(accuracy_ratio(test_y, pred_bow_kn))

0.37


In [11]:
from sklearn.svm import SVC
bow_svc = SVC()
bow_svc.fit(train_counts, train_y)

pred_bow_svc = bow_svc.predict(test_counts)
print(accuracy_ratio(test_y, pred_bow_svc))

0.335


In [12]:
view_predictions(test_X, test_y, pred_bow_svc)

                                                    0  1  2  diff
36  A 52 year old man goes to a farmer's market. I...  1  6    -5
95  A 58 year old woman with blood clots goes to e...  1  6    -5
4   A 20 year old homeless man went to the homeles...  1  6    -5
11  A 69 year old man plays tennis with his partne...  1  6    -5
55  An 29 year old woman with diabetes goes skiing...  1  6    -5


In [13]:
# 2nd approach

In [14]:
def tokenize_remove_punctuation(input):
    input = input.lower()
    input = word_tokenize(input)
    input = list(filter(lambda token: token not in string.punctuation, input))
    return input

In [15]:
# grab embeddings for valued words and format
valued_negative_words = ['elderly', 'mask', 'social', 'coronavirus', 'home', 'work', 'outside']

for i, word in enumerate(valued_negative_words):
    valued_negative_words[i] = glove_vectors[word]

In [16]:
# transform to embeddings for each entry
train_embedded_X = train_X.copy()
for i, entry in enumerate(train_embedded_X, start=1):
    train_embedded_X[i] = tokenize_remove_punctuation(train_embedded_X[i])

    if 'covid-19' in train_embedded_X[i]:
        train_embedded_X[i][train_embedded_X[i].index('covid-19')] = 'coronavirus'
    if 'covid' in train_embedded_X[i]:
        train_embedded_X[i][train_embedded_X[i].index('covid')] = 'coronavirus'

    for j, word in enumerate(train_embedded_X[i]):
        if train_embedded_X[i][j] in glove_vectors:
            train_embedded_X[i][j] = glove_vectors[train_embedded_X[i][j]]
        else:
            train_embedded_X[i][j] = None

            
test_embedded_X = test_X.copy()
for i, entry in enumerate(test_embedded_X, start=405):
    test_embedded_X[i] = tokenize_remove_punctuation(test_embedded_X[i])
    
    if 'covid-19' in test_embedded_X[i]:
        test_embedded_X[i][test_embedded_X[i].index('covid-19')] = 'coronavirus'
    if 'covid' in test_embedded_X[i]:
        test_embedded_X[i][test_embedded_X[i].index('covid')] = 'coronavirus'
    
    for j, word in enumerate(test_embedded_X[i]):
        if test_embedded_X[i][j] in glove_vectors:
            test_embedded_X[i][j] = glove_vectors[test_embedded_X[i][j]]
        else:
            test_embedded_X[i][j] = None

In [17]:
# vector of closest distances to each valued word approach
for i, entry in enumerate(train_embedded_X, start=1):
    curr_min_distance_vec = np.full(len(valued_negative_words), float('inf'))

    # for each word
    for j, word in enumerate(train_embedded_X[i]):
        if word is not None:
            # loop through valued words
            for k, valued_word in enumerate(valued_negative_words):
                curr_distance = np.sum(np.square(valued_word - word))
                if curr_distance < curr_min_distance_vec[k]:
                    curr_min_distance_vec[k] = curr_distance
    train_embedded_X[i] = curr_min_distance_vec.copy()

train_embedded_X = list(train_embedded_X)

for i, entry in enumerate(test_embedded_X, start=405):
    curr_min_distance_vec = np.full(len(valued_negative_words), float('inf'))

    # for each word
    for j, word in enumerate(test_embedded_X[i]):
        if word is not None:
            # loop through valued words
            for k, valued_word in enumerate(valued_negative_words):
                curr_distance = np.sum(np.square(valued_word - word))
                if curr_distance < curr_min_distance_vec[k]:
                    curr_min_distance_vec[k] = curr_distance
    test_embedded_X[i] = curr_min_distance_vec.copy()

test_embedded_X = list(test_embedded_X)

In [18]:
embedded_mnb = MultinomialNB()
embedded_mnb.fit(train_embedded_X, train_y)
pred_embedded_mnb = embedded_mnb.predict(test_embedded_X)
print(accuracy_ratio(test_y, pred_embedded_mnb))

0.4


In [22]:
from sklearn.linear_model import LinearRegression
embedded_lr = LinearRegression()
embedded_lr.fit(train_embedded_X, train_y)
pred_embedded_lr = embedded_lr.predict(test_embedded_X)
print(accuracy_ratio(test_y, pred_embedded_lr))

0.0


In [23]:
pred_embedded_lr

array([3.67643496, 2.7925458 , 3.42710503, 2.86616494, 3.96243855,
       2.93841361, 4.55706271, 3.69348878, 2.41440174, 3.89641868,
       2.79309774, 3.62734519, 3.45383744, 3.15670909, 3.66807691,
       2.87756498, 3.330772  , 3.76304598, 2.61357825, 2.79501261,
       3.56786778, 3.34173292, 4.15458084, 3.11201766, 3.07843819,
       3.01378609, 3.4444302 , 1.82534976, 3.5752202 , 3.69449868,
       3.70226748, 3.05864606, 4.2302198 , 4.72838476, 2.86869873,
       3.40357445, 3.04338623, 3.75071324, 3.0765811 , 3.05263395,
       3.27462303, 2.70756476, 2.90655509, 3.63332865, 4.04299814,
       3.94373033, 3.76766954, 4.28121327, 2.76090658, 3.71365595,
       3.74628464, 3.51316902, 3.75905777, 3.81288495, 3.44138843,
       3.41072533, 3.23937867, 3.06730061, 3.33571318, 4.04393837,
       3.98011626, 2.77910191, 3.90103739, 3.66311531, 3.47624922,
       3.34366594, 3.65090252, 3.91031706, 4.45020344, 4.18591638,
       2.60008105, 3.65993739, 3.88483474, 3.77511907, 3.99827

In [20]:
embedded_svc = SVC()
embedded_svc.fit(train_embedded_X, train_y)
pred_embedded_svc = embedded_svc.predict(test_embedded_X)
print(accuracy_ratio(test_y, pred_embedded_svc))

0.41


In [None]:
# 3rd approach