In [1]:
def accuracy_ratio(y, predictions):
    """Return the fraction of predictions that exactly match the true labels.

    Both arguments are compared position by position, so pandas index labels
    are ignored and plain Python lists work as well as arrays or Series.

    Parameters
    ----------
    y : array-like
        True labels.
    predictions : array-like
        Predicted labels, same shape as ``y``.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the inputs differ in shape or contain no labels.
    """
    y_true = np.asarray(y)
    y_pred = np.asarray(predictions)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y and predictions must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty input")
    # Comparing raw arrays keeps `==` elementwise; two plain lists would
    # otherwise collapse to a single bool and break the count.
    return float(np.mean(y_pred == y_true))

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the training scenarios and key every row by its scenario id.
df = pd.read_csv('data/Challenge1_Training_Scenarios.csv').set_index('scenario_id')

In [4]:
# Inputs are the scenario texts; targets are their danger levels.
X, y = df['scenario'], df['danger_level']

# Positional split (nominally 80/20): the first 404 rows train, the rest test.
split_at = 404
train_X, test_X = X.iloc[:split_at], X.iloc[split_at:]
train_y, test_y = y.iloc[:split_at], y.iloc[split_at:]

In [5]:
# create bows
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Bag-of-words over 1- to 3-grams, using NLTK's English stopword list unmodified.
english_stopwords = stopwords.words('english')
vect = CountVectorizer(ngram_range=(1, 3), stop_words=english_stopwords)

# The vocabulary is learned from the training texts only and reused for the test texts.
train_counts = vect.fit_transform(train_X)
test_counts = vect.transform(test_X)

In [6]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes on the n-gram counts.
clf_mnb = MultinomialNB().fit(train_counts, train_y)
pred = clf_mnb.predict(test_counts)

mnb_accuracy = accuracy_ratio(test_y, pred)
print(mnb_accuracy)

0.31


In [7]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the same n-gram counts.
clf_lr = LogisticRegression()
clf_lr.fit(train_counts, train_y)

# predict() already returns class labels, so they are compared as-is; rounding
# them first was redundant and silently cast the labels to float.
pred = clf_lr.predict(test_counts)
print(accuracy_ratio(test_y, pred))

0.35


In [8]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours (library defaults) on the n-gram counts.
clf_kn = KNeighborsClassifier().fit(train_counts, train_y)
pred = clf_kn.predict(test_counts)

knn_accuracy = accuracy_ratio(test_y, pred)
print(knn_accuracy)

0.19


In [9]:
from sklearn.svm import SVC
# Support vector classifier (library defaults) on the n-gram counts.
clf_svc = SVC().fit(train_counts, train_y)
pred = clf_svc.predict(test_counts)

svc_accuracy = accuracy_ratio(test_y, pred)
print(svc_accuracy)

0.26


In [10]:
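# Ideas to try next:
# - use word2vec-style embeddings to capture words such as "inside" / "outside"
# - use a linear regression (weight matrix) to get confidence levels for specific words
# - stemming
# - lemmatization
# - adjust the stopword list so that words like "in" / "out" are handled deliberately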

In [11]:
# import word embedding packages
from nltk.tokenize import word_tokenize, RegexpTokenizer
import gensim
from gensim.models import Word2Vec
import gensim.downloader
import string
# Load the pretrained GloVe vectors by name through gensim's downloader API.
GLOVE_MODEL_NAME = 'glove-wiki-gigaword-50'
glove_vectors = gensim.downloader.load(GLOVE_MODEL_NAME)

In [12]:
# Hand-picked anchor words. Only the negative list is turned into GloVe
# vectors here; the positive list stays as plain strings.
negative_anchor_words = ['inside', 'crowd', 'touch', 'alcohol', 'kids', 'travel', 'airplane', 'elderly', 'illness', 'group', 'sick']
valued_positive_words = ['outside', 'alone', 'home', 'sanitation']

# Same order as the word list above, one vector per word.
valued_negative_words = [glove_vectors[anchor] for anchor in negative_anchor_words]

In [13]:
# Tokenize each training scenario, drop punctuation tokens, and replace every
# remaining token with its GloVe vector (None when the token has no vector).
train_embedded_X = train_X.copy()
# Walk the Series' own index labels instead of a hard-coded counter, so the
# loop no longer assumes the scenario ids run 1, 2, 3, ... without gaps.
for scenario_id in train_embedded_X.index:
    tokens = [
        token
        for token in word_tokenize(train_embedded_X[scenario_id])
        # Substring test against string.punctuation, exactly as before.
        if token not in string.punctuation
    ]
    train_embedded_X[scenario_id] = [
        glove_vectors[token] if token in glove_vectors else None
        for token in tokens
    ]

In [14]:
# Tokenize each test scenario, drop punctuation tokens, and replace every
# remaining token with its GloVe vector (None when the token has no vector).
test_embedded_X = test_X.copy()
# Walk the Series' own index labels instead of a counter starting at 405, so
# the loop no longer depends on where the train/test split happens to fall.
for scenario_id in test_embedded_X.index:
    tokens = [
        token
        for token in word_tokenize(test_embedded_X[scenario_id])
        # Substring test against string.punctuation, exactly as before.
        if token not in string.punctuation
    ]
    test_embedded_X[scenario_id] = [
        glove_vectors[token] if token in glove_vectors else None
        for token in tokens
    ]

In [15]:
# Collapse every training scenario into a single feature: the sum, over its
# embedded tokens, of the squared Euclidean distance to the closest negative
# anchor vector, divided by the scenario's token count.
train_features = []
# .items() pairs each entry with its own label, so nothing here depends on the
# scenario ids starting at 1. It also fails loudly if this cell is re-run after
# train_embedded_X has already been replaced by the feature array below.
for scenario_id, entry in train_embedded_X.items():
    nearest_distances = [
        min(
            (np.sum(np.square(valued_word - word)) for valued_word in valued_negative_words),
            default=float('inf'),
        )
        for word in entry
        if word is not None  # tokens without an embedding add no distance
    ]
    # The denominator counts every token, including those without an embedding.
    train_features.append(sum(nearest_distances) / len(entry))

# One float column, the 2-D layout the classifiers expect.
train_embedded_X = np.array(train_features, dtype=float).reshape(-1, 1)

In [19]:
# Same feature for the test set: the sum, over a scenario's embedded tokens, of
# the squared Euclidean distance to the closest negative anchor vector,
# divided by the scenario's token count.
test_features = []
# .items() pairs each entry with its own label, so nothing here depends on the
# test ids starting at 405. It also fails loudly if this cell is re-run after
# test_embedded_X has already been replaced by the feature array below.
for scenario_id, entry in test_embedded_X.items():
    nearest_distances = [
        min(
            (np.sum(np.square(valued_word - word)) for valued_word in valued_negative_words),
            default=float('inf'),
        )
        for word in entry
        if word is not None  # tokens without an embedding add no distance
    ]
    # The denominator counts every token, including those without an embedding.
    test_features.append(sum(nearest_distances) / len(entry))

# One float column, the 2-D layout the classifiers expect.
test_embedded_X = np.array(test_features, dtype=float).reshape(-1, 1)

In [25]:
# Multinomial naive Bayes trained on the single embedding-distance feature.
embedded_clf_mnb = MultinomialNB().fit(train_embedded_X, train_y)
pred = embedded_clf_mnb.predict(test_embedded_X)

embedded_mnb_accuracy = accuracy_ratio(test_y, pred)
print(embedded_mnb_accuracy)

0.19


In [26]:
# Logistic regression trained on the single embedding-distance feature.
embedded_clf_lr = LogisticRegression().fit(train_embedded_X, train_y)
pred = embedded_clf_lr.predict(test_embedded_X)

embedded_lr_accuracy = accuracy_ratio(test_y, pred)
print(embedded_lr_accuracy)

0.16


In [27]:
# Next steps: find a better metric than the averaged total distance, or combine this feature with the n-gram features.