# NLP pipeline

In [8]:
import gzip
import urllib
from collections import defaultdict
import scipy
import scipy.optimize
import numpy as np
import random
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import mean_squared_error
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model
import ast
import math

In [9]:
rawdata = [eval(l) for l in gzip.open('train_Category.json.gz', 'rt', encoding="utf8")]

In [10]:
data = rawdata
data[0]

{'userID': 'u74382925',
 'genre': 'Adventure',
 'early_access': False,
 'reviewID': 'r75487422',
 'hours': 4.1,
 'text': 'Short Review:\nA good starting chapter for this series, despite the main character being annoying (for now) and a short length. The story is good and actually gets more interesting. Worth the try.\nLong Review:\nBlackwell Legacy is the first on the series of (supposedly) 5 games that talks about the main protagonist, Rosangela Blackwell, as being a so called Medium, and in this first chapter we get to know how her story will start and how she will meet her adventure companion Joey...and really, that\'s really all for for now and that\'s not a bad thing, because in a way this game wants to show how hard her new job is, and that she cannot escape her destiny as a medium.\nMy biggest complain for this chapter, except the short length, it\'s the main protagonist being a "bit" too annoying to be likeable, and most of her dialogues will always be about complaining or just

In [11]:
unigramCount = defaultdict(int)
bigramCount = defaultdict(int)
totalUnigrams = 0
totalBigrams = 0

In [12]:
punct = string.punctuation
stemmer = PorterStemmer()

In [13]:
for d in data:
    t = d['text']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)] # non-punct characters
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    
    if len(words) == 0:
        continue
    
    totalUnigrams += 1
    unigramCount[words[0]] += 1
    
    for i in range(len(words) - 1):
        #w = stemmer.stem(w)
        totalUnigrams += 1
        unigramCount[words[i+1]] += 1
        
        bigram = words[i] + " " + words[i+1]
        totalBigrams += 1
        bigramCount[bigram] += 1

In [14]:
totalUnigrams, totalBigrams

(12161378, 11987793)

In [15]:
len(unigramCount), len(bigramCount)

(160503, 2080628)

In [16]:
unigramPairs = [(unigramCount[w], w) for w in unigramCount]
bigramPairs = [(bigramCount[w], w) for w in bigramCount]

In [17]:
unigramPairs.sort()
unigramPairs.reverse()
bigramPairs.sort()
bigramPairs.reverse()

In [18]:
unigrams = [w[1] for w in unigramPairs[:1000]]
bigrams = [w[1] for w in bigramPairs[:1000]]

In [19]:
unigramId = dict(zip(unigrams, range(len(unigrams))))
unigramSet = set(unigrams)
bigramId = dict(zip(bigrams, range(len(bigrams))))
bigramSet = set(bigrams)

### Question 1

In [20]:
len(bigramCount)

2080628

In [21]:
bigramPairs[:5]

[(77409, 'this game'),
 (71358, 'the game'),
 (56774, 'of the'),
 (34040, 'if you'),
 (33628, 'in the')]

### Question 2

In [22]:
def bigramFeature(datum):
    feat = [0]*len(bigramSet)
    t = datum['text']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)] # non-punct characters
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    
    if len(words) != 0:
        for i in range(len(words) - 1):
            bigram = words[i] + words[i+1]
            if not (bigram in bigramSet): continue
            feat[bigramId[bigram]] += 1

    feat.append(1)
    return feat

In [23]:
X = [bigramFeature(d) for d in data]
y = [math.log(d['hours'] + 1, 2) for d in data]

In [24]:
model = linear_model.LinearRegression()
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
testdata = [eval(l) for l in gzip.open('test_Category.json.gz', 'rt', encoding="utf8")]

In [27]:
X_test = [bigramFeature(d) for d in data]
y_test = [math.log(d['hours'] + 1, 2) for d in data]

In [28]:
y_pred = model.predict(X_test)

In [29]:
mean_squared_error(y_test, y_pred)

5.304686532846429

### Question 3

Since the top unigrams are more common than the top bigrams, I decided to simply combine the top 500 unigrams and top 500 bigrams.

In [30]:
both = unigrams[:500] + bigrams[:500]

In [31]:
bothId = dict(zip(both, range(len(both))))
bothSet = set(both)

In [32]:
def bothFeature(datum):
    feat = [0]*len(bothSet)
    t = datum['text']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)] # non-punct characters
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    
    if len(words) != 0:
        if words[0] in bothSet:
            feat[bothId[words[0]]] += 1
        
        for i in range(len(words) - 1):
            if words[i+1] in bothSet:
                feat[bothId[words[i+1]]] += 1
            
            bigram = words[i] + words[i+1]
            if bigram in bothSet:
                feat[bothId[bigram]] += 1

    feat.append(1)
    return feat

In [33]:
X = [bothFeature(d) for d in data]

In [34]:
model = linear_model.LinearRegression()
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [35]:
X_test = [bothFeature(d) for d in data]

In [36]:
y_pred = model.predict(X_test)

In [37]:
mean_squared_error(y_test, y_pred)

4.800181543286911

### Question 4

In [38]:
def wordsFromDoc(doc):
    doc = doc.lower()
    doc = [c for c in doc if not (c in punct)]
    doc = ''.join(doc)
    
    return doc.strip().split()

In [39]:
def TF(term, doc):
    words = wordsFromDoc(doc)
    count = 0
    
    for w in words:
        if w == term:
            count += 1
            
    return count

In [40]:
def IDF(term, Docs):
    N = len(Docs)
    count = 0
    
    for doc in Docs:
        words = wordsFromDoc(doc)
        if term in words:
            count += 1
            
    return math.log(N / count, 10)

In [41]:
Docs = [d['text'] for d in data]

In [42]:
queries = {'destiny': None, 'annoying': None, 'likeable': None, 'chapter': None, 'interesting': None}

print("idf scores")
for query in queries:
    idf = IDF(query, Docs)
    queries[query] = idf
    print(f"{query}: {idf}")

idf scores
destiny: 3.1159332503214863
annoying: 1.7917122401967747
likeable: 3.0786851929018573
chapter: 2.2980621402742463
interesting: 1.360969104250146


In [43]:
rID = 'r75487422'
review = [d for d in data if d['reviewID'] == rID][0]

In [44]:
print("tf-idf scores")
for query in queries:
    tfidf = TF(query, review['text']) * queries[query]
    print(f"{query}: {tfidf}")

tf-idf scores
destiny: 3.1159332503214863
annoying: 3.5834244803935493
likeable: 6.1573703858037145
chapter: 6.894186420822739
interesting: 2.721938208500292


### Question 5

In [45]:
idfCounts = defaultdict(int)

for d in data:
    t = d['text']
    words = set(wordsFromDoc(t))
    
    for w in words:
        if w in unigramSet:
            idfCounts[w] += 1

In [46]:
idfs = defaultdict(float)

N = len(data)
for w in unigrams:
    idfs[w] = math.log(N / idfCounts[w], 10)

In [47]:
idfs['annoying']

1.7917122401967747

In [48]:
def feature(datum):
    feat = [0]*len(unigramSet)
    doc = datum['text']
    words = wordsFromDoc(doc)
    
    wordCounts = defaultdict(int)
    for w in words:
        if w in unigramSet:
            wordCounts[w] += 1
            
    for i in range(len(unigrams)):
        w = unigrams[i]
        feat[i] = wordCounts[w] * idfs[w]
    
    feat.append(1)
    return feat

In [49]:
feat = feature(review)

In [50]:
feat[unigrams.index('annoying')]

3.5834244803935493

In [51]:
X = [feature(d) for d in data]

In [52]:
model = linear_model.LinearRegression()
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
X_test = [feature(d) for d in data]

In [None]:
y_pred = model.predict(X_test)

In [None]:
mean_squared_error(y_test, y_pred)

### Question 6

In [None]:
def cosine(A, B):
    if len(A) != len(B):
        return 0
    
    dot = 0
    A_squared = 0
    B_squared = 0
    
    for i in range(len(A) - 1): # Subtract 1 to ignore the 1 constant
        dot += A[i] * B[i]
        A_squared += A[i] ** 2
        B_squared += B[i] ** 2
    
    A_mag = math.sqrt(A_squared)
    B_mag = math.sqrt(B_squared)
    denom = A_mag * B_mag
    
    if denom == 0:
        return 0
    
    return dot / denom

In [None]:
max_sim = -1
index = -1

review_vect = feature(review)
review_index = data.index(review)

for i in range(len(X)):
    if i == review_index:
        continue
    
    sim = cosine(review_vect, X[i])
    if sim > max_sim:
        max_sim = sim
        index = i

index, max_sim

In [None]:
data[index]['reviewID']

### Question 7

In [None]:
data = rawdata
random.shuffle(data)
data = data[:30000]

y = [math.log(d['hours'] + 1, 2) for d in data]
y_train = y[:10000]
y_valid = y[10000:20000]
y_test = y[20000:]

In [None]:
unigramCount = defaultdict(int)
bigramCount = defaultdict(int)
totalUnigrams = 0
totalBigrams = 0

In [None]:
punct = string.punctuation

In [None]:
for d in data:
    t = d['text']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)]
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    
    if len(words) == 0:
        continue
    
    totalUnigrams += 1
    unigramCount[words[0]] += 1
    
    for i in range(len(words) - 1):
        #w = stemmer.stem(w)
        totalUnigrams += 1
        unigramCount[words[i+1]] += 1
        
        bigram = words[i] + " " + words[i+1]
        totalBigrams += 1
        bigramCount[bigram] += 1

In [None]:
unigramPairs = [(unigramCount[w], w) for w in unigramCount]
bigramPairs = [(bigramCount[w], w) for w in bigramCount]

In [None]:
unigramPairs.sort()
unigramPairs.reverse()
bigramPairs.sort()
bigramPairs.reverse()

In [None]:
# bigramPairs[990:1000]

In [None]:
unigrams = [w[1] for w in unigramPairs[:1000]]
bigrams = [w[1] for w in bigramPairs[:1000]]

In [None]:
unigramId = dict(zip(unigrams, range(len(unigrams))))
unigramSet = set(unigrams)
bigramId = dict(zip(bigrams, range(len(bigrams))))
bigramSet = set(bigrams)

In [None]:
unigramCount = defaultdict(int)
bigramCount = defaultdict(int)
totalUnigrams = 0
totalBigrams = 0

In [None]:
for d in data:
    t = d['text']
    t = t.lower() # lowercase string
    t = [c if (c not in punct) else f' {c} ' for c in t]
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    
    if len(words) == 0:
        continue
    
    totalUnigrams += 1
    unigramCount[words[0]] += 1
    
    for i in range(len(words) - 1):
        #w = stemmer.stem(w)
        totalUnigrams += 1
        unigramCount[words[i+1]] += 1
        
        bigram = words[i] + " " + words[i+1]
        totalBigrams += 1
        bigramCount[bigram] += 1

In [None]:
unigramPairs = [(unigramCount[w], w) for w in unigramCount]
bigramPairs = [(bigramCount[w], w) for w in bigramCount]

In [None]:
unigramPairs.sort()
unigramPairs.reverse()
bigramPairs.sort()
bigramPairs.reverse()

In [None]:
unigramsPunc = [w[1] for w in unigramPairs[:1000]]
bigramsPunc = [w[1] for w in bigramPairs[:1000]]

In [None]:
unigramPuncId = dict(zip(unigramsPunc, range(len(unigramsPunc))))
unigramPuncSet = set(unigramsPunc)
bigramPuncId = dict(zip(bigramsPunc, range(len(bigramsPunc))))
bigramPuncSet = set(bigramsPunc)

In [None]:
idfUniCounts = defaultdict(int)
idfBiCounts = defaultdict(int)

for d in data:
    t = d['text']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)] # non-punct characters
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    
    if len(words) != 0:
        usedWords = set()
        if words[0] in unigramSet:
            idfUniCounts[words[0]] += 1
        usedWords.add(words[0])
        
        for i in range(len(words) - 1):
            if words[i+1] not in usedWords and words[i+1] in unigramSet:
                idfUniCounts[words[i+1]] += 1
                
            bi = words[i] + " " + words[i+1]
            if bi not in usedWords and bi in bigramSet:
                idfBiCounts[bi] += 1
            
            usedWords.add(words[i+1])
            usedWords.add(bi)

In [None]:
idfUniCountsPunc = defaultdict(int)
idfBiCountsPunc = defaultdict(int)

for d in data:
    t = d['text']
    t = t.lower() # lowercase string
    t = [c if (c not in punct) else f' {c} ' for c in t]
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    
    if len(words) != 0:
        usedWords = set()
        if words[0] in unigramPuncSet:
            idfUniCountsPunc[words[0]] += 1
        usedWords.add(words[0])
        
        for i in range(len(words) - 1):
            if words[i+1] not in usedWords and words[i+1] in unigramPuncSet:
                idfUniCountsPunc[words[i+1]] += 1
                
            bi = words[i] + " " + words[i+1]
            if bi not in usedWords and bi in bigramPuncSet:
                idfBiCountsPunc[bi] += 1

            usedWords.add(words[i+1])
            usedWords.add(bi)

In [None]:
idfs_uni = defaultdict(float)
idfs_bi = defaultdict(float)

N = len(data)
for w in unigrams:
    idfs_uni[w] = math.log(N / idfUniCounts[w], 10)

for w in bigrams:
    idfs_bi[w] = math.log(N / idfBiCounts[w], 10)

In [None]:
idfs_uni_punc = defaultdict(float)
idfs_bi_punc = defaultdict(float)

N = len(data)
for w in unigramsPunc:
    idfs_uni_punc[w] = math.log(N / idfUniCountsPunc[w], 10)

for w in bigramsPunc:
    idfs_bi_punc[w] = math.log(N / idfBiCountsPunc[w], 10)

In [None]:
def featurePuncOn(datum, unigram, tfidf):
    feat = [0]*1000
    t = datum['text']
    t = t.lower() # lowercase string
    t = [c if (c not in punct) else f' {c} ' for c in t]
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes

    wordCounts = defaultdict(int)
    if unigram and not tfidf:
        for w in words:
            if w in unigramPuncSet:
                feat[unigramPuncId[w]] += 1
                
    elif unigram and tfidf:
        for w in words:
            if w in unigramPuncSet:
                wordCounts[w] += 1
                
        for w in unigramsPunc:
            feat[unigramPuncId[w]] = wordCounts[w] * idfs_uni_punc[w]
                
    elif not unigram and not tfidf:
        for i in range(len(words) - 1):
            bi = words[i] + " " + words[i+1]
            if bi in bigramPuncSet:
                feat[bigramPuncId[bi]] += 1
    else:
        for i in range(len(words) - 1):
            bi = words[i] + " " + words[i+1]
            if bi in bigramPuncSet:
                wordCounts[bi] += 1
                
        for w in bigramsPunc:
            feat[bigramPuncId[w]] = wordCounts[w] * idfs_bi_punc[w]

    feat.append(1)
    return feat

In [None]:
sum(featurePuncOn(data[0], False, True))

In [None]:
def featurePuncOff(datum, unigram, tfidf):
    feat = [0]*1000
    t = datum['text']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)]
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes

    wordCounts = defaultdict(int)
    if unigram and not tfidf:
        for w in words:
            if w in unigramSet:
                feat[unigramId[w]] += 1
                
    elif unigram and tfidf:
        for w in words:
            if w in unigramSet:
                wordCounts[w] += 1
                
        for w in unigrams:
            feat[unigramId[w]] = wordCounts[w] * idfs_uni[w]
                
    elif not unigram and not tfidf:
        for i in range(len(words) - 1):
            bi = words[i] + " " + words[i+1]
            if bi in bigramSet:
                feat[bigramId[bi]] += 1
    else:
        for i in range(len(words) - 1):
            bi = words[i] + " " + words[i+1]
            if bi in bigramSet:
                wordCounts[bi] += 1
                
        for w in bigrams:
            feat[bigramId[w]] = wordCounts[w] * idfs_bi[w]

    feat.append(1)
    return feat

In [None]:
X_labels = []
X_train_list = []

for uni in [True, False]:
    for tf in [True, False]:
        for pun in [True, False]:
            label = ""
            label += "Uni-" if uni else "Bi-"
            label += "Punct-" if pun else "NoPunct-"
            label += "TFIDF" if tf else "Counts"
            X_labels.append(label)
            new_X = []
            if pun:
                new_X = [featurePuncOn(d, uni, tf) for d in data]
            else:
                new_X = [featurePuncOff(d, uni, tf) for d in data]
            X_train_list.append(new_X)

In [None]:
X_pairs = zip(X_labels, X_train_list)

In [None]:
regs = [0.01, 0.1, 1, 10, 100]
for X_name, X in X_pairs:
    X_train = X[:10000]
    X_valid = X[10000:20000]
    X_test = X[20000:]
    
    MSEs = []
    testMSEs = []
    for c in regs:
        model = linear_model.Ridge(c, fit_intercept=False)
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_valid)
        MSEs.append(mean_squared_error(y_valid, y_pred))
        
        y_pred = model.predict(X_test)
        testMSEs.append(mean_squared_error(y_test, y_pred))
        
    bestIndex = MSEs.index(min(MSEs))
    bestC = regs[bestIndex]
    testMSE = testMSEs[bestIndex]
    
    result = X_name + ": "
    for i in range(len(regs)):
        result += f"{regs[i]}=>{MSEs[i]}, "
    
    result += f"best MSE: {bestC}=>{testMSE}\n"
    
    print(result)

Note: the "best MSE" refers to the test MSE of the best model running on the validation set.