In [128]:
import numpy as np
import urllib.request
import scipy.optimize
import random
from collections import defaultdict
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model

In [6]:
def parseData(fname):
    for l in urllib.request.urlopen(fname):
        yield eval(l)

In [7]:
print ("Reading data......")
data = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))[:5000]
print ("done")

Reading data......
done


In [175]:
print (data[1])

{'review/appearance': 3.0, 'beer/style': 'English Strong Ale', 'review/palate': 3.0, 'review/taste': 3.0, 'beer/name': 'Red Moon', 'review/timeUnix': 1235915097, 'beer/ABV': 6.2, 'beer/beerId': '48213', 'beer/brewerId': '10325', 'review/timeStruct': {'isdst': 0, 'mday': 1, 'hour': 13, 'min': 44, 'sec': 57, 'mon': 3, 'year': 2009, 'yday': 60, 'wday': 6}, 'review/overall': 3.0, 'review/text': 'Dark red color, light beige foam, average.\tIn the smell malt and caramel, not really light.\tAgain malt and caramel in the taste, not bad in the end.\tMaybe a note of honey in teh back, and a light fruitiness.\tAverage body.\tIn the aftertaste a light bitterness, with the malt and red fruit.\tNothing exceptional, but not bad, drinkable beer.', 'user/profileName': 'stcules', 'review/aroma': 2.5}


In [225]:
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w in r.split():
        wordCount[w] += 1

In [226]:
print (f"All together there are {len(wordCount)} words")

All together there are 19426 words


In [265]:
counts_w = [(wordCount[w], w) for w in wordCount]
counts_w.sort()
counts_w.reverse()
words = [x[1] for x in counts_w[:1000]]
wordId = dict(zip(words, range(len(words))))

In [227]:
text_list = defaultdict(list)
for i in range(len(data)):
    r = ''.join([c for c in data[i]['review/text'].lower() if not c in punctuation])
    text_list[i] = r.split()

In [309]:
text_list_with_punc = defaultdict(list)
for i in range(len(data)):
    r = ''.join([c if not c in punctuation else ' '+c+' ' for c in data[i]['review/text'].lower()])
    text_list_with_punc[i] = r.split()

In [306]:
print (text_list[0])

['a', 'lot', 'of', 'foam', 'but', 'a', 'lot', 'in', 'the', 'smell', 'some', 'banana', 'and', 'then', 'lactic', 'and', 'tart', 'not', 'a', 'good', 'start', 'quite', 'dark', 'orange', 'in', 'color', 'with', 'a', 'lively', 'carbonation', 'now', 'visible', 'under', 'the', 'foam', 'again', 'tending', 'to', 'lactic', 'sourness', 'same', 'for', 'the', 'taste', 'with', 'some', 'yeast', 'and', 'banana']


In [310]:
print (text_list_with_punc[0])

['a', 'lot', 'of', 'foam', '.', 'but', 'a', 'lot', '.', 'in', 'the', 'smell', 'some', 'banana', ',', 'and', 'then', 'lactic', 'and', 'tart', '.', 'not', 'a', 'good', 'start', '.', 'quite', 'dark', 'orange', 'in', 'color', ',', 'with', 'a', 'lively', 'carbonation', '(', 'now', 'visible', ',', 'under', 'the', 'foam', ')', '.', 'again', 'tending', 'to', 'lactic', 'sourness', '.', 'same', 'for', 'the', 'taste', '.', 'with', 'some', 'yeast', 'and', 'banana', '.']


In [229]:
# compute the freq of word in all documents
each_word_freq_doc = defaultdict(int)
for each_word in wordCount:
    freq = 0
    for i in range(len(text_list)):
        if each_word in text_list[i]:
            freq += 1
    each_word_freq_doc[each_word] = freq

In [230]:
Bigram = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for i in range(len(r.split())-1):
        Bigram[r.split()[i]+"-"+r.split()[i+1]] += 1

In [231]:
print (max(zip(Bigram.values(),Bigram.keys())))

(4587, 'with-a')


In [232]:
print(Bigram['with-a'])

4587


In [233]:
print (Bigram['deal-with'])

5


In [234]:
counts = [(Bigram[biw], biw) for biw in Bigram]
counts.sort()
counts.reverse()

In [235]:
bi_words = [x[1] for x in counts[:1000]]

In [236]:
# sentiment analysis
bi_wordID = dict(zip(bi_words, range(len(bi_words))))

In [237]:
# print (bi_wordID)
bi_wordSet = set(bi_words)

In [238]:
def feature(datum):
    feat = [0]*len(bi_words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    for i in range(len(r.split())-1):
        bi_unit = r.split()[i]+"-"+r.split()[i+1]
        if  bi_unit in bi_words:
            feat[bi_wordID[bi_unit]] += 1
    feat.append(1)
    return feat

In [239]:
X = [feature(d) for d in data]

In [240]:
Y = [d['review/overall'] for d in data]

In [241]:
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X,Y)
theta = clf.coef_
predictions = clf.predict(X)

In [242]:
print (predictions)

[3.48471909 3.31957086 3.54264439 ... 5.20157626 3.53660705 4.27659128]


In [243]:
# report the MSE on the 5000 data
MSE = sum([(predictions[i]-Y[i])**2 for i in range(len(Y))])/len(Y)

In [244]:
print (f"MSE obtained using the new predictor: {MSE}")

MSE obtained using the new predictor: 0.3431530140613639


In [245]:
# total number of documents is N = 5000
# Compute IDF for ‘foam’, ‘smell’, ‘banana’, ‘lactic’, and ‘tart’
goal_word = ['foam', 'smell', 'banana', 'lactic', 'tart']
freq = defaultdict(int)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for elem in goal_word:
        if elem in r.split():
            freq[elem] += 1
print (freq)

defaultdict(<class 'int'>, {'foam': 364, 'smell': 1449, 'banana': 105, 'lactic': 6, 'tart': 78})


In [246]:
IDF = [np.log10(5000/freq[elem]) for elem in freq]

In [247]:
print (IDF)

[1.1378686206869628, 0.5379016188648442, 1.6777807052660807, 2.9208187539523753, 1.8068754016455384]


In [248]:
# tf: number of times the term appears in the document
first_review = data[0]['review/text']
r = ''.join([c for c in first_review.lower() if not c in punctuation])
tf = [0]*len(goal_word)
for i in range(len(goal_word)):
    for elem in r.split():
        if elem == goal_word[i]:
            tf[i] += 1
print (tf)

[2, 1, 2, 2, 1]


In [249]:
# Compute TF-IDF
TF_IDF = [tf[i]*IDF[i] for i in range(len(tf))]

In [250]:
print (TF_IDF)

[2.2757372413739256, 0.5379016188648442, 3.3555614105321614, 5.841637507904751, 1.8068754016455384]


In [251]:
def ComputeCosineSImilarity(x, y):
    res = 0
    for elem_x in x:
        if elem_x in y:
            res = res + x[elem_x]*y[elem_x]
    part1 = (sum([x[elem]**2 for elem in x]))**(1/2)
    part2 = (sum([y[elem]**2 for elem in y]))**(1/2)
    return res/(part1*part2)

In [252]:
def computeTF_IDF(gword, text_list, each_word_freq_doc, index):
    unit_freq = each_word_freq_doc[gword]
    tf = 0
    for elem in text_list[index]:
        if elem == gword:
            tf += 1
    return tf*(np.log10(5000/(1+unit_freq)))

In [253]:
# Build tf-idf vector
def tf_idf_builder(index, text_list, ):
    review_w = text_list[index]
    rev_vec = defaultdict(float)
    for elem in review_w:
        if elem not in rev_vec:
            rev_vec[elem] = computeTF_IDF(elem, text_list, each_word_freq_doc, index)
    return rev_vec

In [276]:
# construct tf-idf vector for review1 and review2
review1w =  text_list[0]
review2w =  text_list[1]
rev_1_vec = tf_idf_builder(0, text_list)
rev_2_vec = tf_idf_builder(1, text_list)
print (rev_1_vec)

defaultdict(<class 'float'>, {'a': 0.02414000721952438, 'lot': 2.022882086242769, 'of': 0.05148923116234282, 'foam': 2.273354279759088, 'but': 0.1662156253435211, 'in': 0.3494074711380801, 'the': 0.08645087743151567, 'smell': 0.5376020021010439, 'some': 0.6729750591696887, 'banana': 3.347328278142497, 'and': 0.09763689545177756, 'then': 1.0034883278458213, 'lactic': 5.707743928643524, 'tart': 1.8013429130455774, 'not': 0.28216313251307434, 'good': 0.3701828039814841, 'start': 1.4975728800155672, 'quite': 0.8096683018297085, 'dark': 0.5509846836522136, 'orange': 0.7894139750948435, 'color': 0.46042211665469096, 'with': 0.12416219470443926, 'lively': 1.9586073148417749, 'carbonation': 0.36916548217194944, 'now': 1.4023048140744876, 'visible': 1.9136401693252518, 'under': 1.7544873321858503, 'again': 0.8781120148963188, 'tending': 2.9208187539523753, 'to': 0.1305335899191335, 'sourness': 2.0861861476162833, 'same': 1.2856702402547668, 'for': 0.288192770958809, 'taste': 0.2912392763096833,

In [277]:
print (f"Cosine similarity between review1 and review2 is {ComputeCosineSImilarity(rev_1_vec, rev_2_vec)}")

Cosine similarity between review1 and review2 is 0.06691778465356775


In [256]:
# beer name
# text_of_review
# profile_name
max_similarity = -1.0
beer_name = data[0]['beer/name']
text_of_review = data[0]['review/text']
profile_name = data[0]['user/profileName']
for i in range(1, len(data)):
    new_vec = tf_idf_builder(i, text_list)
    simi =  ComputeCosineSImilarity(rev_1_vec, new_vec)
    if simi > max_similarity:
        max_similarity = simi
        beer_name = data[i]['beer/name']
        text_of_review = data[i]['review/text']
        profile_name = data[i]['user/profileName']
    if i % 1000 == 0:
        print (simi)

0.024680501910290382
0.0389329943130906
0.011928781805429233


  


0.007144706263166458


In [257]:
# output the goal_beer
print (beer_name)
print (text_of_review)
print (profile_name)
print (max_similarity)

Her Majesty 2011
750mL bottle thanks to Chris@Slowbeer. Poured into a Lost Abbey stemmed tulip.		Golden orange, close to translucent (on the first pour at least), capped by a sizable white, typically Belgian-looking head. Good lacing.		Quite strong lactic notes and a sharp organic funk. Pungent stuff. Underneath is bitter citrus pith, floral spice and a hint of sweet esters. In your face with a lot going on. Only issue is the lactic character verges on turning my stomach.		More citric sourness and a bit less lactic character. Grapefruit and lemon rind are prominent, as is the Nelson Sauvin vegetative character, which kind of adheres to the yeast and barnyard funk. Tropical melons and honey provide some sweetness. Decent peppery tang.		Medium, lightly syrupy body with lowish carbonation and a moderately tart, dry finish that has some length to it.		Incomparable to anything I've tried. The Sauvin hops with the Saison yeast is a masterful combination, however there's no shortage of rough 

In [360]:
def new_feature(index, text_list, each_word_freq_doc):
    feat = [0]*len(words)
    goal_review = text_list[index]
    for w in goal_review:
        if w in words and feat[wordId[w]] == 0:
            feat[wordId[w]] = computeTF_IDF(w, text_list, each_word_freq_doc, index)
    feat.append(1)
    return feat

In [271]:
X = [new_feature(index, text_list, each_word_freq_doc) for index in range(len(text_list))]

In [272]:
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X,Y)
theta = clf.coef_
predictions = clf.predict(X)

In [273]:
print (predictions)

[3.09655868 3.57328571 3.58483271 ... 4.290122   3.42816778 4.24918487]


In [274]:
MSE_TF_IDF = sum([(predictions[i]-Y[i])**2 for i in range(len(Y))])/len(Y)

In [275]:
print (f"MSE of predict model base on tfidf feature is {MSE_TF_IDF}")

MSE of predict model base on tfidf feature is 0.27875971411652656


In [311]:
# for question 7
# first we shuffle the data
print ("Reading data......")
data_all = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))
print ("done")

Reading data......
done


In [318]:
random.shuffle(data_all)

50000


In [319]:
# split the data into training set, validation set and testing set
data_train = data_all[:5000]
data_validate = data_all[5000+1:5001+5000]
data_test = data_all[5001+5000+1:5001+5000+1+5000]

In [339]:
# we store data in memory-unigram
# remove punctuation or not remove

In [324]:
text_list_m = defaultdict(list)
for i in range(len(data_train)):
    r = ''.join([c for c in data_train[i]['review/text'].lower() if not c in punctuation])
    text_list_m[i] = r.split()

In [325]:
text_list_with_punc_m = defaultdict(list)
for i in range(len(data_train)):
    r = ''.join([c if not c in punctuation else ' '+c+' ' for c in data_train[i]['review/text'].lower()])
    text_list_with_punc_m[i] = r.split()

In [341]:
# we store data in memory-bigram
# remove punctuation or not remove

In [334]:
bi_text_list_m = defaultdict(list)
for i in range(len(data_train)):
    bi_unit = []
    r = ''.join([c for c in data_train[i]['review/text'].lower() if not c in punctuation])
    for j in range(len(r.split())-1):
        bi_unit.append(r.split()[j]+"-"+r.split()[j+1])
    bi_text_list_m[i] = bi_unit

In [335]:
bi_text_list_with_punc_m = defaultdict(list)
for i in range(len(data_train)):
    bi_unit = []
    r = ''.join([c if not c in punctuation else ' '+c+' ' for c in data_train[i]['review/text'].lower()])
    for j in range(len(r.split())-1):
        bi_unit.append(r.split()[j]+"-"+r.split()[j+1])
    bi_text_list_with_punc_m[i] = bi_unit

In [343]:
# word count 
# remove punctuation or not remove

In [326]:
wordCount_m = defaultdict(int)
for i in range(len(text_list_m)):
    for w in text_list_m[i]:
        wordCount_m[w] += 1

In [327]:
wordCount_with_punc_m = defaultdict(int)
for i in range(len(text_list_with_punc_m)):
    for w in text_list_with_punc_m[i]:
        wordCount_with_punc_m[w] += 1

In [345]:
# bi_word count
# remove punctuation or not remove

In [336]:
bi_wordCount_m = defaultdict(int)
for i in range(len(bi_text_list_m)):
    for w in bi_text_list_m[i]:
        bi_wordCount_m[w] += 1

In [337]:
bi_wordCount_with_punc_m = defaultdict(int)
for i in range(len(bi_text_list_with_punc_m)):
    for w in bi_text_list_with_punc_m[i]:
        bi_wordCount_with_punc_m[w] += 1

In [347]:
# Select the top 1000 as features
counts_m = [(wordCount_m[w], w) for w in wordCount_m]
counts_m.sort()
counts_m.reverse()
words_m = [x[1] for x in counts_m[:1000]]
wordId_m = dict(zip(words_m, range(len(words_m))))

In [348]:
counts_with_punc_m = [(wordCount_with_punc_m[w], w) for w in wordCount_with_punc_m]
counts_with_punc_m.sort()
counts_with_punc_m.reverse()
words_with_punc_m = [x[1] for x in counts_with_punc_m[:1000]]
wordsId_with_punc_m = dict(zip(words_with_punc_m, range(len(words_with_punc_m))))

In [352]:
bi_counts_m = [(bi_wordCount_m[biw], biw) for biw in bi_wordCount_m]
bi_counts_m.sort()
bi_counts_m.reverse()
bi_words_m = [x[1] for x in bi_counts_m[:1000]]
bi_wordId_m = dict(zip(bi_words_m, range(len(bi_words_m))))

In [353]:
bi_counts_with_punc_m = [(bi_wordCount_with_punc_m[biw], biw) for biw in bi_wordCount_with_punc_m]
bi_counts_with_punc_m.sort()
bi_counts_with_punc_m.reverse()
bi_words_with_punc_m = [x[1] for x in bi_counts_with_punc_m[:1000]]
bi_wordId_with_punc_m = dict(zip(bi_words_with_punc_m, range(len(bi_words_with_punc_m))))

In [354]:
each_freq_doc_m = defaultdict(int)
for each_word in words_m:
    freq = 0
    for i in range(len(text_list_m)):
        if each_word in text_list_m[i]:
            freq += 1
    each_freq_doc_m[each_word] = freq

In [355]:
each_freq_doc_with_punc_m = defaultdict(int)
for each_word in words_with_punc_m:
    freq = 0
    for i in range(len(text_list_with_punc_m)):
        if each_word in text_list_with_punc_m[i]:
            freq += 1
    each_freq_doc_with_punc_m[each_word] = freq

In [358]:
bi_each_freq_doc_m = defaultdict(int)
for each_word in bi_words_m:
    freq = 0
    for i in range(len(bi_text_list_m)):
        if each_word in bi_text_list_m[i]:
            freq += 1
    bi_each_freq_doc_m[each_word] = freq

In [359]:
bi_each_freq_doc_with_punc_m = defaultdict(int)
for each_word in bi_words_with_punc_m:
    freq = 0
    for i in range(len(bi_text_list_with_punc_m)):
        if each_word in bi_text_list_with_punc_m[i]:
            freq += 1
    bi_each_freq_doc_with_punc_m[each_word] = freq

In [361]:
def computeTF_IDF(gword, content_list, each_freq, index):
    unit_freq = each_freq[gword]
    tf = 0
    for elem in content_list[index]:
        if elem == gword:
            tf += 1
    return tf*(np.log10(5000/(1+unit_freq)))

In [373]:
def tfidf_feature(index, contents_list, each_freq, unitset, unitid):
    feat = [0]*len(unitset)
    goal_review = contents_list[index]
    for w in goal_review:
        if w in unitset and feat[unitid[w]] == 0:
            feat[unitid[w]] = computeTF_IDF(w, contents_list, each_freq, index)
    feat.append(1)
    return feat

In [370]:
def freq_feature(index, contents_list, unitset, unitid):
    feat = [0]*len(unitset)
    goal_review = contents_list[index]
    for w in goal_review:
        if  w in unitset:
            feat[unitid[w]] += 1
    feat.append(1)
    return feat

In [382]:
##############################################

In [384]:
p_text_list_m = defaultdict(list)
for i in range(len(data_validate)):
    r = ''.join([c for c in data_validate[i]['review/text'].lower() if not c in punctuation])
    p_text_list_m[i] = r.split()

In [385]:
p_text_list_with_punc_m = defaultdict(list)
for i in range(len(data_validate)):
    r = ''.join([c if not c in punctuation else ' '+c+' ' for c in data_validate[i]['review/text'].lower()])
    p_text_list_with_punc_m[i] = r.split()

In [386]:
p_bi_text_list_m = defaultdict(list)
for i in range(len(data_validate)):
    bi_unit = []
    r = ''.join([c for c in data_validate[i]['review/text'].lower() if not c in punctuation])
    for j in range(len(r.split())-1):
        bi_unit.append(r.split()[j]+"-"+r.split()[j+1])
    p_bi_text_list_m[i] = bi_unit

In [388]:
p_bi_text_list_with_punc_m = defaultdict(list)
for i in range(len(data_validate)):
    bi_unit = []
    r = ''.join([c if not c in punctuation else ' '+c+' ' for c in data_validate[i]['review/text'].lower()])
    for j in range(len(r.split())-1):
        bi_unit.append(r.split()[j]+"-"+r.split()[j+1])
    p_bi_text_list_with_punc_m[i] = bi_unit

In [390]:
p_wordCount_m = defaultdict(int)
for i in range(len(p_text_list_m)):
    for w in p_text_list_m[i]:
        p_wordCount_m[w] += 1

In [391]:
p_wordCount_with_punc_m = defaultdict(int)
for i in range(len(p_text_list_with_punc_m)):
    for w in p_text_list_with_punc_m[i]:
        p_wordCount_with_punc_m[w] += 1

In [392]:
p_bi_wordCount_m = defaultdict(int)
for i in range(len(p_bi_text_list_m)):
    for w in p_bi_text_list_m[i]:
        p_bi_wordCount_m[w] += 1

In [393]:
p_bi_wordCount_with_punc_m = defaultdict(int)
for i in range(len(p_bi_text_list_with_punc_m)):
    for w in p_bi_text_list_with_punc_m[i]:
        p_bi_wordCount_with_punc_m[w] += 1

In [394]:
p_counts_m = [(p_wordCount_m[w], w) for w in p_wordCount_m]
p_counts_m.sort()
p_counts_m.reverse()
p_words_m = [x[1] for x in p_counts_m[:1000]]
p_wordId_m = dict(zip(p_words_m, range(len(p_words_m))))

In [395]:
p_counts_with_punc_m = [(p_wordCount_with_punc_m[w], w) for w in p_wordCount_with_punc_m]
p_counts_with_punc_m.sort()
p_counts_with_punc_m.reverse()
p_words_with_punc_m = [x[1] for x in p_counts_with_punc_m[:1000]]
p_wordsId_with_punc_m = dict(zip(p_words_with_punc_m, range(len(p_words_with_punc_m))))

In [396]:
p_bi_counts_m = [(p_bi_wordCount_m[biw], biw) for biw in p_bi_wordCount_m]
p_bi_counts_m.sort()
p_bi_counts_m.reverse()
p_bi_words_m = [x[1] for x in p_bi_counts_m[:1000]]
p_bi_wordId_m = dict(zip(p_bi_words_m, range(len(p_bi_words_m))))

In [397]:
p_bi_counts_with_punc_m = [(p_bi_wordCount_with_punc_m[biw], biw) for biw in p_bi_wordCount_with_punc_m]
p_bi_counts_with_punc_m.sort()
p_bi_counts_with_punc_m.reverse()
p_bi_words_with_punc_m = [x[1] for x in p_bi_counts_with_punc_m[:1000]]
p_bi_wordId_with_punc_m = dict(zip(p_bi_words_with_punc_m, range(len(p_bi_words_with_punc_m))))

In [398]:
p_each_freq_doc_m = defaultdict(int)
for each_word in p_words_m:
    freq = 0
    for i in range(len(p_text_list_m)):
        if each_word in p_text_list_m[i]:
            freq += 1
    p_each_freq_doc_m[each_word] = freq

In [399]:
p_each_freq_doc_with_punc_m = defaultdict(int)
for each_word in p_words_with_punc_m:
    freq = 0
    for i in range(len(p_text_list_with_punc_m)):
        if each_word in p_text_list_with_punc_m[i]:
            freq += 1
    p_each_freq_doc_with_punc_m[each_word] = freq

In [400]:
p_bi_each_freq_doc_m = defaultdict(int)
for each_word in p_bi_words_m:
    freq = 0
    for i in range(len(p_bi_text_list_m)):
        if each_word in p_bi_text_list_m[i]:
            freq += 1
    p_bi_each_freq_doc_m[each_word] = freq

In [401]:
p_bi_each_freq_doc_with_punc_m = defaultdict(int)
for each_word in p_bi_words_with_punc_m:
    freq = 0
    for i in range(len(p_bi_text_list_with_punc_m)):
        if each_word in p_bi_text_list_with_punc_m[i]:
            freq += 1
    p_bi_each_freq_doc_with_punc_m[each_word] = freq

In [402]:
pX_1 = [freq_feature(index, p_text_list_m, p_words_m, p_wordId_m) for index in range(len(p_text_list_m))]
pX_2 = [tfidf_feature(index, p_text_list_m, p_each_freq_doc_m, p_words_m, p_wordId_m) for index in range(len(p_text_list_m))]
pX_3 = [freq_feature(index, p_text_list_with_punc_m, p_words_with_punc_m, p_wordsId_with_punc_m) for index in range(len(p_text_list_with_punc_m))]
pX_4 = [tfidf_feature(index, p_text_list_with_punc_m, p_each_freq_doc_with_punc_m, p_words_with_punc_m, p_wordsId_with_punc_m) for index in range(len(p_text_list_with_punc_m))]
pX_5 = [freq_feature(index, p_bi_text_list_m, p_bi_words_m, p_bi_wordId_m) for index in range(len(p_bi_text_list_m))]
pX_6 = [tfidf_feature(index, p_bi_text_list_m, p_bi_each_freq_doc_m, p_bi_words_m, p_bi_wordId_m) for index in range(len(p_bi_text_list_m))]
pX_7 = [freq_feature(index, p_bi_text_list_with_punc_m, p_bi_words_with_punc_m, p_bi_wordId_with_punc_m) for index in range(len(p_bi_text_list_with_punc_m))]
pX_8 = [tfidf_feature(index, p_bi_text_list_with_punc_m, p_bi_each_freq_doc_with_punc_m, p_bi_words_with_punc_m, p_bi_wordId_with_punc_m) for index in range(len(p_bi_text_list_with_punc_m))]

In [403]:
Y_prime_val =  [d['review/overall'] for d in data_validate]

In [383]:
##############################################

In [362]:
Y = [d['review/overall'] for d in data_train]

In [371]:
# Unigram + remove + freq
X_1 = [freq_feature(index, text_list_m, words_m, wordId_m) for index in range(len(text_list_m))]

In [374]:
# Unigram + remove + tfidf
X_2 = [tfidf_feature(index, text_list_m, each_freq_doc_m, words_m, wordId_m) for index in range(len(text_list_m))]

In [375]:
# Unigram + not remove + freq
X_3 = [freq_feature(index, text_list_with_punc_m, words_with_punc_m, wordsId_with_punc_m) for index in range(len(text_list_with_punc_m))]

In [376]:
# Unigram + not remove + tfidf
X_4 = [tfidf_feature(index, text_list_with_punc_m, each_freq_doc_with_punc_m, words_with_punc_m, wordsId_with_punc_m) for index in range(len(text_list_with_punc_m))]

In [377]:
# Bigram + remove + freq
X_5 = [freq_feature(index, bi_text_list_m, bi_words_m, bi_wordId_m) for index in range(len(bi_text_list_m))]

In [380]:
# Bigram + remove + tfidf
X_6 = [tfidf_feature(index, bi_text_list_m, bi_each_freq_doc_m, bi_words_m, bi_wordId_m) for index in range(len(bi_text_list_m))]

In [379]:
# Bigram + not remove + freq
X_7 = [freq_feature(index, bi_text_list_with_punc_m, bi_words_with_punc_m, bi_wordId_with_punc_m) for index in range(len(bi_text_list_with_punc_m))]

In [381]:
# Bigram + not remove + tfidf
X_8 = [tfidf_feature(index, bi_text_list_with_punc_m, bi_each_freq_doc_with_punc_m, bi_words_with_punc_m, bi_wordId_with_punc_m) for index in range(len(bi_text_list_with_punc_m))]

In [405]:
def train_out_MSE(x, y, x_p, y_p):
    clf = linear_model.Ridge(1.0, fit_intercept=False)
    clf.fit(x,y)
    theta = clf.coef_
    predictions = clf.predict(x_p)
    MSE_temp = sum([(predictions[i]-y_p[i])**2 for i in range(len(y_p))])/len(y_p)
    return MSE_temp

In [406]:
# Unigram + remove + freq
print (train_out_MSE(X_1, Y, pX_1, Y_prime_val))

0.6717115308085453


In [407]:
# Unigram + remove + tfidf
print (train_out_MSE(X_2, Y, pX_2, Y_prime_val))

0.6767064601852669


In [408]:
# Unigram + not remove + freq
print (train_out_MSE(X_3, Y, pX_3, Y_prime_val))

0.6392805857415549


In [409]:
# Unigram + not remove + tfidf
print (train_out_MSE(X_4, Y, pX_4, Y_prime_val))

0.6450116486717069


In [410]:
# Bigram + remove + freq
print (train_out_MSE(X_5, Y, pX_5, Y_prime_val))

0.6155755380643481


In [411]:
# Bigram + remove + tfidf
print (train_out_MSE(X_6, Y, pX_6, Y_prime_val))

0.6189277231579557


In [412]:
# Bigram + not remove + freq
print (train_out_MSE(X_7, Y, pX_7, Y_prime_val))

0.6531938827236892


In [413]:
# Bigram + not remove + tfidf
print (train_out_MSE(X_8, Y, pX_8, Y_prime_val))

0.6588351631138297
