In [0]:
import gzip
import json
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import PorterStemmer 
from nltk import word_tokenize 
from nltk.util import ngrams
import re, string
import math

In [3]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [0]:
data = []

for d in readGz("drive/My Drive/cse 158/assignment1/train_Category.json.gz"):
    data.append(d)

df = pd.DataFrame(data)
del data

In [7]:
df.head()

Unnamed: 0,n_votes,review_id,user_id,review_text,rating,genreID,genre
0,0,r99763621,u17334941,Genuinely enthralling. If Collins or Bernard d...,5,2,fantasy_paranormal
1,0,r24440074,u08070901,Pretty decent. The ending seemed a little rush...,5,2,fantasy_paranormal
2,0,r82777443,u36921467,Philippa created an intricate world composed o...,4,2,fantasy_paranormal
3,0,r05772772,u07405640,"A very light, good, quirky read. I felt like I...",5,3,mystery_thriller_crime
4,0,r33622824,u10191516,"It was a Christmas gift from a friend, how kno...",5,2,fantasy_paranormal


In [8]:
review_text = df['review_text'].copy(deep=True).iloc[:10000]
review_text.head()

0    Genuinely enthralling. If Collins or Bernard d...
1    Pretty decent. The ending seemed a little rush...
2    Philippa created an intricate world composed o...
3    A very light, good, quirky read. I felt like I...
4    It was a Christmas gift from a friend, how kno...
Name: review_text, dtype: object

In [0]:
review_text = review_text.apply(lambda doc: re.sub('[%s]' % re.escape(string.punctuation), '', doc.lower()))

# Q1

In [11]:
bigrams_dict = {}
def save_bigrams(doc):
  token = word_tokenize(doc)
  bigram_list = list(ngrams(token, 2)) 
  for bigram in bigram_list:
    if bigram not in bigrams_dict:
      bigrams_dict[bigram] = 0
    bigrams_dict[bigram] += 1
  return doc
review_text.apply(save_bigrams)

0       genuinely enthralling if collins or bernard di...
1       pretty decent the ending seemed a little rush ...
2       philippa created an intricate world composed o...
3       a very light good quirky read i felt like i wa...
4       it was a christmas gift from a friend how know...
                              ...                        
9995    as posted on kindleobsessed blog \n well ladie...
9996    rating 4 out of 5 stars  \n friendship murder ...
9997    very great main character  paddy i really like...
9998    loved it loved it loved it \n although the end...
9999    im not really sure how to review this book and...
Name: review_text, Length: 10000, dtype: object

In [0]:
bigrams_freq_list = [(k, v) for k, v in bigrams_dict.items()]
del bigrams_dict

In [0]:
bigrams_freq_list.sort(key=lambda x: x[1], reverse=True)

In [14]:
bigrams_freq_list[:5]

[(('of', 'the'), 7927),
 (('this', 'book'), 5850),
 (('in', 'the'), 5627),
 (('and', 'the'), 3189),
 (('is', 'a'), 3183)]

Answer to Q1:

(Format = Bigrams : Freq)

1.   ('of', 'the'): 7927
2.   ('this', 'book'): 5850
3.   ('in', 'the'): 5627
4.   ('and', 'the'): 3189
5.   ('the', 'book'): 3183

# Q2

In [0]:
top_1000_bigrams = [bigram_pair[0] for bigram_pair in bigrams_freq_list[:1000]]
bigram_index_dict = {bigram: i for i, bigram in enumerate(top_1000_bigrams)}
del bigrams_freq_list

In [0]:
def top_1000_bigrams_feat(doc):
  token = word_tokenize(doc)
  bigram_list = list(ngrams(token, 2))
  bigram_feat = [0] * len(bigram_index_dict)
  for bigram in bigram_list:
    if bigram in bigram_index_dict:
      bigram_feat[bigram_index_dict[bigram]] += 1
  bigram_feat.append(1)
  return bigram_feat

In [0]:
X_q2 = review_text.apply(top_1000_bigrams_feat)
X_q2 = X_q2.apply(lambda x: pd.Series(x))
y_rating = df['rating'].iloc[:10000]

In [0]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_q2, y_rating)
theta = clf.coef_

In [0]:
def MSE(X, theta, y):
    res = np.matrix(X)*np.matrix(theta).T - np.matrix(y).T
    sq = np.array(res)**2
    MSE = sum(sq) / len(sq)
    return MSE

In [20]:
MSE(X_q2, theta, y_rating)

array([1.01806278])

In [0]:
del X_q2
del clf
del theta
del y_rating

Answer to Q2:

MSE = 1.01806278

# Q3

In [22]:
unigrams_dict = {}
def save_unigrams(doc):
  unigram_list = word_tokenize(doc)
  for unigram in unigram_list:
    if unigram not in unigrams_dict:
      unigrams_dict[unigram] = 0
    unigrams_dict[unigram] += 1
  return doc
review_text.apply(save_unigrams)

0       genuinely enthralling if collins or bernard di...
1       pretty decent the ending seemed a little rush ...
2       philippa created an intricate world composed o...
3       a very light good quirky read i felt like i wa...
4       it was a christmas gift from a friend how know...
                              ...                        
9995    as posted on kindleobsessed blog \n well ladie...
9996    rating 4 out of 5 stars  \n friendship murder ...
9997    very great main character  paddy i really like...
9998    loved it loved it loved it \n although the end...
9999    im not really sure how to review this book and...
Name: review_text, Length: 10000, dtype: object

In [0]:
unigrams_freq_list = [(k, v) for k, v in unigrams_dict.items()]
unigrams_freq_list.sort(key=lambda x: x[1], reverse=True)
del unigrams_dict

In [0]:
top_1000_uni_bigram = [unigram_pair[0] for unigram_pair in unigrams_freq_list[:400]] + \
                      top_1000_bigrams[:600]
uni_bigram_index_dict = {curr_gram: i for i, curr_gram in enumerate(top_1000_uni_bigram)}
del unigrams_freq_list

In [0]:
def top_1000_uni_bigrams_feat(doc):
  token = word_tokenize(doc)
  bigram_list = list(ngrams(token, 2))
  uni_bigram_feat = [0] * len(top_1000_bigrams)
  for bigram in bigram_list:
    if bigram in uni_bigram_index_dict:
      uni_bigram_feat[uni_bigram_index_dict[bigram]] += 1
  for unigram in token:
    if unigram in uni_bigram_index_dict:
      uni_bigram_feat[uni_bigram_index_dict[unigram]] += 1
  uni_bigram_feat.append(1)
  return uni_bigram_feat

In [0]:
X_q3 = review_text.apply(top_1000_uni_bigrams_feat)
X_q3 = X_q3.apply(lambda x: pd.Series(x))
y_rating = df['rating'].iloc[:10000]

In [0]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_q3, y_rating)
theta = clf.coef_

In [28]:
MSE(X_q3, theta, y_rating)

array([0.98249739])

In [0]:
del X_q3
del clf
del theta
del y_rating

Answer to Q3:

MSE = 0.98249739

# Q4

In [0]:
idf_words_q4 = {'stories': 0,
                'magician': 0,
                'psychic': 0,
                'writing': 0,
                'wonder': 0}

In [31]:
def save_idf_counter(doc):
  token_set = set(word_tokenize(doc))
  for curr_word in idf_words_q4.keys():
    if curr_word in token_set:
      idf_words_q4[curr_word] += 1
  return doc
review_text.apply(save_idf_counter)

0       genuinely enthralling if collins or bernard di...
1       pretty decent the ending seemed a little rush ...
2       philippa created an intricate world composed o...
3       a very light good quirky read i felt like i wa...
4       it was a christmas gift from a friend how know...
                              ...                        
9995    as posted on kindleobsessed blog \n well ladie...
9996    rating 4 out of 5 stars  \n friendship murder ...
9997    very great main character  paddy i really like...
9998    loved it loved it loved it \n although the end...
9999    im not really sure how to review this book and...
Name: review_text, Length: 10000, dtype: object

In [32]:
for k, v in idf_words_q4.items():
  print(k,':', math.log(review_text.shape[0] / v, 10))

stories : 1.1174754620451195
magician : 2.6575773191777934
psychic : 2.602059991327962
writing : 0.9978339382434922
wonder : 1.7670038896078457


In [0]:
tf_words_q4 = {'stories': 0,
               'magician': 0,
               'psychic': 0,
               'writing': 0,
               'wonder': 0}

In [0]:
for curr_word in review_text[0].split():
  if curr_word in tf_words_q4:
    tf_words_q4[curr_word] += 1

In [35]:
for k, v in tf_words_q4.items():
  print(k, v / len(review_text[0]) *
          math.log(review_text.shape[0] / idf_words_q4[k],
                   10))

stories 0.0010318332982872755
magician 0.00245390334180775
psychic 0.004805281609100576
writing 0.0009213609771408053
wonder 0.001631582538880744


Answer to Q4:

(Format = word : idf-score through log base 10)

1. stories : 1.1174754620451195
2. magician : 2.6575773191777934
3. psychic : 2.602059991327962
4. writing : 0.9978339382434922
5. wonder : 1.7670038896078457

(Format = word : tfidf-score through log base 10)

1. stories : 0.0010318332982872755
2. magician : 0.00245390334180775
3. psychic : 0.004805281609100576
4. writing : 0.0009213609771408053
5. wonder : 0.001631582538880744

# Q5

In [0]:
unigrams_dict = {}
def save_unigrams(doc):
  unigram_list = word_tokenize(doc)
  for unigram in unigram_list:
    if unigram not in unigrams_dict:
      unigrams_dict[unigram] = 0
    unigrams_dict[unigram] += 1
  return doc
review_text.apply(save_unigrams)

unigrams_freq_list = [(k, v) for k, v in unigrams_dict.items()]
unigrams_freq_list.sort(key=lambda x: x[1], reverse=True)
del unigrams_dict

top_1000_unigrams = [unigram_pair[0] for unigram_pair in unigrams_freq_list[:1000]]
unigram_index_dict = {curr_gram: i for i, curr_gram in enumerate(top_1000_unigrams)}
unigram_idf_dict = {curr_gram: 0 for curr_gram in top_1000_unigrams}
del unigrams_freq_list

In [48]:
def save_q5_idf_counter(doc):
  token_set = set(word_tokenize(doc))
  for curr_word in token_set:
    if curr_word in unigram_idf_dict:
      unigram_idf_dict[curr_word] += 1
  return doc
review_text.apply(save_q5_idf_counter)

0       genuinely enthralling if collins or bernard di...
1       pretty decent the ending seemed a little rush ...
2       philippa created an intricate world composed o...
3       a very light good quirky read i felt like i wa...
4       it was a christmas gift from a friend how know...
                              ...                        
9995    as posted on kindleobsessed blog \n well ladie...
9996    rating 4 out of 5 stars  \n friendship murder ...
9997    very great main character  paddy i really like...
9998    loved it loved it loved it \n although the end...
9999    im not really sure how to review this book and...
Name: review_text, Length: 10000, dtype: object

In [0]:
def top_1000_tfidf_feat(doc):
  token = word_tokenize(doc)
  unigram_feat = [0] * len(top_1000_unigrams)
  for unigram in token:
    if unigram in unigram_index_dict:
      unigram_feat[unigram_index_dict[unigram]] += 1
  for unigram in set(token):
    if unigram in unigram_index_dict:
      unigram_feat[unigram_index_dict[unigram]] /= len(token)
      unigram_feat[unigram_index_dict[unigram]] *=\
        math.log(review_text.shape[0] / unigram_idf_dict[unigram],
                   10)
  
  unigram_feat.append(1)
  return unigram_feat

In [0]:
X_q5 = review_text.apply(top_1000_tfidf_feat)
X_q5 = X_q5.apply(lambda x: pd.Series(x))
y_rating = df['rating'].iloc[:10000]

In [0]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_q5, y_rating)
theta = clf.coef_

In [52]:
MSE(X_q5, theta, y_rating)

array([1.17836625])

In [0]:
del y_rating
del unigram_idf_dict
del unigram_index_dict
del top_1000_unigrams

Answer to Q5:

MSE = 1.17836625

# Q6

In [0]:
def cos_sim(a, b):
	dot_product = np.dot(a, b)
	norm_a = np.linalg.norm(a)
	norm_b = np.linalg.norm(b)
	return dot_product / (norm_a * norm_b)

In [55]:
def get_cos_sim_rev(doc_tfidf):
  return cos_sim(doc_tfidf, X_q5.iloc[0])
X_q6 = X_q5.iloc[1:].apply(get_cos_sim_rev, axis=1)
top_index = X_q6[max(X_q6) == X_q6].index
print(max(X_q6))
print(top_index)
print(review_text[top_index].iloc[0])

0.9984838370882461
Int64Index([1595], dtype='int64')
review for entire anne series 
 the eight book anne of green gables series by l m montgomery published from 19081939 and read in the grosset  dunlap illustrated junior library version anne of green gables signet classic anne of avonlea and bantam classic all the rest 
 there is no avoiding disclosing that i already love these books i read them at least once a year and have for several years when i got to this years hankering for reading through the series i went ahead and moved them up on the read 1000 list anne of green gables is securely on that list and not by my own recommendation so were all good there i just expanded out to the whole series and made some new observations as i went along 
 now these books are highly revered by many many people in fact prince edward islandwhere the author spent some of her life and where anne takes placeis a tourist haven for anne fans and they come in droves to charter planes and experience the 

In [0]:
del X_q5
del X_q6

Answer for Q6:

cosine similarity: 0.9984838370882461

Row Index: 1595

Text:

review for entire anne series 
 the eight book anne of green gables series by l m montgomery published from 19081939 and read in the grosset  dunlap illustrated junior library version anne of green gables signet classic anne of avonlea and bantam classic all the rest 
 there is no avoiding disclosing that i already love these books i read them at least once a year and have for several years when i got to this years hankering for reading through the series i went ahead and moved them up on the read 1000 list anne of green gables is securely on that list and not by my own recommendation so were all good there i just expanded out to the whole series and made some new observations as i went along 
 now these books are highly revered by many many people in fact prince edward islandwhere the author spent some of her life and where anne takes placeis a tourist haven for anne fans and they come in droves to charter planes and experience the anne museum the anne tours the anne gift shops and savor a bit of the beautiful idyllic scenery that breathes through the anne books i wanted to take my honeymoon there but we were unable to afford it especially compared with the free stay in an everglades condo that came offered as a wedding gift but i still plan on going sometime my husband is well aware of this its like harry potter for romantics you can easily score anne of green gables embroidered pillows lithographs costumes or even soda not to mention the musical 
 l m montgomery lucy maud lived in the late 1800s and early 1900s around the east end of canada she wrote prolifically as a novelist and also for periodicals back when they carried fiction on a regular basis many of her short stories are collected in books and her twenty novels are still in publication including three other shorter series for emily pat and the story girl her writing was popular in its time beginning with publications of short stories upwards of twenty per year but nothing could or would surpass the love and respect for her very first novel anne of green gables 
 anne is a bit feminine for most men or even boys to read it is also a little flowery and descriptive for the modern reader and perhaps even a little short for its genre by modern standards but what makes it so popular i can think of no better reason than to say that anne is a book of complete escapism and where it takes the reader is not only or was not only a real place but also one that is wholesome and hopeful it hearkens back to simpler times to a society with clear morals and small interactions of looking to the future with optimism and adventure and living in the present with hard work and a noble outlook who wouldnt want to go there every once in awhile as i always say i not only enjoy the books but when i am done reading them i know what kind of person i want to grow up to be 
 the story is pretty straightforward anne is an orphan who has been handled roughly details dealt with tactfully over her first ten years and is accidentally adopted by a brother and sister in the backwoods small town of avonlea prince edward island canada anne is overly imaginative and accidentprone but also just a good soul and full of positivity her pride pits her against her faith and that is the struggle that we see unfolding in her relationships including her relationship with classmate gilbert blythe she grows up faces various hardships and bends in the road and becomes a woman with a family of her own in the later books 
 also note that the book is considered childrens literature my aunt gave me this book for my eleventh birthday but i found it too slow and descriptive at the time i read it through for the first time when i was fifteen i have loved it ever since then and can fully appreciate it as an adult if you can get your younger reader through the first couple chapters they too might fall in love with annes scrapes and antics her imagination and spirit she is after all eleven years old 
 at fifteen i quickly acquired the rest of the anne series read it all and then bought every montgomery book that they had in the bookstores remember those days before internet stores i am never disappointed with montgomery although some of her books are better than others as for the anne series i think the first and the last books anne of green gables and rilla of ingleside are the best with the fourth anne of windy poplars being my least favorite i think because of the epistolary form of course we need all those books because we want so desperately to see how anne and gilbertone of the classic loves of all literatureturn out next up i would recommend the three book emily trilogy 
 i dont have much else to say about anne of green gables or the series like anything overlyliterary i would highly recommend it to young girls up to fullgrown women or for family story time once you get through rachel lyndes sitting and watching things pick up and montgomerys anecdotes are sometimes funny sometimes exasperating sometimes heartwrenching but they always come with a little of marillas characteristic dose of moral and annes heavyhanded does of humanity 
 one disclaimer according to the times there are some nonpc moments in these books if i remember correctly they are almost all directed toward french immigrants 
 ive also heard there are some great tv productions and movies of the anne series or based on montgomerys writing about avonlea i can not however do you the service of screening them for you i am rather stubborn on this point anne is so solidified in my own mind that it would disturb me greatly to see an actress in the role and the other characters as well i dont need the movies i have the books but you may want to give them a try i have seen some of the animated series on pbs it doesnt mess with my sense of peace it is acceptable but nothing too notable the 1980s movies anne of green gables and anne of avonlea are i believe the most beloved 
 ill limit my quotes to the first three books which by the way were not the first three written or published the series was written and published quite out of order and was i believe determined partly by fan request 
 anne of green gables 
 and as for the risk theres risks in pretty much everything a body does in the world theres risks in peoples having children of their own if it comes to thatthey dont always turn out well p15 
 a mere man must have some vent for his emotions p40 
 youre both queer enough if thats what you mean by kindred spirits p47 
 if you must borrow trouble for pitys sake borrow it handier home p153 
 all things great are wound up with all things little p176 
 oh of course hes good agreed anne but he doesnt seem to get any comfort out of it if i could be good id dance and sing all day because i was glad of it p216 
 the trouble with you anne is that youre thinking too much about yourself you should just think of mrs allan and what would be nicest and most agreeable to her p226 
 a little appreciation sometimes does quite as much good as all the conscientious bringing up in the world p243 
 anne of avonlea 
 it takes all sorts of people to make a world as ive often heard but i think there are some who could be spared p125 
 not failure but low aim is a crime p131 to quote james russell lowell 
 and i think the violets are little snips of the sky that fell down when the angels cut holes for the stars to shine through and the buttercups are made out of old sunshine and i think the sweet peas will be butterflies when they go to heaven p166 
 you cant like different people the same way p167 
 id like kerrenhappuch if it happened to be your name i think people make their names nice or ugly just by what they are themselves p191 
 at seventeen dreams do satisfy you because you think the realities are awaiting you further on p199 
 every really beautiful thought was religious no matter what it was about or what day it was thought on p240 
 id rather look like you than be pretty p243 
 changes aint totally pleasant but theyre excellent things p266 
 in this world youve just got to hope for the best and prepare for the worst and take whatever god sends p268 
 im glad to be a woman with a garden and a work and a sorrow p276 
 anne of the island 
 it is never nice to have our old shrines desecrated even when we have outgrown them p2 
 we musnt let next week rob us of this weeks joy p3 
 i fancy its the unexpected things that give spice to life p4 
 as phil said it was the difference between being born and being made p48 
 the life of heaven must be begun here on earth p108 
 new shoes are smarter than old ones but the old ones are more comfortable p123 
 never mind thank goodness air and salvation are still free p126 
 what is to be will be said mrs rachel gloomily and what isnt to be sometimes happens p178 
 she felt very old and mature and wisewhich showed how young she was p182 
 review written for the starving artist blog

# Q7

In [0]:
q7_df = df.sample(30000, random_state=1)
q7_review_text = q7_df['review_text'].apply(lambda doc: doc.lower())
# re.sub('[%s]' % re.escape(string.punctuation), '', doc.lower()) 
train_X = q7_review_text.iloc[:10000]
val_X = q7_review_text.iloc[10000:20000]
test_X = q7_review_text.iloc[20000:]
y_train = q7_df['rating'].iloc[:10000]
y_val = q7_df['rating'].iloc[10000:20000]
y_test = q7_df['rating'].iloc[20000:]

del q7_df
del q7_review_text

In [88]:
curr_gram_dict = {}
curr_gram_spec = 'bi'
keep_punc = True
curr_tfidf = True

curr_gram_idf_dict = {}
curr_gram_index_dict = {}

def curr_pref_list(doc):
  if keep_punc:
    token = re.findall(r"[\w]+|['.,<>?\/#!$%\^&\*;:{}\[\]=+\-_`~()\"]",
                       doc)
  else:
    doc = re.sub('[%s]' % re.escape(string.punctuation), '', doc)
    token = word_tokenize(doc)
  if curr_gram_spec == 'uni':
    curr_pref_list = token
  else:
    curr_pref_list = list(ngrams(token, 2))
  return curr_pref_list

def save_curr_grams(curr_X):
  def save_curr_grams_(doc):
    global curr_gram_dict

    curr_gram_list = curr_pref_list(doc)
    for curr_gram in curr_gram_list:
      if curr_gram not in curr_gram_dict:
        curr_gram_dict[curr_gram] = 0
      curr_gram_dict[curr_gram] += 1
    return doc
  return curr_X.apply(save_curr_grams_)

def save_curr_idf(curr_X):
  global curr_gram_dict
  global curr_gram_index_dict
  global curr_gram_idf_dict

  curr_freq_list = [(k, v) for k, v in curr_gram_dict.items()]
  curr_freq_list.sort(key=lambda x: x[1], reverse=True)
  top_1000_grams = [curr_gram_pair[0] for curr_gram_pair in curr_freq_list[:1000]]
  curr_gram_index_dict = {curr_gram: i for i, curr_gram in enumerate(top_1000_grams)}
  if curr_tfidf:
    curr_gram_idf_dict = {curr_gram: 0 for curr_gram in top_1000_grams}
  else:
    return curr_X
  def save_curr_idf_(doc):
    global curr_gram_idf_dict

    curr_gram_set = set(curr_pref_list(doc))
    for curr_gram in curr_gram_set:
      if curr_gram in curr_gram_idf_dict:
        curr_gram_idf_dict[curr_gram] += 1
    return doc
  return curr_X.apply(save_curr_idf_)

def top_1000_feat(curr_X):
  def top_1000_feat_(doc):
    global curr_gram_index_dict
    global curr_gram_idf_dict

    curr_gram_list = curr_pref_list(doc)
    curr_top_feat = [0] * 1000
    for curr_gram in curr_gram_list:
      if curr_gram in curr_gram_index_dict:
        curr_top_feat[curr_gram_index_dict[curr_gram]] += 1
    if not curr_tfidf:
      curr_top_feat.append(1)
      return pd.Series(curr_top_feat)
    for curr_gram in set(curr_gram_list):
      if curr_gram in curr_gram_index_dict:
        curr_top_feat[curr_gram_index_dict[curr_gram]] /= len(curr_gram_list)
        curr_top_feat[curr_gram_index_dict[curr_gram]] *=\
          math.log(train_X.shape[0] / curr_gram_idf_dict[curr_gram],
                    10)
    curr_top_feat.append(1)
    return pd.Series(curr_top_feat)
  return curr_X.apply(top_1000_feat_)

preprocessor = Pipeline(steps=[
        ('gram', FunctionTransformer(save_curr_grams,
                            validate=False)),
        ('idf', FunctionTransformer(save_curr_idf,
                            validate=False)),
        ('top_feat', FunctionTransformer(top_1000_feat,
                            validate=False))])

top_scores = [[0, math.inf, 0] for j in range(8)]
curr_score = 0
prev_curr_gram_index_dict = {}
prev_curr_gram_idf_dict = {}
for curr_gram_spec in ['uni', 'bi']:
  for keep_punc in [True, False]:
    for curr_tfidf in [True, False]:
      curr_gram_dict = {}
      curr_gram_idf_dict = {}
      curr_gram_index_dict = {}

      prep_train_X = preprocessor.transform(train_X)
      prep_val_X = top_1000_feat(val_X)
      prep_test_X = top_1000_feat(test_X)

      for C_val in [0.01, 0.1, 1, 10, 100]:
        clf = Pipeline(steps=[
                # ('preprocessor', preprocessor),
                ('logistic', linear_model.Ridge(C_val, fit_intercept=False)),
        ])
        clf.fit(prep_train_X, y_train)
        
        theta = clf.named_steps['logistic'].coef_

        print('C_val:', C_val)
        print('MSE for val:', MSE(prep_val_X, theta, y_val))
        print('MSE for test:', MSE(prep_test_X, theta, y_test))

        if top_scores[curr_score][1] > MSE(prep_val_X, theta, y_val):
          top_scores[curr_score][0] = C_val
          top_scores[curr_score][1] = MSE(prep_val_X, theta, y_val)
          top_scores[curr_score][2] = MSE(prep_test_X, theta, y_test)
      
      print('------------------------------------------------------------------')
      print(curr_gram_spec, keep_punc, curr_tfidf)
      print('TOP C_val:', top_scores[curr_score][0])
      print('TOP MSE for val:', top_scores[curr_score][1])
      print('TOP MSE for test:', top_scores[curr_score][2])
      print('------------------------------------------------------------------')
      curr_score += 1

C_val: 0.01
MSE for val: [1.1717382]
MSE for test: [1.11666483]
C_val: 0.1
MSE for val: [1.14395191]
MSE for test: [1.08917372]
C_val: 1
MSE for val: [1.27955671]
MSE for test: [1.22557528]
C_val: 10
MSE for val: [1.38729721]
MSE for test: [1.33641664]
C_val: 100
MSE for val: [1.40531231]
MSE for test: [1.35751494]
------------------------------------------------------------------
uni True True
TOP C_val: 0.1
TOP MSE for val: [1.14395191]
TOP MSE for test: [1.08917372]
------------------------------------------------------------------
C_val: 0.01
MSE for val: [1.30788342]
MSE for test: [1.23206033]
C_val: 0.1
MSE for val: [1.30776975]
MSE for test: [1.23195853]
C_val: 1
MSE for val: [1.30664442]
MSE for test: [1.23095109]
C_val: 10
MSE for val: [1.29643232]
MSE for test: [1.2218377]
C_val: 100
MSE for val: [1.24880882]
MSE for test: [1.1803991]
------------------------------------------------------------------
uni True False
TOP C_val: 100
TOP MSE for val: [1.24880882]
TOP MSE for test

In [104]:
# curr_score = 0
# for col_a in ['Unigram', 'Bigram']:
#   for col_b in ['Preserve', 'Remove']:
#     for col_c in ['TFIDF', 'Word Count']:
#       top_scores[curr_score] = [col_a, col_b, col_c] + top_scores[curr_score]
#       curr_score += 1
pd.DataFrame(top_scores,
             columns=['Unigram v.s. Bigram',
                      'Preserve v.s. Remove Punctuation',
                      'TFIDF v.s. Word Counts',
                      'Regularization Parameter',
                      'MSE on Validation',
                      'MSE on Test'])

Unnamed: 0,Unigram v.s. Bigram,Preserve v.s. Remove Punctuation,TFIDF v.s. Word Counts,Regularization Parameter,MSE on Validation,MSE on Test
0,Unigram,Preserve,TFIDF,0.1,[1.1439519082131546],[1.0891737182200811]
1,Unigram,Preserve,Word Count,100.0,[1.2488088239460031],[1.180399104569241]
2,Unigram,Remove,TFIDF,0.1,[1.1444348196780214],[1.0900705908180979]
3,Unigram,Remove,Word Count,100.0,[1.2421320381073877],[1.1772861075892005]
4,Bigram,Preserve,TFIDF,0.1,[1.2098393631698006],[1.1527314712509042]
5,Bigram,Preserve,Word Count,100.0,[1.2274757253057103],[1.1845733604836595]
6,Bigram,Remove,TFIDF,0.1,[1.2302925879632027],[1.169080802677434]
7,Bigram,Remove,Word Count,100.0,[1.2295289082311316],[1.1726839259931665]
