In [1]:
import gzip
import random
import numpy as np
import string
import math
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wingfungleung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/wingfungleung/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def parse(f):
    for l in gzip.open(f):
        yield eval(l)
dataset = list(parse('train_Category.json.gz'))[:10000]

In [3]:
len(dataset)

10000

In [4]:
#shuffling data
random.shuffle(dataset)

1. How many unique bigrams are there amongst the reviews? List the 5 most-frequently-occurring bigrams along with their number of occurrences in the corpus (1 mark).


In [5]:
#Unique words in the train set
biwordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    nltk_tokens = nltk.word_tokenize(r)
    bigrams = list(nltk.bigrams(nltk_tokens))
    for i in bigrams:
        biwordCount[i] += 1

In [6]:
bicounts = [(biwordCount[w], w) for w in biwordCount]
bicounts.sort()
bicounts.reverse()
common_five_words = [(x[0], x[1]) for x in bicounts[:5]]
common_five_words

[(4441, ('this', 'game')),
 (4263, ('the', 'game')),
 (3359, ('of', 'the')),
 (2041, ('if', 'you')),
 (2017, ('in', 'the'))]

In [7]:
print('number of unique bigrams: ', len(biwordCount))

number of unique bigrams:  256326


2. The code provided performs least squares using the 1000 most common unigrams. Adapt it to use the 1000 most common bigrams and report the MSE obtained using the new predictor (use bigrams only, i.e., not unigrams+bigrams) (1 mark). Note that the code performs regularized regression with a regularization parameter of 1.0. The prediction target should be log2(hours + 1) (i.e., our transformed time variable).

In [8]:
common_1000_biwords = [(x[0], x[1]) for x in bicounts[:1000]]
biwords = [x[1] for x in bicounts[:1000]]
wordId = dict(zip(biwords, range(len(biwords))))
wordSet = set(biwords)


def feature(datum):
    feat = [0]*len(biwords)
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    nltk_tokens = nltk.word_tokenize(r)
    bigrams = list(nltk.bigrams(nltk_tokens))
    for i in bigrams:
        if i in biwords:
            feat[wordId[i]] += 1
    feat.append(1) #offset
    return feat

X = [feature(d) for d in dataset]

y = [d['hours'] for d in dataset]
y = [math.log2(d+1) for d in y] #transformed

# # Regression
# theta,residuals,rank,s = np.linalg.lstsq(X, y, rcond=None)
# theta

In [9]:
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
print('mean square error: ', mean_squared_error(y, predictions))

mean square error:  4.391761206363515


3. Repeat the above experiment using unigrams and bigrams, still considering the 1000 most common. That is, your model will still use 1000 features (plus an offset), but those 1000 features will be some combination of unigrams and bigrams. Report the MSE obtained using the new predictor (1 mark).

In [10]:
#unigram
uniwordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        uniwordCount[w] += 1
    
unicounts = [(uniwordCount[w], w) for w in uniwordCount]
unicounts.sort()
unicounts.reverse()
common_1000_uniwords = [(x[0], x[1]) for x in unicounts[:1000]]

In [11]:
#combine bigrams and unigrams words
unibi_lst = common_1000_biwords + common_1000_uniwords

In [12]:
#shuffle data
random.shuffle(unibi_lst)

In [13]:
#1000 features with some combination of unigrams and bigrams.
unibiCount = unibi_lst[:1000]

common_1000_unibiwords = [(x[0], x[1]) for x in unibiCount[:1000]]
unibiwords = [x[1] for x in unibiCount[:1000]]
wordId = dict(zip(unibiwords, range(len(unibiwords))))
wordSet = set(unibiwords)

In [14]:
def feature2(datum):
    feat = [0]*len(unibiwords)
    #bigrams
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    nltk_tokens = nltk.word_tokenize(r)
    bigrams = list(nltk.bigrams(nltk_tokens))
    for i in bigrams:
        if i in unibiwords:
            feat[wordId[i]] += 1
    #unigrams
    r = ''.join([c for c in datum['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in unibiwords:
            feat[wordId[w]] += 1
    feat.append(1) #offset
    return feat

In [15]:
X = [feature2(d) for d in dataset]

y = [d['hours'] for d in dataset]
y = [math.log2(d+1) for d in y] #transformed

# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
print('mean square error: ', mean_squared_error(y, predictions))

mean square error:  4.284546029778992


4. What is the inverse document frequency of the words ‘destiny’, ‘annoying’, ‘likeable’, ‘chapter’, and ‘interesting’? What are their tf-idf scores in review ID r75487422 (using log base 10, unigrams only, following the first definition of tf-idf given in the slides) (1 mark)?

In [16]:
words = [ 'destiny', 'annoying', 'likeable', 'chapter', 'interesting']
wordDict = dict.fromkeys(words, 0)
word_list = []
for d in dataset:
    wordDict = dict.fromkeys(words, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            wordDict[w] += 1
    word_list.append(wordDict)

In [17]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idf = computeIDF(word_list)
idf

{'destiny': 7.824046010856292,
 'annoying': 4.2336066295556085,
 'likeable': 7.1308988302963465,
 'chapter': 5.115995809754082,
 'interesting': 3.128121461599737}

In [18]:
tfidf_list = []
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)
for i in range(len(dataset)):
    tfidf_list.append(computeTFIDF(word_list[i],idf))

In [19]:
index = 0
for d in dataset:
    if d['reviewID'] == 'r75487422':
        print(d['text'])
        break  
    index += 1

Short Review:
A good starting chapter for this series, despite the main character being annoying (for now) and a short length. The story is good and actually gets more interesting. Worth the try.
Long Review:
Blackwell Legacy is the first on the series of (supposedly) 5 games that talks about the main protagonist, Rosangela Blackwell, as being a so called Medium, and in this first chapter we get to know how her story will start and how she will meet her adventure companion Joey...and really, that's really all for for now and that's not a bad thing, because in a way this game wants to show how hard her new job is, and that she cannot escape her destiny as a medium.
My biggest complain for this chapter, except the short length, it's the main protagonist being a "bit" too annoying to be likeable, and most of her dialogues will always be about complaining or just be annoyed. Understandable, sure, but lighten' up will ya!?
However, considering that in the next installments she will be much 

In [20]:
tfidf_list[index]

{'destiny': 7.824046010856292,
 'annoying': 8.467213259111217,
 'likeable': 14.261797660592693,
 'chapter': 15.347987429262247,
 'interesting': 6.256242923199474}

5. Adapt your unigram model to use the tfidf scores of words, rather than a bag-of-words representation. That is, rather than your features containing the word counts for the 1000 most common unigrams, it should contain tfidf scores for the 1000 most common unigrams. Report the MSE of this new model (1 mark).

In [21]:
#unigram
uniwordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in dataset:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        uniwordCount[w] += 1
    
unicounts = [(uniwordCount[w], w) for w in uniwordCount]
unicounts.sort()
unicounts.reverse()
common_1000_uniwords = [(x[0], x[1]) for x in unicounts[:1000]]
uniwords = [ d[1] for d in common_1000_uniwords]

In [22]:
word_list = []
for d in dataset:
    wordDict = dict.fromkeys(uniwords, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in uniwords:
            wordDict[w] += 1
    word_list.append(wordDict)

In [23]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idf = computeIDF(word_list)

In [24]:
tfidf_list = []
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)
for i in range(len(dataset)):
    tfidf_list.append(computeTFIDF(word_list[i],idf))

In [25]:
wordId = dict(zip(uniwords, range(len(uniwords))))
wordSet = set(uniwords)
def feature3(index, datum):
    feat = [0]*len(uniwords)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in uniwords:
            feat[wordId[w]] = tfidf_list[index][w]
    feat.append(1) #offset
    return feat

In [26]:
X = [feature3(i, dataset[i]) for i in range(len(dataset))]
y = [d['hours'] for d in dataset]
y = [math.log2(d+1) for d in y] #transformed

In [27]:
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
print('mean square error: ', mean_squared_error(y, predictions))

mean square error:  5.195723126763592


6. Which other review has the highest cosine similarity compared to review ID r75487422, in terms of their tf-idf representations (considering unigrams only). Provide the reviewID, or the text of the review (1 mark)?

In [31]:
index = 0
for d in dataset:
    if d['reviewID'] == 'r75487422':
        break  
    index += 1

score = []
for i in range(len(dataset)):
    if i != index:
        score.append((cosine_similarity([X[i]], [X[index]])[0][0], i))

In [32]:
max_score = max(score)
max_score

(0.9833646596609258, 7919)

In [34]:
dataset[max_score[1]]['text']

'Tried to play this game in a Petco the other day and got escorted out by security. They thought I only put one ferret down my pants though, so I got out of there with four of those ♥♥♥♥ers for free! Man ferrets are ♥♥♥♥ing crazy and after what I experienced I can only assume that these creatures see human taint as their biggest natural predator. They ripped me up real good and turned me into a human sock puppet.\nHey did you know ferrets arew the only species in the animal kingdom that feel regret? In the case of this game, I\'m pretty sure all of those little ♥♥♥♥s regret being born in ♥♥♥♥ing Scotland, the only country where this♥♥♥♥♥♥won\'t get you a court date. Then again in America we have a sport we like to call "is this cop gonna shoot me?" but much like basketball it\'s not for white people, so hey I guess every country has its quirks!\nAnyway thanks Petco for this new fur coat I just made, gonna wear it to my next crisis actor meet n greet at bohemian grove. There\'s no ferre

7. Implement a validation pipeline for this same data, by randomly shuffling the data, using 10,000 reviews for training, another 100,000 for validation, and another 10,000 for testing. Consider regularization parameters in the range{0.01, 0.1, 1, 10, 100}, and report MSEs on the test set for the model that performs best on the validation set. Using this pipeline, compare the following alternatives in terms of their performance(all using 1,000 dimensional word features)

In [31]:
def parse(f):
    for l in gzip.open(f):
        yield eval(l)
dataset = list(parse('train_Category.json.gz'))
#shuffling data
random.shuffle(dataset)
train_set = dataset[:10000]
validation_set = dataset[10000:20000]
test_set = dataset[20000:30000]

In [32]:
#Unigrams, keep punctuation, with tfidf scores
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

uniwordCount = defaultdict(int)
for d in train_set:
    r = ''.join([c for c in d['text'].lower()])
    tokens = word_tokenize(r)
    for w in tokens:
        uniwordCount[w] += 1

unicounts = [(uniwordCount[w], w) for w in uniwordCount]
unicounts.sort()
unicounts.reverse()
uniwords = [ d[1] for d in unicounts]


In [None]:
#tfidf
word_list = []
for d in train_set:
    wordDict = dict.fromkeys(uniwords, 0)
    r = ''.join([c for c in d['text'].lower()])
    tokens = word_tokenize(r)
    for w in tokens:
        if w in uniwords:
            wordDict[w] += 1
    word_list.append(wordDict)
    
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idf = computeIDF(word_list)

tfidf_list = []
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)
for i in range(len(train_set)):
    tfidf_list.append(computeTFIDF(word_list[i],idf))

In [None]:
wordId = dict(zip(uniwords, range(len(uniwords))))
wordSet = set(uniwords)
def model1(index, datum):
    feat = [0]*len(uniwords)
    r = ''.join([c for c in d['text'].lower()])
    tokens = word_tokenize(r)
    for w in tokens:
        if w in uniwords:
            feat[wordId[w]] = tfidf_list[index][w]
    feat.append(1) #offset
    return feat

X = [model1(i, train_set[i]) for i in range(len(train_set))]
y = [d['hours'] for d in train_set]
y = [math.log2(d+1) for d in y] #transformed

In [None]:
# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_

In [None]:
#mse for validation set
uniwordCount = defaultdict(int)
for d in validation_set:
    r = ''.join([c for c in d['text'].lower()])
    tokens = word_tokenize(r)
    for w in tokens:
        uniwordCount[w] += 1

unicounts = [(uniwordCount[w], w) for w in uniwordCount]
unicounts.sort()
unicounts.reverse()
uniwords = [ d[1] for d in unicounts]

word_list = []
for d in validation_set:
    wordDict = dict.fromkeys(uniwords, 0)
    r = ''.join([c for c in d['text'].lower()])
    tokens = word_tokenize(r)
    for w in tokens:
        if w in uniwords:
            wordDict[w] += 1
    word_list.append(wordDict)
    
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idf = computeIDF(word_list)

tfidf_list = []
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)
for i in range(len(validation_set)):
    tfidf_list.append(computeTFIDF(word_list[i],idf))
    

wordId = dict(zip(uniwords, range(len(uniwords))))
wordSet = set(uniwords)
def model1(index, datum):
    feat = [0]*len(uniwords)
    r = ''.join([c for c in d['text'].lower()])
    tokens = word_tokenize(r)
    for w in tokens:
        if w in uniwords:
            feat[wordId[w]] = tfidf_list[index][w]
    feat.append(1) #offset
    return feat

X = [model1(i, validation_set[i]) for i in range(len(validation_set))]
y = [d['hours'] for d in validation_set]
y = [math.log2(d+1) for d in y] #transformed

predictions = clf.predict(X)
print('mean square error: ', mean_squared_error(y, predictions))

In [None]:
#Unigrams, discard punctuation, with tfidf scores
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

uniwordCount = defaultdict(int)
for d in train_set:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    tokens = word_tokenize(r)
    for w in tokens:
        uniwordCount[w] += 1

unicounts = [(uniwordCount[w], w) for w in uniwordCount]
unicounts.sort()
unicounts.reverse()
uniwords = [ d[1] for d in unicounts]



#tfidf
word_list = []
for d in train_set:
    wordDict = dict.fromkeys(uniwords, 0)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    tokens = word_tokenize(r)
    for w in tokens:
        if w in uniwords:
            wordDict[w] += 1
    word_list.append(wordDict)
    
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idf = computeIDF(word_list)

tfidf_list = []
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)
for i in range(len(train_set)):
    tfidf_list.append(computeTFIDF(word_list[i],idf))
    
    
wordId = dict(zip(uniwords, range(len(uniwords))))
wordSet = set(uniwords)
def model1(index, datum):
    feat = [0]*len(uniwords)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in uniwords:
            feat[wordId[w]] = tfidf_list[index][w]
    feat.append(1) #offset
    return feat

X = [model1(i, train_set[i]) for i in range(len(train_set))]
y = [d['hours'] for d in train_set]
y = [math.log2(d+1) for d in y] #transformed


# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_


#mse for validation set
uniwordCount = defaultdict(int)
for d in validation_set:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    tokens = word_tokenize(r)
    for w in tokens:
        uniwordCount[w] += 1

unicounts = [(uniwordCount[w], w) for w in uniwordCount]
unicounts.sort()
unicounts.reverse()
uniwords = [ d[1] for d in unicounts]

word_list = []
for d in validation_set:
    wordDict = dict.fromkeys(uniwords, 0)
    r = ''.join([c for c in d['text'].lower()])
    tokens = word_tokenize(r)
    for w in tokens:
        if w in uniwords:
            wordDict[w] += 1
    word_list.append(wordDict)
    
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idf = computeIDF(word_list)

tfidf_list = []
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)
for i in range(len(validation_set)):
    tfidf_list.append(computeTFIDF(word_list[i],idf))
    

wordId = dict(zip(uniwords, range(len(uniwords))))
wordSet = set(uniwords)
def model1(index, datum):
    feat = [0]*len(uniwords)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in uniwords:
            feat[wordId[w]] = tfidf_list[index][w]
    feat.append(1) #offset
    return feat

X = [model1(i, validation_set[i]) for i in range(len(validation_set))]
y = [d['hours'] for d in validation_set]
y = [math.log2(d+1) for d in y] #transformed

predictions = clf.predict(X)
print('mean square error: ', mean_squared_error(y, predictions))

In [None]:
#unigrams, without punctuation, with counts
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

uniwordCount = defaultdict(int)
for d in train_set:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    tokens = word_tokenize(r)
    for w in tokens:
        uniwordCount[w] += 1

unicounts = [(uniwordCount[w], w) for w in uniwordCount]
unicounts.sort()
unicounts.reverse()
uniwords = [ d[1] for d in unicounts]



wordId = dict(zip(uniwords, range(len(uniwords))))
wordSet = set(uniwords)
def model2(index, datum):
    feat = [0]*len(uniwords)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in uniwords:
            feat[wordId[w]] = tfidf_list[index][w]
    feat.append(1) #offset
    return feat

X = [model1(i, train_set[i]) for i in range(len(train_set))]
y = [d['hours'] for d in train_set]
y = [math.log2(d+1) for d in y] #transformed


# Regularized regression
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_


#mse for validation set
uniwordCount = defaultdict(int)
for d in validation_set:
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    tokens = word_tokenize(r)
    for w in tokens:
        uniwordCount[w] += 1
    

unicounts = [(uniwordCount[w], w) for w in uniwordCount]
unicounts.sort()
unicounts.reverse()
uniwords = [ d[1] for d in unicounts]

word_list = []
for d in validation_set:
    wordDict = dict.fromkeys(uniwords, 0)
    r = ''.join([c for c in d['text'].lower()])
    tokens = word_tokenize(r)
    for w in tokens:
        if w in uniwords:
            wordDict[w] += 1
    word_list.append(wordDict)
    
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idf = computeIDF(word_list)

tfidf_list = []
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)
for i in range(len(validation_set)):
    tfidf_list.append(computeTFIDF(word_list[i],idf))
    

wordId = dict(zip(uniwords, range(len(uniwords))))
wordSet = set(uniwords)
def model2(index, datum):
    feat = [0]*len(uniwords)
    r = ''.join([c for c in d['text'].lower() if not c in punctuation])
    for w in r.split():
        if w in uniwords:
            feat[wordId[w]] = tfidf_list[index][w]
    feat.append(1) #offset
    return feat

X = [model1(i, validation_set[i]) for i in range(len(validation_set))]
y = [d['hours'] for d in validation_set]
y = [math.log2(d+1) for d in y] #transformed

predictions = clf.predict(X)
print('mean square error: ', mean_squared_error(y, predictions))