# Section 1 (Regression)

In [1]:
import gzip
import sklearn
from sklearn import linear_model
from collections import defaultdict
import random
import math
import numpy as np
from scipy import spatial

In [2]:
def parse(f):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    Parameters
    ----------
    f : path of the gzip file to read; it is opened with ``gzip.open`` in its
        default (binary read) mode.

    Yields
    ------
    The result of ``eval`` applied to each line of the file, in file order.
    """
    # SECURITY: eval() executes arbitrary Python found in the file. Only use
    # this on data from a trusted source; for literal-only records consider
    # ast.literal_eval (or json.loads when the lines are valid JSON).
    # The context manager guarantees the file handle is closed once the
    # generator is exhausted or garbage-collected.
    with gzip.open(f) as handle:
        for l in handle:
            yield eval(l)

In [3]:
# Download data from below:
# https://drive.google.com/uc?id=1V4MLeoEiPQdocCbUHjR_7L9ZmxTufPFe
# Or:
# https://cseweb.ucsd.edu/classes/fa20/cse258-a/files/
dataset = list(parse("goodreads_reviews_comics_graphic.json.gz"))

In [4]:
len(dataset)

542338

In [5]:
dataset[1]

{'user_id': 'bafc2d50014200cda7cb2b6acd60cd73',
 'book_id': '6315584',
 'review_id': '72f1229aba5a88f9e72f0dcdc007dd22',
 'rating': 4,
 'review_text': "I've never really liked Spider-Man. I am, however, a huge fan of the Dresden Files. Jim Butcher is clever and sarcastic and probably the perfect choice to pen a superhero novel. I really enjoyed this book!",
 'date_added': 'Wed Aug 10 06:06:48 -0700 2016',
 'date_updated': 'Fri Aug 12 08:49:54 -0700 2016',
 'read_at': 'Fri Aug 12 08:49:54 -0700 2016',
 'started_at': 'Wed Aug 10 00:00:00 -0700 2016',
 'n_votes': 0,
 'n_comments': 0}

In [6]:
def feature(d):
    """Build the regression feature vector for one review record.

    The vector is ``[1, rating, n_comments]`` followed by six day-of-week
    indicators for Tue..Sun, taken from the first three characters of
    ``d['date_added']``. Monday is the dropped reference category, so a
    Monday review has all six indicators equal to 0.
    """
    weekdays = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    dayDict = {name: position for position, name in enumerate(weekdays)}
    # A day abbreviation outside the table raises KeyError
    active = dayDict[d['date_added'][:3]]
    # One-hot encoding of Tue..Sun (positions 1..6); Monday is left out
    indicators = [1 if position == active else 0 for position in range(1, len(weekdays))]
    return [1, d['rating'], d['n_comments']] + indicators

In [7]:
X = [feature(d) for d in dataset]
y = [len(d['review_text']) for d in dataset]

In [8]:
model = sklearn.linear_model.LinearRegression()

In [9]:
model.fit(X,y)

LinearRegression()

In [10]:
yPred = model.predict(X)

In [11]:
def MSE(predictions, labels):
    """Return the mean squared error between paired predictions and labels.

    Pairs are formed with ``zip``, so any extra elements of the longer input
    are ignored. Empty input raises ZeroDivisionError.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        residual = predicted - actual
        squared_errors.append(residual ** 2)
    return sum(squared_errors) / len(squared_errors)

In [12]:
mse = MSE(yPred, y)

In [13]:
mse

624989.9720071985

#### Question 1a / 2a

In [14]:
# A disadvantage would be overfitting, and may not represent an overall good predictor,
# as it would leave out many data points that are outliers. We also lose many data points resulting in smaller training,
# and would not be good for predicting data with large review lengths

# An advantage would be that it reduces the MSE overall as our model is less dynamic
# It also decreases the time to train as we have fewer data points


# Choose the kept values based on Interquartile range, and build new X and y within that range. An interquartile range
# selects within a certain percentile, we can use this to discard values too low, and too high. I chose this because it is a
# reasonable range, representing a bell curve, and removing the majority of outliers
# 3rd and 1st quartiles of the review lengths (247 and 94); outliers lie far above or below them
high, low = np.percentile(y, [75, 25])
iqr = high - low

# Remove outliers: keep only the points strictly inside the 1.5 * IQR fences
lower_fence = low - iqr * 1.5
upper_fence = high + iqr * 1.5
X_outlier = []
y_outlier = []
for features, length in zip(X, y):
    if lower_fence < length < upper_fence:
        X_outlier.append(features)
        y_outlier.append(length)

# Fit on the retained points only, and evaluate on those same points
model = sklearn.linear_model.LinearRegression()
model.fit(X_outlier, y_outlier)
yPred = model.predict(X_outlier)
mse = MSE(yPred, y_outlier)

# As we can see, the MSE is better as it predicts non-outliers much better, which may show an improved predictor. However
# we are dealing with less information so we cannot say much for variable review lengths
mse

93882.12153472501

#### Question 1b/2b

In [15]:
# A disadvantage would be that the significance of the variance is lost, i.e. super long reviews are treated as much shorter

# An advantage would be a better representation of length of reviews, as they are less dynamic
# and better controlled. The curve of the model is also flattened to better fit a straight line, with much better MSE


# Apply square root to length of reviews
# I chose this as it is a good way to transform y to reduced values, similar to log
# (y already holds len(d['review_text']) for every review, so it is reused here)
y_sqrt = [math.sqrt(length) for length in y]

# Normalize length of reviews
# I chose this as it is an intuitive way to reduce y
# The maximum is stored as y_max so that the builtin max() is not shadowed
# for the rest of the notebook.
y_max = np.max(y)
y_normalize = [length / y_max for length in y]


# Predict: fit one model per transformed target and score it on the training data
model = sklearn.linear_model.LinearRegression()
model.fit(X, y_sqrt)
yPred = model.predict(X)
mse_sqrt = MSE(yPred, y_sqrt)

model = sklearn.linear_model.LinearRegression()
model.fit(X, y_normalize)
yPred = model.predict(X)
mse_normalize = MSE(yPred, y_normalize)

# Overall I would choose the normalized predictor as the MSE is better. The performances were vastly improved to the original
# based on our MSE, but that may have to do with our y having smaller values as well
# NOTE(review): the two MSEs are measured on different scales (square-root lengths vs. fractions of the
# maximum length), so they are not directly comparable with each other or with the raw-length MSE.
# Dividing y by a constant c only rescales the least-squares MSE by 1 / c**2.
mse_sqrt, mse_normalize

(158.4185033539574, 0.0015632564605444928)

#### Question 1c/2c

In [16]:
# A simple classifier, that just decides whether it is above or below median. I chose this because it is deterministic
def binary_outcome(median, val):
    """Return 0 when ``val`` is at or below ``median``, otherwise 1."""
    return 0 if val <= median else 1

In [17]:
# Disadvantages include getting less information out of predicting any dataset, since we can only get two possibilities,
# instead of a value
# It also cannot estimate a review length beyond whether it is below or above a value

# An advantage is that the prediction is clearer: a true/false answer, rather than an obscure value


# Predict accuracy: threshold both the true and the predicted lengths at the median
median = np.median(y)
model = sklearn.linear_model.LinearRegression()
model.fit(X, y)
yPred = model.predict(X)
# A prediction counts as correct when it falls on the same side of the median as the label
matches = [
    binary_outcome(median, actual) == binary_outcome(median, predicted)
    for actual, predicted in zip(y, yPred)
]
correct = sum(matches)
accuracy = correct / len(y)

# Accuracy is 50% so this predictor is trivial and does not perform well. This may be due to the many outliers
accuracy

0.4989102736669752

#### Question 1d/2d

In [18]:
# Chose this objective as it is less sensitive to outliers. Compared to MSE, the difference is not squared
def MAE(predictions, labels):
    """Return the mean absolute error between paired predictions and labels.

    Pairs are formed with ``zip``, so any extra elements of the longer input
    are ignored. Empty input raises ZeroDivisionError.
    """
    absolute_errors = []
    for predicted, actual in zip(predictions, labels):
        absolute_errors.append(abs(predicted - actual))
    return sum(absolute_errors) / len(absolute_errors)

In [19]:
# Disadvantages include it being less effective with a tight dataset
# It fails to punish large errors

# Advantages include it being less sensitive to outliers, as error differences are not exponentiated


# Predict: refit on the raw review lengths and score with the absolute-error objective
model = sklearn.linear_model.LinearRegression().fit(X, y)
yPred = model.predict(X)
mae = MAE(yPred, y)

# Our value is smaller than the MSE as it is less sensitive to outliers. This reduces our error number, but this approach
# might not be good for tighter datasets
mae

464.48566706945536

#### Question 3

Note: You can insert an image (e.g. containing a hand-written solution) via edit->insert image

#### Question 4

##### a) 
Good features to estimate tips of a person taking a taxicab trip may be the following:
    
    * religion: some faiths believe in being generous
    * ethnicity: race of driver
    * salary: a higher salary means more to give, or a lower salary may make someone more generous
    
Good features of a trip itself may be:
    
    * time_taken: how long to complete a trip
    * number_of_tolls: how many toll roads were taken
    * amount_of_passengers: how many passengers were present
    * weather: whether it was raining, snowing, etc
    * geography: urban, rural, etc
    * ethnicity_match: if driver and passengers were all the same ethnicity

##### b)
    * religion: can be one-hot-encoded for a finite amount of religions
    * ethnicity: can be one-hot-encoded for a finite number of categories
    * salary: can be represented as an integer
    
    * time_taken: can be represented as an integer of the number of minutes
    * number_of_tolls: can be represented as an integer
    * amount_of_passengers: can be represented as an integer
    * weather: can be categorically encoded. i.e. 0 for sunny, 1 for raining, ...
    * geography: can be categorically encoded. i.e. 0 for urban, 1 for rural, ...
    * ethnicity_match: can be a boolean. 1 if they're all the same ethnicity, 0 otherwise

##### c)
This would best be represented as a regression problem. The goal would be to predict the tip that would be given by the passenger(s). Transforming salary by reduction or removal might be useful, as there may be people who make 0 or too much money (e.g. a billionaire); however, it shouldn't be a problem for most use cases, as salaries usually fall within a limited range. Transforming other features may be useful as well, e.g. if only Buddhists ride a taxi in a dataset.

# Section 2 (Classification)

#### Question 5

#### Question 6

# Section 3 (Recommender Systems)

In [20]:
# Code based on http://cseweb.ucsd.edu/classes/fa20/cse258-a/code/workbook4.html

In [21]:
# Utility data structures
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
usersPerItem = defaultdict(set) # U_i from class slides
itemsPerUser = defaultdict(set) # I_u from class slides

In [22]:
# Index every review by its user and by its book, and record who reviewed what
for d in dataset:
    user = d['user_id']
    item = d['book_id']
    reviewsPerUser[user].append(d)
    itemsPerUser[user].add(item)
    reviewsPerItem[item].append(d)
    usersPerItem[item].add(user)

In [23]:
ratingMean = sum([d['rating'] for d in dataset]) / len(dataset)

In [24]:
ratingMean

3.778138356523054

In [25]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two sets.

    Parameters
    ----------
    s1, s2 : sets (anything providing ``intersection`` and ``union``).

    Returns
    -------
    A float in [0, 1]. Two empty sets have no elements in common, so their
    similarity is defined as 0.0 instead of raising ZeroDivisionError.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        # Both sets are empty: nothing is shared, so report zero similarity
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom

In [26]:
# This function should be re-defined for each of your model variants
def predictRating(user, item):
    """Predict ``user``'s rating of ``item`` with item-to-item similarity.

    Every other book the user has reviewed contributes its rating, weighted
    by the Jaccard similarity between the set of users who reviewed that book
    and the set of users who reviewed ``item``. When the weights sum to zero
    the global mean rating is returned instead.
    """
    ratings = []
    similarities = []
    for review in reviewsPerUser[user]:
        other_item = review['book_id']
        if other_item != item:
            ratings.append(review['rating'])
            similarities.append(Jaccard(usersPerItem[item], usersPerItem[other_item]))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        weightedRatings = [rating * weight for rating, weight in zip(ratings, similarities)]
        return sum(weightedRatings) / total_similarity

    # User hasn't rated any similar items
    return ratingMean

In [27]:
# Example use:

In [28]:
dataset[1]

{'user_id': 'bafc2d50014200cda7cb2b6acd60cd73',
 'book_id': '6315584',
 'review_id': '72f1229aba5a88f9e72f0dcdc007dd22',
 'rating': 4,
 'review_text': "I've never really liked Spider-Man. I am, however, a huge fan of the Dresden Files. Jim Butcher is clever and sarcastic and probably the perfect choice to pen a superhero novel. I really enjoyed this book!",
 'date_added': 'Wed Aug 10 06:06:48 -0700 2016',
 'date_updated': 'Fri Aug 12 08:49:54 -0700 2016',
 'read_at': 'Fri Aug 12 08:49:54 -0700 2016',
 'started_at': 'Wed Aug 10 00:00:00 -0700 2016',
 'n_votes': 0,
 'n_comments': 0}

In [29]:
u,i = dataset[1]['user_id'], dataset[1]['book_id']
predictRating(u,i)

4.44493246042927

In [30]:
# Draw the 1000-review evaluation sample from a dedicated, seeded generator so that
# Restart & Run All scores every model variant on the same reviews, without touching
# the global `random` state.
# NOTE: outputs recorded below came from an unseeded sample, so the exact MSE values
# will differ after re-running.
sample = random.Random(0).sample(dataset, 1000)
sampleLabels = [d['rating'] for d in sample]

In [31]:
# Baseline prediction
alwaysPredictMean = [ratingMean for d in sample]

In [32]:
# Prediction using item-to-item similarity above
cfPredictions = [predictRating(d['user_id'], d['book_id']) for d in sample]

In [33]:
# Baseline accuracy
MSE(alwaysPredictMean, sampleLabels)

1.3390861315078861

In [34]:
# Item-to-item similarity accuracy
MSE(cfPredictions, sampleLabels)

1.0296334594874024

#### Question 7 (a) (i.e., the first of your three variants)

In [74]:
# Interchange users and items from the previous function
def predictRating(user, item):
    """Predict ``user``'s rating of ``item`` with user-to-user similarity.

    Every other user's review of ``item`` contributes its rating, weighted by
    the Jaccard similarity between that user's set of reviewed books and
    ``user``'s own set. When the weights sum to zero the global mean rating
    is returned instead.
    """
    ratings = []
    similarities = []
    for review in reviewsPerItem[item]:
        other_user = review['user_id']
        if other_user != user:
            ratings.append(review['rating'])
            similarities.append(Jaccard(itemsPerUser[user], itemsPerUser[other_user]))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        weightedRatings = [rating * weight for rating, weight in zip(ratings, similarities)]
        return sum(weightedRatings) / total_similarity

    # No other reviewer of this item has positive similarity to the user
    return ratingMean

In [75]:
# Prediction using user-to-user similarity above
cfPredictions = [predictRating(d['user_id'], d['book_id']) for d in sample]

In [76]:
# User-to-user similarity accuracy
MSE(cfPredictions, sampleLabels)

1.3970801064452454

#### Question 7 (b)

In [80]:
# Choose the overlap coefficient instead of Jaccard. This divides the intersection size by the size of the smaller of s1 and s2
def Overlap(s1, s2):
    """Return the overlap coefficient |s1 & s2| / min(|s1|, |s2|) of two sets.

    Parameters
    ----------
    s1, s2 : sized sets (anything providing ``intersection`` and ``len``).

    Returns
    -------
    A float in [0, 1]. When either set is empty nothing can be shared, so the
    similarity is defined as 0.0 instead of raising ZeroDivisionError.
    """
    denom = min(len(s1), len(s2))
    if denom == 0:
        # At least one set is empty: the intersection is empty as well
        return 0.0
    numer = len(s1.intersection(s2))
    return numer / denom

In [81]:
# Use the user-to-user predictRating from 7(a), but with the Overlap coefficient instead of Jaccard
def predictRating(user, item):
    """Predict ``user``'s rating of ``item`` with user-to-user Overlap similarity.

    Every other user's review of ``item`` contributes its rating, weighted by
    the overlap coefficient between that user's set of reviewed books and
    ``user``'s own set. When the weights sum to zero the global mean rating
    is returned instead.
    """
    ratings = []
    similarities = []
    for review in reviewsPerItem[item]:
        other_user = review['user_id']
        if other_user != user:
            ratings.append(review['rating'])
            similarities.append(Overlap(itemsPerUser[user], itemsPerUser[other_user]))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        weightedRatings = [rating * weight for rating, weight in zip(ratings, similarities)]
        return sum(weightedRatings) / total_similarity

    # No other reviewer of this item has positive similarity to the user
    return ratingMean

In [82]:
# Prediction using user-to-user Overlap similarity above
cfPredictions = [predictRating(d['user_id'], d['book_id']) for d in sample]

In [83]:
# User-to-user (Overlap) similarity accuracy
MSE(cfPredictions, sampleLabels)

1.3768558121005727

#### Question 7 (c)

In [62]:
# Now initially subtract the mean rating, so that we are weighting deviations from the mean
def predictRating(user, item):
    """Predict ``user``'s rating of ``item`` from mean-centred item-to-item similarity.

    For every other book the user has reviewed, the deviation of the user's
    rating from that book's average rating is weighted by the Jaccard
    similarity between the two books' reviewer sets. The weighted mean
    deviation is added to the average rating of ``item``. When the weights
    sum to zero the global mean rating is returned instead.
    """
    def average_rating(book):
        # Mean of all ratings recorded for this book
        reviews = reviewsPerItem[book]
        return sum(r['rating'] for r in reviews) / len(reviews)

    ratings = []
    similarities = []
    averages = []
    for review in reviewsPerUser[user]:
        other_item = review['book_id']
        if other_item != item:
            ratings.append(review['rating'])
            similarities.append(Jaccard(usersPerItem[item], usersPerItem[other_item]))
            averages.append(average_rating(other_item))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        # Weight each rating's deviation from its own book's average
        weightedRatings = [
            (rating - mean) * weight
            for rating, weight, mean in zip(ratings, similarities, averages)
        ]
        avg_item = average_rating(item)
        return avg_item + sum(weightedRatings) / total_similarity

    # User hasn't rated any similar items
    return ratingMean

In [63]:
# Prediction using item-to-item similarity above and mean
cfPredictions = [predictRating(d['user_id'], d['book_id']) for d in sample]

In [64]:
# Item-to-item similarity accuracy
MSE(cfPredictions, sampleLabels)

0.8225498367471318