In [138]:
import numpy
import urllib
import scipy.optimize
import random
from sklearn import linear_model
import gzip
from collections import defaultdict

In [139]:
import warnings
# Installs an "ignore" filter that matches every warning category for the rest of the session.
# NOTE(review): this presumably also hides scikit-learn convergence warnings from the
# model fits later in the notebook — confirm that is acceptable.
warnings.filterwarnings("ignore")

In [140]:
def assertFloat(x):
    """Check that `x` can be converted with float().

    float() itself raises (ValueError / TypeError) when `x` is not convertible;
    the assertion then confirms the converted value is a plain float.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Check that `items` has exactly N entries and each one converts with float().

    Raises AssertionError on a length mismatch; float() raises on an entry
    that cannot be converted.
    """
    assert len(items) == N
    assert all(type(float(entry)) is float for entry in items)

In [141]:
f = open("5year.arff", 'r')

In [142]:
# Read and parse the data
# `f` is the ARFF file handle opened in the preceding cell; it is closed here
# once every row has been consumed (or if parsing fails).
try:
    # Skip the header: consume lines up to and including the '@data' marker.
    # Iterating the file (instead of looping on readline()) stops at end-of-file,
    # so a file without the marker raises instead of looping forever.
    for l in f:
        if '@data' in l:
            break
    else:
        raise ValueError("no '@data' marker found in input file")

    dataset = []
    for l in f:
        if '?' in l: # Missing entry
            continue
        if not l.strip(): # Blank line; float('') would raise
            continue
        l = l.split(',')
        # Prepend a constant 1 to the parsed values (presumably the offset/bias feature).
        values = [1] + [float(x) for x in l]
        values[-1] = values[-1] > 0 # Convert to bool
        dataset.append(values)
finally:
    f.close()

In [143]:
# Features are every entry of a row except the last; the label is the last entry.
X = [row[:-1] for row in dataset]
y = [row[-1] for row in dataset]

In [144]:
answers = {} # Your answers

In [145]:
import numpy as np

In [146]:
def accuracy(predictions, y):
    """Fraction of entries in `predictions` that equal the matching entry of `y`.

    Both arguments are coerced to NumPy arrays first, so the comparison is
    element-wise even when both are plain Python lists (where `==` would
    otherwise yield a single bool for the whole list).

    Parameters
    ----------
    predictions : array-like of predicted labels.
    y : array-like of true labels, same shape as `predictions`.

    Returns
    -------
    NumPy float: number of matches divided by len(y).

    Raises
    ------
    ValueError if the two inputs do not have the same shape.
    """
    predictions = np.asarray(predictions)
    y = np.asarray(y)
    if predictions.shape != y.shape:
        raise ValueError(
            "predictions and y must have the same shape, got %s and %s"
            % (predictions.shape, y.shape)
        )
    correct = np.sum(predictions == y)
    return correct / len(y)


In [147]:
def BER(predictions, y):
    """Balanced error rate: the mean, over the distinct labels in `y`, of the
    per-class error rate (1 - fraction of that class predicted correctly).

    Inputs are coerced to NumPy arrays so the per-class comparisons are
    element-wise regardless of whether lists or arrays are passed, and the
    class-membership mask is computed once per class.

    Parameters
    ----------
    predictions : array-like of predicted labels.
    y : array-like of true labels, aligned with `predictions`.

    Returns
    -------
    NumPy float: mean of the per-class error rates.
    """
    predictions = np.asarray(predictions)
    y = np.asarray(y)

    error_rates = []
    for cls in np.unique(y):
        in_class = (y == cls)
        # Members of this class whose prediction is also this class.
        correct = np.sum(in_class & (predictions == cls))
        error_rates.append(1 - (correct / np.sum(in_class)))

    return np.mean(error_rates)

In [148]:
### Question 1

In [149]:
# Logistic regression with C=1, fitted on X/y and then asked to predict those same rows.
mod = linear_model.LogisticRegression(C=1).fit(X, y)

pred = mod.predict(X)

In [150]:
acc1 = accuracy(pred, y)
ber1 = BER(pred, y)

In [151]:
answers['Q1'] = [float(acc1), float(ber1)] # Accuracy and balanced error rate

In [152]:
assertFloatList(answers['Q1'], 2)

In [153]:
### Question 2

In [154]:
# Same model as Question 1 but with class_weight='balanced'; predictions are again on X itself.
mod = linear_model.LogisticRegression(C=1, class_weight='balanced').fit(X, y)

pred = mod.predict(X)

In [155]:
acc2 = accuracy(pred, y)
ber2 = BER(pred, y)

In [156]:
answers['Q2'] = [float(acc2), float(ber2)]

In [157]:
assertFloatList(answers['Q2'], 2)

In [158]:
### Question 3

In [159]:
# Seed the stdlib RNG so the shuffle below is repeatable for a given input order.
random.seed(3)
# Shuffles `dataset` in place; re-running this cell shuffles the already-shuffled
# list, so the resulting order depends on how many times the cell has been run.
random.shuffle(dataset)

In [160]:
# Rebuild features/labels from `dataset` so they follow its current (shuffled) order.
X = [row[:-1] for row in dataset]
y = [row[-1] for row in dataset]

In [161]:
# Cut points: first half -> train, next quarter -> validation, remainder -> test.
trainEnd, validEnd = len(X)//2, (3*len(X))//4
Xtrain, Xvalid, Xtest = X[:trainEnd], X[trainEnd:validEnd], X[validEnd:]
ytrain, yvalid, ytest = y[:trainEnd], y[trainEnd:validEnd], y[validEnd:]

In [162]:
len(Xtrain), len(Xvalid), len(Xtest)

(1515, 758, 758)

In [163]:
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(Xtrain,ytrain)

In [164]:
# Predictions of the current `mod` on the train, validation and test splits (in that order).
predTrain, predValid, predTest = (
    mod.predict(split) for split in (Xtrain, Xvalid, Xtest)
)

In [165]:
# Balanced error rate of those predictions on each split (train, validation, test).
berTrain, berValid, berTest = (
    BER(predicted, actual)
    for predicted, actual in (
        (predTrain, ytrain),
        (predValid, yvalid),
        (predTest, ytest),
    )
)

In [166]:
answers['Q3'] = [float(berTrain), float(berValid), float(berTest)]

In [167]:
assertFloatList(answers['Q3'], 3)

In [168]:
answers['Q3']

[0.29287226079549855, 0.31592039800995025, 0.2585616438356164]

In [169]:
### Question 4

In [170]:
# Candidate values of C, and the validation BER obtained with each (same order).
cList = [10**-4, 10**-3, 0.01, 0.1, 1, 10, 100, 1000, 10000]
berList = []
for c in cList:
    # Train on the training split only; score on the validation split.
    mod = linear_model.LogisticRegression(C=c, class_weight='balanced').fit(Xtrain, ytrain)
    predValid = mod.predict(Xvalid)
    berValid = BER(predValid, yvalid)
    berList += [float(berValid)]

In [171]:
answers['Q4'] = berList

In [172]:
answers['Q4']

[0.3281320669380371,
 0.3193125282677522,
 0.3328810492989598,
 0.3179556761646314,
 0.31592039800995025,
 0.3111714156490276,
 0.29550300445822836,
 0.29618143050978873,
 0.29618143050978873]

In [173]:
assertFloatList(answers['Q4'], 9)

In [174]:
### Question 5

In [175]:
# Choose the C whose validation BER (berList, computed in Question 4) is lowest,
# instead of hard-coding it; on a tie the first (smallest) such C in cList is taken.
bestC = cList[berList.index(min(berList))]

In [176]:
# Refit on the training split with the chosen C and score the result on the test split.
mod = linear_model.LogisticRegression(C=bestC, class_weight='balanced').fit(Xtrain, ytrain)
predTest = mod.predict(Xtest)
ber5 = float(BER(predTest, ytest))

In [177]:
answers['Q5'] = [bestC, ber5]

In [178]:
answers['Q5']

[100, 0.26267123287671235]

In [179]:
assertFloatList(answers['Q5'], 2)

In [180]:
### Question 6

In [206]:
# Load one record per line from the gzipped file; the `with` block closes the
# handle even if a line fails to parse.
# NOTE(review): eval() executes each line as Python code — only safe if the file
# is trusted. Consider a literal/JSON parser if the data format allows it.
with gzip.open("young_adult_10000.json.gz") as f:
    dataset = []
    for l in f:
        dataset.append(eval(l))

In [207]:
# First 9000 records are used for training; everything after that for testing.
dataTrain, dataTest = dataset[:9000], dataset[9000:]

In [208]:
# Some data structures you might want

ratingDict = {} # (user, item) -> rating, for direct lookup of a specific pair
usersPerItem = defaultdict(set) # item -> set of users who rated it
itemsPerUser = defaultdict(set) # user -> set of items they rated
# Declared for convenience; nothing in this cell fills these two.
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

# Populate the lookups from the training records only.
for d in dataTrain:
    user, item = d['user_id'], d['book_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    ratingDict[(user, item)] = d['rating']

In [209]:
def Jaccard(s1, s2):
    """Jaccard similarity of two sets: |s1 ∩ s2| / |s1 ∪ s2|.

    Returns 0 when the union is empty (both sets empty).
    """
    shared = len(s1.intersection(s2))
    combined = len(s1.union(s2))
    return shared / combined if combined != 0 else 0


In [210]:
def mostSimilar(i, N):
    """Return the N items most similar to item `i`.

    Similarity is the Jaccard similarity between the set of users who rated
    `i` and the set of users who rated each other item in `usersPerItem`.

    Parameters
    ----------
    i : item identifier (a key of `usersPerItem`).
    N : number of results to return.

    Returns
    -------
    List of (similarity, item) tuples, highest similarity first; items with
    equal similarity keep their iteration order from `usersPerItem`.
    """
    # .get() instead of indexing: `usersPerItem` is a defaultdict, so indexing
    # an unknown item would silently insert an empty entry for it.
    users_for_item_i = usersPerItem.get(i, frozenset())

    similarities = []
    for other_item, users_for_other_item in usersPerItem.items():
        if other_item == i:
            continue
        similarity = Jaccard(users_for_item_i, users_for_other_item)
        similarities.append((similarity, other_item))

    # Sort on similarity only, so ties are not broken by comparing item ids.
    similarities.sort(reverse=True, key=lambda x: x[0])
    return similarities[:N]


In [211]:
similar_items = mostSimilar('2767052', 10)
for sim, item in similar_items:
    print(f"Item ID: {item}, Similarity: {sim}")


Item ID: 6148028, Similarity: 0.4125
Item ID: 7260188, Similarity: 0.3411764705882353
Item ID: 256683, Similarity: 0.1590909090909091
Item ID: 1162543, Similarity: 0.1375
Item ID: 11735983, Similarity: 0.11494252873563218
Item ID: 13335037, Similarity: 0.10989010989010989
Item ID: 28187, Similarity: 0.10810810810810811
Item ID: 428263, Similarity: 0.10666666666666667
Item ID: 49041, Similarity: 0.09876543209876543
Item ID: 41865, Similarity: 0.09782608695652174


In [212]:
similar_items

[(0.4125, '6148028'),
 (0.3411764705882353, '7260188'),
 (0.1590909090909091, '256683'),
 (0.1375, '1162543'),
 (0.11494252873563218, '11735983'),
 (0.10989010989010989, '13335037'),
 (0.10810810810810811, '28187'),
 (0.10666666666666667, '428263'),
 (0.09876543209876543, '49041'),
 (0.09782608695652174, '41865')]

In [213]:
answers['Q6'] = mostSimilar('2767052', 10)

In [214]:
assert len(answers['Q6']) == 10
assertFloatList([x[0] for x in answers['Q6']], 10)

In [215]:
### Question 7

In [220]:
# Mean training rating for every item that has an entry in usersPerItem.
avgRatingPerItem = {}

for item in usersPerItem:
    # Ratings of this item by each user recorded as having rated it.
    ratings = [ratingDict[(user, item)] for user in usersPerItem[item]]
    # An item whose user set is empty gets 0 rather than triggering a division by zero.
    avgRatingPerItem[item] = sum(ratings) / len(ratings) if ratings else 0

# Add up the per-item averages one at a time.
res = 0
for v in avgRatingPerItem.values():
    res += v

# NOTE(review): this is the mean of the per-item averages, not the mean over all
# individual ratings — confirm that is the intended fallback value.
avgGlobal = res / len(avgRatingPerItem)

In [221]:
def predictRating(user, item):
    """Item-based collaborative-filtering estimate of `user`'s rating of `item`.

    The estimate is a baseline — the item's entry in `avgRatingPerItem`, or
    `avgGlobal` when the item has none — plus a Jaccard-weighted average of
    how far the user's ratings of other items deviate from those items' own
    average ratings.

    Lookups use `.get()` so that querying a user or item missing from the
    lookup tables never inserts an empty entry into the `usersPerItem` /
    `itemsPerUser` defaultdicts.

    Returns
    -------
    The baseline alone when no other item rated by the user has positive
    similarity to `item`; otherwise baseline + weighted deviation.
    """
    no_entries = frozenset()
    avg_item_rating = avgRatingPerItem.get(item, avgGlobal)
    # Loop-invariant: the users who rated the target item.
    users_for_item = usersPerItem.get(item, no_entries)

    numerator = 0
    denominator = 0
    for other_item in itemsPerUser.get(user, no_entries):
        if other_item == item:
            continue

        # Similarity between the target item and another item this user rated.
        similarity = Jaccard(users_for_item, usersPerItem.get(other_item, no_entries))

        if similarity > 0:
            other_item_rating = ratingDict[(user, other_item)]
            avg_other_item_rating = avgRatingPerItem[other_item]

            numerator += (other_item_rating - avg_other_item_rating) * similarity
            denominator += similarity

    if denominator == 0:
        return avg_item_rating  # No similar items: fall back to the baseline

    return avg_item_rating + (numerator / denominator)


In [222]:
from sklearn.metrics import mean_squared_error

def computeMSE():
    """Mean squared error of predictRating() over every record in `dataTest`.

    Each record's 'user_id' and 'book_id' are passed to predictRating() and
    the result is compared with the record's 'rating'.
    """
    predicted_ratings = [
        predictRating(record['user_id'], record['book_id']) for record in dataTest
    ]
    true_ratings = [record['rating'] for record in dataTest]
    return mean_squared_error(true_ratings, predicted_ratings)


In [223]:
mse7 = float(computeMSE())
mse7

1.4368076357019526

In [224]:
answers['Q7'] = mse7

In [225]:
assertFloat(answers['Q7'])

In [226]:
answers['Q7']

1.4368076357019526

In [227]:
### Question 8

In [230]:
def predictRatingUserBased(user, item):
    """User-based collaborative-filtering estimate of `user`'s rating of `item`.

    The estimate is a baseline — the item's entry in `avgRatingPerItem`, or
    `avgGlobal` when the item has none — plus a Jaccard-weighted average of
    how far other users' ratings of `item` deviate from that baseline, where
    the weight is the similarity between `user`'s rated items and theirs.

    Lookups use `.get()` so that querying a user or item missing from the
    lookup tables never inserts an empty entry into the `usersPerItem` /
    `itemsPerUser` defaultdicts.

    NOTE(review): deviations are measured from the item's average rather than
    from each neighbouring user's own average rating — confirm this matches
    the intended formula.

    Returns
    -------
    The baseline alone when no other rater of `item` has positive similarity
    to `user`; otherwise baseline + weighted deviation.
    """
    no_entries = frozenset()
    avg_item_rating = avgRatingPerItem.get(item, avgGlobal)
    # Loop-invariant: the items the target user has rated.
    items_for_user = itemsPerUser.get(user, no_entries)

    numerator = 0
    denominator = 0
    for other_user in usersPerItem.get(item, no_entries):
        if other_user == user:
            continue

        # Similarity between the target user and another user who rated the item.
        similarity = Jaccard(items_for_user, itemsPerUser.get(other_user, no_entries))

        if similarity > 0:
            other_user_rating = ratingDict[(other_user, item)]

            numerator += (other_user_rating - avg_item_rating) * similarity
            denominator += similarity

    if denominator == 0:
        return avg_item_rating  # No similar users: fall back to the baseline

    return avg_item_rating + (numerator / denominator)


In [231]:
def computeMSEUserBased():
    """Mean squared error of predictRatingUserBased() over every record in `dataTest`.

    Each record's 'user_id' and 'book_id' are passed to
    predictRatingUserBased() and the result is compared with the record's
    'rating'.
    """
    predicted_ratings = [
        predictRatingUserBased(record['user_id'], record['book_id'])
        for record in dataTest
    ]
    true_ratings = [record['rating'] for record in dataTest]
    return mean_squared_error(true_ratings, predicted_ratings)


In [232]:
mse8 = float(computeMSEUserBased())

In [233]:
mse8

1.430054287648074

In [234]:
answers['Q8'] = mse8

In [235]:
assertFloat(answers['Q8'])

In [236]:
# Write the string form of the answers dict followed by a newline; the `with`
# block closes the file even if the write fails.
with open("answers_hw2.txt", 'w') as f:
    f.write(str(answers) + '\n')