In [53]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn import linear_model
import numpy
import random
import gzip
import math

In [54]:
import numpy as np

In [55]:
import warnings
# Install a blanket "ignore" filter so no warnings are displayed by later cells.
# NOTE(review): this also hides warnings raised by the libraries used below
# (e.g. convergence or deprecation warnings) — confirm that is intended.
warnings.filterwarnings("ignore")

In [56]:
def assertFloat(x):
    """Check that an answer can be converted to a float.

    The conversion itself is the real check: ``float(x)`` raises
    (``ValueError`` / ``TypeError``) when ``x`` is not float-convertible.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Check that ``items`` has exactly ``N`` entries, each convertible to float.

    Raises ``AssertionError`` on a length mismatch; a non-convertible entry
    makes ``float(...)`` raise its own error.
    """
    assert len(items) == N
    assert all(type(float(item)) is float for item in items)

In [57]:
# Read the gzip file line by line; each line is parsed as one JSON record.
# The context manager closes the handle as soon as reading finishes (or fails),
# instead of leaving it open for the lifetime of the kernel.
with gzip.open("young_adult_10000.json.gz") as f:
    dataset = [json.loads(line) for line in f]

In [58]:
# Sanity check: number of records parsed from the file.
len(dataset)

10000

In [59]:
# Put your answers to each question in this dictionary.
# Keys are question labels such as 'Q1'; the dict is written to answers_hw1.txt at the end.
answers = {}

In [60]:
# Show the first record to see which fields each review carries.
dataset[0]

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '2767052',
 'review_id': '248c011811e945eca861b5c31a549291',
 'rating': 5,
 'review_text': "I cracked and finally picked this up. Very enjoyable quick read - couldn't put it down - it was like crack. \n I'm a bit bothered by the lack of backstory of how Panem and the Hunger Games come about. It is just kind of explained away in a few paragraphs and we are left to accept this very strange world where teenagers are pitted into an arena each year to kill each other? I was expecting it because I've seen Battle Royale, but I would have appreciated knowing more of the backstory of how the world could have come into such a odd state. \n I suppose what makes a book like this interesting is thinking about the strategy of it all. The players are going to be statistically encouraged to band together because they will last longer that way, but by definition of course any partnership will be broken, and the drama of how that unfolds is alw

In [61]:
import numpy as np
from sklearn.metrics import mean_squared_error

In [62]:
### Question 1
# Target: the rating attached to each review.
y = np.array([review['rating'] for review in dataset])
# Single feature column: how many '!' characters the review text contains.
exclamation_counts = [review['review_text'].count('!') for review in dataset]
X = np.array(exclamation_counts).reshape(-1, 1)

# Least-squares fit of rating ~ theta0 + theta1 * (number of '!').
model = linear_model.LinearRegression().fit(X, y)
theta0, theta1 = model.intercept_, model.coef_[0]

# Error is measured on the same data the model was fitted on.
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)


In [63]:
# Q1 answer: intercept, slope and MSE, stored as plain Python floats.
answers['Q1'] = [float(value) for value in (theta0, theta1, mse)]

In [67]:
# Format check: Q1 must be a list of exactly three float-convertible values.
assertFloatList(answers['Q1'], 3)

In [68]:
### Question 2

In [69]:
# Two per-review features, each shaped as an (n, 1) column.
review_length = np.array([len(review['review_text']) for review in dataset]).reshape(-1, 1)
num_exclamations = np.array([review['review_text'].count('!') for review in dataset]).reshape(-1, 1)

# Column order matters: length first, '!' count second, so coef_[0] pairs
# with length and coef_[1] with the exclamation count.
X = np.concatenate([review_length, num_exclamations], axis=1)
y = np.array([review['rating'] for review in dataset])

model = linear_model.LinearRegression().fit(X, y)

# Error is measured on the same data the model was fitted on.
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)

theta0 = model.intercept_
theta1, theta2 = model.coef_[0], model.coef_[1]

In [70]:
# Q2 answer: intercept, length coefficient, '!' coefficient and MSE as plain floats.
answers['Q2'] = [float(value) for value in (theta0, theta1, theta2, mse)]

In [71]:
# Format check: Q2 must be a list of exactly four float-convertible values.
assertFloatList(answers['Q2'], 4)

In [72]:
from sklearn.preprocessing import PolynomialFeatures

In [76]:
### Question 3
# Same single feature ('!' count) and target (rating) as the linear fit.
y = np.array([review['rating'] for review in dataset])
X = np.array([review['review_text'].count('!') for review in dataset]).reshape(-1, 1)

# One in-sample MSE per polynomial degree 1..5, in that order.
mses = []
for degree in (1, 2, 3, 4, 5):
    # Expand the '!' count into its polynomial terms up to `degree`.
    poly = PolynomialFeatures(degree)
    X_poly = poly.fit_transform(X)

    model = linear_model.LinearRegression().fit(X_poly, y)

    # Error is measured on the same data the model was fitted on.
    y_pred = model.predict(X_poly)
    mse = mean_squared_error(y, y_pred)
    mses.append(float(mse))


In [77]:
# Q3 answer: the five MSE values, one per polynomial degree 1..5.
answers['Q3'] = mses

In [78]:
# Format check: Q3 must be a list of exactly five float-convertible values.
assertFloatList(answers['Q3'], 5)

In [80]:
### Question 4
y = np.array([review['rating'] for review in dataset])
X = np.array([review['review_text'].count('!') for review in dataset]).reshape(-1, 1)

# First half of the records is the training set, the remainder is the test set.
split_index = len(X) // 2
X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

# One held-out MSE per polynomial degree 1..5, in that order.
mses = []
for degree in (1, 2, 3, 4, 5):
    poly = PolynomialFeatures(degree)

    # Fit the feature expansion on the training half only, then reuse it for the test half.
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)

    model = linear_model.LinearRegression().fit(X_train_poly, y_train)

    y_pred_test = model.predict(X_test_poly)
    mse = mean_squared_error(y_test, y_pred_test)
    mses.append(float(mse))

In [81]:
# Q4 answer: the five test-set MSE values, one per polynomial degree 1..5.
answers['Q4'] = mses

In [82]:
# Format check: Q4 must be a list of exactly five float-convertible values.
assertFloatList(answers['Q4'], 5)

In [83]:
from sklearn.metrics import mean_absolute_error

In [84]:
### Question 5
# Constant predictor: the median of the training ratings (the constant that
# minimises absolute error on the training half). Relies on y_train / y_test
# from the Question 4 cell.
theta_0 = np.median(y_train)

# Build the prediction vector explicitly as floats. np.full_like(y_test, theta_0)
# would inherit y_test's dtype; the ratings are stored as integers, so a
# fractional median (e.g. 4.5) would be silently truncated (to 4) before scoring.
y_pred_test = np.full(y_test.shape, theta_0, dtype=float)

mae = mean_absolute_error(y_test, y_pred_test)

In [85]:
# Q5 answer: test-set MAE of the constant predictor, as a plain float.
answers['Q5'] = float(mae)

In [86]:
# Format check: Q5 must be a single float-convertible value.
assertFloat(answers['Q5'])

In [87]:
### Question 6

In [88]:
# Rebind `dataset` to the beer reviews, keeping only lines that mention 'user/gender'.
# The context manager closes the file once reading finishes (or fails).
# NOTE(review): eval() executes whatever Python expression each line contains, so
# this is only safe for a trusted file. If every record is a plain literal,
# ast.literal_eval would be a safer drop-in — confirm against the data before switching.
dataset = []
with open("beer_50000.json") as f:
    for l in f:
        if 'user/gender' in l:
            dataset.append(eval(l))

In [89]:
# Sanity check: number of records kept after the 'user/gender' filter.
len(dataset)

20403

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Single feature column: number of '!' characters in each review text.
X = np.array([review['review/text'].count('!') for review in dataset]).reshape(-1, 1)
# Label is 1 when the reviewer's gender is 'Female', otherwise 0.
y = np.array([1 if review['user/gender'] == 'Female' else 0 for review in dataset])
model = LogisticRegression()
model.fit(X, y)

# Predictions are made on the same data the model was fitted on.
y_pred = model.predict(X)

# Pin labels=[0, 1] so the matrix is always 2x2 (rows = true class, columns =
# predicted class). Without it, data in which only one class occurs in both
# y and y_pred yields a 1x1 matrix and the four-way unpacking fails.
TN, FP, FN, TP = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()
sens = TP / (TP + FN)  # true positive rate
spec = TN / (TN + FP)  # true negative rate
# Balanced error rate: one minus the mean of the two per-class rates.
BER = 1 - 0.5 * (sens + spec)

print(f"True Positives: {TP}")
print(f"True Negatives: {TN}")
print(f"False Positives: {FP}")
print(f"False Negatives: {FN}")
print(f"Balanced Error Rate (BER): {BER}")

True Positives: 0
True Negatives: 20095
False Positives: 0
False Negatives: 308
Balanced Error Rate (BER): 0.5


In [91]:
# Q6 answer: the four confusion-matrix counts (TP, TN, FP, FN) as ints, then the BER as a float.
answers['Q6'] = [int(count) for count in (TP, TN, FP, FN)] + [float(BER)]

In [92]:
# Format check: Q6 must be a list of exactly five float-convertible values.
assertFloatList(answers['Q6'], 5)

In [93]:
### Question 7
# Single feature column: number of '!' characters in each review text.
X = np.array([review['review/text'].count('!') for review in dataset]).reshape(-1, 1)
# Label is 1 when the reviewer's gender is 'Female', otherwise 0.
y = np.array([1 if review['user/gender'] == 'Female' else 0 for review in dataset])
# Same classifier as before, but with class_weight='balanced' to reweight the two classes.
model = LogisticRegression(class_weight='balanced')
model.fit(X, y)

# Predictions are made on the same data the model was fitted on.
y_pred = model.predict(X)

# Pin labels=[0, 1] so the matrix is always 2x2 (rows = true class, columns =
# predicted class). Without it, data in which only one class occurs in both
# y and y_pred yields a 1x1 matrix and the four-way unpacking fails.
TN, FP, FN, TP = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()
sens = TP / (TP + FN)  # true positive rate
spec = TN / (TN + FP)  # true negative rate
# Balanced error rate: one minus the mean of the two per-class rates.
BER = 1 - 0.5 * (sens + spec)

print(f"True Positives: {TP}")
print(f"True Negatives: {TN}")
print(f"False Positives: {FP}")
print(f"False Negatives: {FN}")
print(f"Balanced Error Rate (BER): {BER}")

True Positives: 88
True Negatives: 16332
False Positives: 3763
False Negatives: 220
Balanced Error Rate (BER): 0.4507731134255145


In [94]:
# Q7 answer: the four confusion-matrix counts (TP, TN, FP, FN) as ints, then the BER as a float.
answers["Q7"] = [int(count) for count in (TP, TN, FP, FN)] + [float(BER)]

In [95]:
# Format check: Q7 must be a list of exactly five float-convertible values.
assertFloatList(answers['Q7'], 5)

In [96]:
### Question 8

In [97]:
def count_exclamations(text):
    """Return how many '!' characters occur in ``text``."""
    exclamation_mark = '!'
    return text.count(exclamation_mark)

def precision_at_k(y_true, y_scores, k):
    """Fraction of the ``k`` highest-scored items whose true label is 1.

    Parameters:
        y_true: sequence of labels (1 marks a positive); list or ndarray.
        y_scores: sequence of scores aligned with ``y_true``; higher ranks first.
        k: number of top-ranked items to inspect.

    Returns:
        Number of positives among the top ``k`` items divided by ``k``.

    Note: items with equal scores are ranked in whatever order ``np.argsort``
    leaves them, so the result at a tie boundary depends on that ordering.
    """
    # Coerce to arrays so plain Python lists work too; the fancy indexing
    # below is only supported on ndarrays.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    # argsort is ascending; reverse it to rank from highest score to lowest.
    sorted_indices = np.argsort(y_scores)[::-1]
    top_k_indices = sorted_indices[:k]
    true_positives = np.sum(y_true[top_k_indices] == 1)
    return true_positives / k

def prob_8(data):
    """Fit a balanced logistic regression on '!' counts and return precision@K.

    Each record in ``data`` must provide 'review/text' and 'user/gender'.
    Returns the precision values for K = 1, 10, 100, 1000, 10000, in that order.
    """
    exclamation_counts = [count_exclamations(review['review/text']) for review in data]
    features = np.array(exclamation_counts).reshape(-1, 1)
    labels = np.array([1 if review['user/gender'] == 'Female' else 0 for review in data])

    classifier = LogisticRegression(class_weight='balanced')
    classifier.fit(features, labels)

    # Second probability column: the score for label 1 ('Female').
    female_scores = classifier.predict_proba(features)[:, 1]

    precision_values = []
    for k in (1, 10, 100, 1000, 10000):
        precision_values.append(precision_at_k(labels, female_scores, k))
    return precision_values

# Run the Question 8 helper on the currently loaded dataset and show the five precision values.
print(prob_8(dataset))

[np.float64(0.0), np.float64(0.0), np.float64(0.02), np.float64(0.025), np.float64(0.017)]


In [99]:
# Feature: '!' count per review; label: 1 for 'Female', 0 otherwise.
exclamation_counts = [review['review/text'].count('!') for review in dataset]
X = np.array(exclamation_counts).reshape(-1, 1)
y = np.array([1 if review['user/gender'] == 'Female' else 0 for review in dataset])

model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X, y)

# Rank every review from highest to lowest predicted probability of label 1.
y_prob = model.predict_proba(X)[:, 1]
sorted_indices = np.argsort(y_prob)[::-1]

K_values = [1, 10, 100, 1000, 10000]

# precision@K = share of the K top-ranked reviews whose true label is 1.
precisionList = []
for K in K_values:
    hits_in_top_K = np.sum(y[sorted_indices[:K]] == 1)
    precisionList.append(float(hits_in_top_K / K))

print(f"Precision@K for K={K_values}: {precisionList}")

Precision@K for K=[1, 10, 100, 1000, 10000]: [0.0, 0.0, 0.02, 0.025, 0.017]


In [100]:
# Q8 answer: precision@K for K = 1, 10, 100, 1000, 10000.
answers['Q8'] = precisionList

In [101]:
# Format check: Q8 must be a list of exactly five float-convertible values.
assertFloatList(answers['Q8'], 5)

In [102]:
# Write your answers to a file: the string form of the answers dict plus a newline.
# The context manager flushes and closes the file even if the write raises.
with open("answers_hw1.txt", 'w') as f:
    f.write(str(answers) + '\n')