### Imports

In [1]:
import ast
import csv
import gzip
import os

import pandas as pd

## 1. Describe the Dataset

Dataset download link: https://cseweb.ucsd.edu/~jmcauley/datasets.html#multi_aspect

In [2]:
def json_to_csv(input_file, output_file, line_start, line_end):
    """
    Convert a slice of a file holding one dictionary literal per line into a CSV file.

    Lines whose zero-based index lies in ``[line_start, line_end)`` are parsed.
    A line that cannot be parsed as a dictionary is reported and skipped.
    If ``output_file`` already exists, the function returns without doing any work.

    Args:
        input_file (str): Path to the input file. A name ending in ``.gz`` is
            read through gzip; anything else is read as plain text.
        output_file (str): Path to the output CSV file.
        line_start (int): Zero-based index of the first line to convert.
        line_end (int): Zero-based index one past the last line to convert.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    # Check if the output CSV file already exists
    if os.path.exists(output_file):
        print(f"CSV file '{output_file}' already exists. Skipping conversion.")
        return

    data = []
    try:
        # gzip.open and open share the same text-mode signature
        open_file = gzip.open if input_file.endswith('.gz') else open

        with open_file(input_file, 'rt', encoding='utf-8') as file:
            for i, line in enumerate(file):
                if i < line_start:
                    continue
                if i >= line_end:  # Stop once the requested slice is exhausted
                    break
                line = line.strip().rstrip(',')  # Remove whitespace and trailing commas
                try:
                    # literal_eval accepts only Python literals, so a line of
                    # downloaded data can never execute code (eval would).
                    entry = ast.literal_eval(line)
                    if not isinstance(entry, dict):
                        raise ValueError(
                            f"expected a dictionary, got {type(entry).__name__}"
                        )
                    data.append(entry)
                except Exception as e:
                    print(f"Error parsing line {i+1}: {line} -> {e}")

        # Write to CSV if data is successfully parsed
        if data:
            # Union of all keys in first-seen order, so a row whose keys are
            # ordered differently (or partly missing) still lands in the right columns.
            header = list(dict.fromkeys(key for row in data for key in row))

            with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=header, restval='')
                writer.writeheader()
                writer.writerows(data)

            print(f"Data successfully written to {output_file}.")
        else:
            print("No valid data found to write to CSV.")
    except Exception as e:
        # Best-effort conversion: report the failure instead of stopping the notebook
        print(f"Error processing file: {e}")

In [3]:
input_file = 'ratebeer.json'
output_file_train = 'ratebeer_train.csv'
output_file_validation = 'ratebeer_validation.csv'
output_file_test = 'ratebeer_test.csv'

# (output path, first line, one-past-last line) for each consecutive slice of the raw file
splits = [
    (output_file_train, 0, 180000),
    (output_file_validation, 180000, 200000),
    (output_file_test, 200000, 220000),
]

for split_path, first_line, stop_line in splits:
    json_to_csv(input_file, split_path, first_line, stop_line)

CSV file 'ratebeer_train.csv' already exists. Skipping conversion.
CSV file 'ratebeer_validation.csv' already exists. Skipping conversion.
CSV file 'ratebeer_test.csv' already exists. Skipping conversion.


If we convert the entire dataset into a CSV file, the resulting file will be too large, so we have decided to use only the first 220,000 samples for our project (180,000 for training, 20,000 for validation, and 20,000 for testing).

### Loading the dataset

In [4]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference can leave an object column
# holding a mix of ints and strings and emits a DtypeWarning when it does.
df = pd.read_csv('ratebeer_train.csv', low_memory=False)
df.head()

  df = pd.read_csv('ratebeer_train.csv')


Unnamed: 0,beer/name,beer/beerId,beer/brewerId,beer/ABV,beer/style,review/appearance,review/aroma,review/palate,review/taste,review/overall,review/time,review/profileName,review/text
0,John Harvards Simcoe IPA,63836,8481,5.4,India Pale Ale &#40;IPA&#41;,4/5,6/10,3/5,6/10,13/20,1157587200,hopdog,"On tap at the Springfield, PA location. Poured..."
1,John Harvards Simcoe IPA,63836,8481,5.4,India Pale Ale &#40;IPA&#41;,4/5,6/10,4/5,7/10,13/20,1157241600,TomDecapolis,On tap at the John Harvards in Springfield PA....
2,John Harvards Cristal Pilsner,71716,8481,5.0,Bohemian Pilsener,4/5,5/10,3/5,6/10,14/20,958694400,PhillyBeer2112,"UPDATED: FEB 19, 2003 Springfield, PA. I've ne..."
3,John Harvards Fancy Lawnmower Beer,64125,8481,5.4,Klsch,2/5,4/10,2/5,4/10,8/20,1157587200,TomDecapolis,On tap the Springfield PA location billed as t...
4,John Harvards Fancy Lawnmower Beer,64125,8481,5.4,Klsch,2/5,4/10,2/5,4/10,8/20,1157587200,hopdog,"On tap at the Springfield, PA location. Poured..."


In [5]:
# Column names, non-null counts and dtypes of the training frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   beer/name           180000 non-null  object
 1   beer/beerId         180000 non-null  object
 2   beer/brewerId       180000 non-null  int64 
 3   beer/ABV            180000 non-null  object
 4   beer/style          180000 non-null  object
 5   review/appearance   180000 non-null  object
 6   review/aroma        180000 non-null  object
 7   review/palate       180000 non-null  object
 8   review/taste        180000 non-null  object
 9   review/overall      180000 non-null  object
 10  review/time         180000 non-null  int64 
 11  review/profileName  180000 non-null  object
 12  review/text         179804 non-null  object
dtypes: int64(2), object(11)
memory usage: 17.9+ MB


In [6]:
# Summary statistics; with default arguments only the numeric columns are described
df.describe()

Unnamed: 0,beer/brewerId,review/time
count,180000.0,180000.0
mean,2128.656489,1209006000.0
std,2932.221422,80352520.0
min,1.0,956448000.0
25%,395.0,1155341000.0
50%,1158.0,1217808000.0
75%,2554.0,1276387000.0
max,13874.0,1326413000.0


In [7]:
# Number of missing values in each column
df.isna().sum()

beer/name               0
beer/beerId             0
beer/brewerId           0
beer/ABV                0
beer/style              0
review/appearance       0
review/aroma            0
review/palate           0
review/taste            0
review/overall          0
review/time             0
review/profileName      0
review/text           196
dtype: int64

In [8]:
def _count_and_mean(frame, column):
    """Return (number of distinct values, mean number of rows per distinct value) for `column`."""
    return frame[column].nunique(), frame.groupby(column).size().mean()


# Compute every statistic first, then report them in a fixed order
num_beers, avg_reviews_per_beer = _count_and_mean(df, 'beer/beerId')
num_users, avg_reviews_per_user = _count_and_mean(df, 'review/profileName')
num_brewers, avg_reviews_per_brewer = _count_and_mean(df, 'beer/brewerId')
shape = df.shape

report = [
    ("No. of unique beers in train data: ", num_beers),
    ("Average no. of reviews per beer: ", avg_reviews_per_beer),
    ("No. of unique users: ", num_users),
    ("Average no. of reviews per user: ", avg_reviews_per_user),
    ("No. of unique breweries: ", num_brewers),
    ("Average no. of reviews per brewer: ", avg_reviews_per_brewer),
]
for label, value in report:
    print(label + str(value))

print(f"No. of rows: {shape[0]}")
print(f"No. of cols: {shape[1]}")

No. of unique beers in train data: 7694
Average no. of reviews per beer: 23.394853132310892
No. of unique users: 9224
Average no. of reviews per user: 19.514310494362533
No. of unique breweries: 545
Average no. of reviews per brewer: 330.27522935779814
No. of rows: 180000
No. of cols: 13


## 2. Identify a Predictive Task on our Dataset

For this dataset, our group decided to build a predictor for the overall rating of a beer based on the text of the user's review.


In [None]:


from sklearn import linear_model
from textblob import TextBlob
import numpy as np
from csv import DictReader

# Load training data. The CSV is written as UTF-8 by json_to_csv, so read it
# back with the same encoding rather than the platform default; newline=''
# lets the csv module handle line breaks embedded in quoted review text.
with open('ratebeer_train.csv', 'r', encoding='utf-8', newline='') as f:
    dict_reader = DictReader(f)
    trainData = list(dict_reader)

# Use library to get sentiment
def feature(datum):
    """Return the TextBlob sentiment polarity of the 'review/text' entry of `datum`."""
    review_text = datum['review/text']
    return TextBlob(review_text).sentiment.polarity

# Convert 'X/20' format to float
def floatRating(datum):
    """Scale the 'review/overall' entry of `datum` to a float.

    Only the text before the first '/' is read; it is multiplied by 0.05,
    which maps a score out of 20 onto the range 0-1.
    """
    numerator, _, _ = datum['review/overall'].partition('/')
    return float(numerator) * .05

# Train linear model: rating ~ theta0 + theta1 * polarity
polarities = list(map(feature, trainData))
ratings = list(map(floatRating, trainData))

# The leading 1 in every row is the bias term, which is why the model's own
# intercept is switched off below.
X = np.array([[1, polarity] for polarity in polarities])
Y = np.array(ratings)

model = linear_model.LinearRegression(fit_intercept=False)
model.fit(X, Y)

# Mean squared error on the training set
y_pred = np.array(model.predict(X))
residuals = Y - y_pred
sse = sum(residual**2 for residual in residuals)
mse = sse / len(Y)

In [None]:
# Load validation set. The CSV is written as UTF-8 by json_to_csv, so read it
# back with the same encoding rather than the platform default; newline=''
# lets the csv module handle line breaks embedded in quoted review text.
with open('ratebeer_validation.csv', 'r', encoding='utf-8', newline='') as f:
    dict_reader = DictReader(f)
    validationData = list(dict_reader)

# Build the validation features and targets the same way as for training
polarities = list(map(feature, validationData))
ratings = list(map(floatRating, validationData))

X = np.array([[1, polarity] for polarity in polarities])
Y = np.array(ratings)

# Idea not currently applied: round each prediction to the nearest X/20,
# i.e. round(p * 20) / 20.

# Mean squared error of the trained model on the validation set
y_pred = np.array(model.predict(X))
residuals = Y - y_pred
sse = sum(residual**2 for residual in residuals)
mse = sse / len(Y)
print(mse)

# First few predictions next to the true ratings
print(y_pred[:5])
print(Y[:5])

0.015543805789271206
[0.64719316 0.76859969 0.6828643  0.70820972 0.68544715]
[0.65 0.75 0.55 0.65 0.85]
