### Imports

In [2]:
import ast
import csv
import gzip
import os

import pandas as pd

## 1. Describe the Dataset

Dataset download link: https://cseweb.ucsd.edu/~jmcauley/datasets.html#multi_aspect

In [3]:
def json_to_csv(input_file, output_file, line_limit=None):
    """
    Convert a file with one Python-dict literal per line into a CSV file.

    The input may be plain text or gzip-compressed (detected by a ``.gz``
    suffix). Each line is stripped of surrounding whitespace and trailing
    commas, then parsed with ``ast.literal_eval`` so that only literals are
    accepted and no code from the data file is ever executed. Lines that do
    not parse to a dict are reported and skipped; blank lines are skipped
    silently.

    The CSV header is the union of all keys seen, in order of first
    appearance. Rows are written by key, so records whose keys are ordered
    differently (or are partly missing) still land in the right columns;
    missing values are written as empty fields.

    Args:
        input_file (str): Path to the input text or .gz file.
        output_file (str): Path to the output CSV file. If it already exists
            the conversion is skipped.
        line_limit (int, optional): Maximum number of input lines to read.
            Default is None (read all lines). ``0`` reads no lines.

    Returns:
        None. Progress and errors are reported with ``print``; I/O errors are
        caught and printed rather than raised.
    """
    # Act as a cache: never overwrite an existing output file.
    if os.path.exists(output_file):
        print(f"CSV file '{output_file}' already exists. Skipping conversion.")
        return

    data = []
    fieldnames = {}  # dict used as an insertion-ordered set of column names
    try:
        open_file = gzip.open if input_file.endswith('.gz') else open

        with open_file(input_file, 'rt', encoding='utf-8') as file:
            for i, line in enumerate(file):
                # Compare against None explicitly so that line_limit=0 means
                # "read nothing" instead of "no limit".
                if line_limit is not None and i >= line_limit:
                    break

                line = line.strip().rstrip(',')  # Remove whitespace and trailing commas
                if not line:
                    continue

                try:
                    # literal_eval only accepts Python literals, unlike eval,
                    # which would execute arbitrary expressions from the file.
                    entry = ast.literal_eval(line)
                    if not isinstance(entry, dict):
                        raise ValueError(
                            f"expected a dict, got {type(entry).__name__}"
                        )
                except Exception as e:
                    print(f"Error parsing line {i+1}: {line} -> {e}")
                    continue

                data.append(entry)
                fieldnames.update(dict.fromkeys(entry))

        # Write to CSV only if at least one record was parsed
        if data:
            with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
                writer = csv.DictWriter(
                    csv_file, fieldnames=list(fieldnames), restval=''
                )
                writer.writeheader()
                writer.writerows(data)

            print(f"Data successfully written to {output_file}.")
        else:
            print("No valid data found to write to CSV.")
    except Exception as e:
        # Best-effort: report the failure instead of aborting the notebook.
        print(f"Error processing file: {e}")

In [None]:
# Convert only the first `line_limit` lines of the gzipped dump to CSV.
input_file, output_file = 'ratebeer.json.gz', 'ratebeer_20k.csv'
line_limit = 20000

json_to_csv(input_file, output_file, line_limit=line_limit)

Data successfully written to ratebeer_20k.csv.


Converting the entire dataset to a CSV file would produce a file that is too large to work with, so we have decided to use only the first 20,000 samples for our project.

### Loading the dataset

In [6]:
# Read the 20k-sample CSV into a DataFrame and preview its first rows.
df = pd.read_csv('ratebeer_20k.csv')
df.head()

Unnamed: 0,beer/name,beer/beerId,beer/brewerId,beer/ABV,beer/style,review/appearance,review/aroma,review/palate,review/taste,review/overall,review/time,review/profileName,review/text
0,John Harvards Simcoe IPA,63836,8481,5.4,India Pale Ale &#40;IPA&#41;,4/5,6/10,3/5,6/10,13/20,1157587200,hopdog,"On tap at the Springfield, PA location. Poured..."
1,John Harvards Simcoe IPA,63836,8481,5.4,India Pale Ale &#40;IPA&#41;,4/5,6/10,4/5,7/10,13/20,1157241600,TomDecapolis,On tap at the John Harvards in Springfield PA....
2,John Harvards Cristal Pilsner,71716,8481,5.0,Bohemian Pilsener,4/5,5/10,3/5,6/10,14/20,958694400,PhillyBeer2112,"UPDATED: FEB 19, 2003 Springfield, PA. I've ne..."
3,John Harvards Fancy Lawnmower Beer,64125,8481,5.4,Klsch,2/5,4/10,2/5,4/10,8/20,1157587200,TomDecapolis,On tap the Springfield PA location billed as t...
4,John Harvards Fancy Lawnmower Beer,64125,8481,5.4,Klsch,2/5,4/10,2/5,4/10,8/20,1157587200,hopdog,"On tap at the Springfield, PA location. Poured..."


In [7]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   beer/name           20000 non-null  object
 1   beer/beerId         20000 non-null  int64 
 2   beer/brewerId       20000 non-null  int64 
 3   beer/ABV            20000 non-null  object
 4   beer/style          20000 non-null  object
 5   review/appearance   20000 non-null  object
 6   review/aroma        20000 non-null  object
 7   review/palate       20000 non-null  object
 8   review/taste        20000 non-null  object
 9   review/overall      20000 non-null  object
 10  review/time         20000 non-null  int64 
 11  review/profileName  20000 non-null  object
 12  review/text         19953 non-null  object
dtypes: int64(3), object(10)
memory usage: 2.0+ MB


In [8]:
# Summary statistics; by default only the numeric columns are included.
df.describe()

Unnamed: 0,beer/beerId,beer/brewerId,review/time
count,20000.0,20000.0,20000.0
mean,34339.39055,2248.6595,1197787000.0
std,41020.66295,2667.682527,82948430.0
min,51.0,12.0,956707200.0
25%,53.0,12.0,1143590000.0
50%,14228.0,1179.0,1206230000.0
75%,63820.0,3324.0,1264378000.0
max,162988.0,13519.0,1326326000.0


In [9]:
# Count the missing values in each column.
df.isnull().sum()

beer/name              0
beer/beerId            0
beer/brewerId          0
beer/ABV               0
beer/style             0
review/appearance      0
review/aroma           0
review/palate          0
review/taste           0
review/overall         0
review/time            0
review/profileName     0
review/text           47
dtype: int64

## 2. Identify a Predictive Task on our Dataset

For this dataset, our group decided to build a predictor for the overall rating of a beer based on the text of users' reviews.
