In [7]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, '../utils')
import utils

In [8]:
# Function to convert a JSON Lines file to a DataFrame
def json_to_dataframe(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per JSON object, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank / whitespace-only lines (e.g. a trailing newline at the end of
        # the file) are skipped instead of crashing json.loads.
        records = [json.loads(line) for line in file if line.strip()]

    return pd.DataFrame(records)

In [9]:

def _height_to_inches(height):
    """Convert a feet'inches" height string (e.g. 5' 8") to total inches.

    Returns None for missing values and for anything that cannot be parsed.
    """
    if pd.isna(height):
        return None
    try:
        feet, inches = height.split("'")
        return int(feet) * 12 + int(inches.replace('"', ''))
    except (AttributeError, ValueError):
        # AttributeError: value is not a string (no .split / .replace).
        # ValueError: wrong number of parts, or non-integer feet/inches.
        return None


def _split_bust_size(bust_size):
    """Split a bust size such as '34d' into (band_size, cup_size).

    The band size is the integer formed by all digits in the value; the cup
    size is all alphabetic characters, upper-cased. Returns (None, None) for
    missing values and for values without a parseable band size.
    """
    if pd.isna(bust_size):
        return None, None
    try:
        band_size = int(''.join(filter(str.isdigit, bust_size)))
        cup_size = ''.join(filter(str.isalpha, bust_size)).upper()
    except (TypeError, ValueError):
        # TypeError: value is not a string; ValueError: no digits to parse.
        return None, None
    return band_size, cup_size


def clean_data(df):
    """Return a cleaned copy of the raw review data.

    The input frame is not modified. Expected columns: rating, user_id,
    item_id, age, size, weight, review_date, review_text, height and
    'bust size'.

    Steps:
      * coerce rating to numeric and drop rows whose rating is missing or
        not parseable as a number;
      * cast user_id / item_id to str;
      * coerce age, size and weight (with any 'lbs' suffix removed) to numeric;
      * parse review_date to datetime (unparseable values become NaT);
      * add review_length (character count of review_text);
      * convert height to inches;
      * add band_size / cup_size derived from 'bust size'.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, keeping the original index labels of the rows
        that were retained.
    """
    # Coerce first, then drop: this also removes ratings that are present but
    # not numeric, which would otherwise survive as NaN.
    rating = pd.to_numeric(df['rating'], errors='coerce')
    has_rating = rating.notna()

    # Explicit copy so the assignments below never write into a view of the
    # caller's frame (avoids SettingWithCopyWarning).
    df = df.loc[has_rating].copy()
    df['rating'] = rating[has_rating].to_numpy()

    # Identifiers are labels, not quantities
    df['user_id'] = df['user_id'].astype(str)
    df['item_id'] = df['item_id'].astype(str)

    # Numeric columns; anything unparseable becomes NaN
    df['age'] = pd.to_numeric(df['age'], errors='coerce')
    df['size'] = pd.to_numeric(df['size'], errors='coerce')
    # astype(str) keeps this working when the column holds no strings at all;
    # regex=False makes the literal replacement independent of pandas version.
    weight_text = (
        df['weight']
        .astype(str)
        .str.replace('lbs', '', regex=False)
        .str.strip()
    )
    df['weight'] = pd.to_numeric(weight_text, errors='coerce')

    # Convert review_date to datetime
    df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce')

    # Add column for review_length
    df['review_length'] = df['review_text'].str.len()

    # Height in inches; to_numeric guarantees a numeric dtype even when every
    # value is missing.
    df['height'] = pd.to_numeric(
        df['height'].apply(_height_to_inches), errors='coerce'
    )

    # Band / cup size: iterate over the single source column rather than
    # applying a function row-wise over the whole frame.
    bust_parts = [_split_bust_size(value) for value in df['bust size']]
    df['band_size'] = pd.Series(
        [band for band, _ in bust_parts], dtype='float64'
    ).to_numpy()
    df['cup_size'] = [cup for _, cup in bust_parts]

    return df

In [10]:
# Path to the raw data file (read line by line as JSON objects)
file_path = '../data/renttherunway_final_data.json'

# Load the raw, uncleaned records
df = json_to_dataframe(file_path=file_path)

In [1]:
# Clean the raw frame. NOTE: this rebinds `df`, so the cell is not safe to
# re-run on its own -- on an already-cleaned frame the heights are numeric and
# can no longer be parsed. Re-run the load cell first.
df = clean_data(df)

# Shuffle the rows. The fixed seed keeps the row order (and therefore the
# exported CSV) identical on every Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

NameError: name 'clean_data' is not defined

In [12]:
# Sanity check: show the dtype of every column after cleaning
print(df.dtypes)

fit                       object
user_id                   object
bust size                 object
item_id                   object
weight                   float64
rating                   float64
rented for                object
review_text               object
body type                 object
review_summary            object
category                  object
height                   float64
size                       int64
age                      float64
review_date       datetime64[ns]
review_length              int64
band_size                float64
cup_size                  object
dtype: object


In [13]:
# Save the cleaned, shuffled frame as CSV, leaving the row index out of the file
cleaned_path = '../data/renttherunway_cleaned_data.csv'
df.to_csv(cleaned_path, index=False)