In [11]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.insert(0, '../utils')
import utils

In [12]:
# Function to convert JSON file to DataFrame
def json_to_dataframe(file_path):
    # Initialize an empty list to store the JSON objects
    data = []

    # Open and read the file
    with open(file_path, 'r') as file:
        for line in file:
            # Parse each line (which is a valid JSON string) and append to the list
            data.append(json.loads(line))
    
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(data)

    return df

In [13]:

def clean_data(df):
    # Drop rows where rating is missing
    df = df.dropna(subset=['rating'])

    # Convert user_id and item_id to strings
    df['user_id'] = df['user_id'].astype(str)
    df['item_id'] = df['item_id'].astype(str)

    # Convert rating, age, and size to numeric (float or int)
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    df['age'] = pd.to_numeric(df['age'], errors='coerce')
    df['size'] = pd.to_numeric(df['size'], errors='coerce')
    df['weight'] = pd.to_numeric(df['weight'].str.replace('lbs', ''), errors='coerce')

    # Convert review_date to datetime
    df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce')

    # Add column for review_length
    df['review_length'] = df['review_text'].str.len()

    # Convert height to inches
    def convert_height_to_inches(height):
        if pd.isna(height):
            return None
        try:
            feet, inches = height.split("'")
            inches = inches.replace('"', '') 
            return int(feet) * 12 + int(inches)
        except:
            return None
        
    df['height'] = df['height'].apply(convert_height_to_inches)

    # Function to extract band size and cup size from bust size
    def extract_bust_components(bust_size):
        if pd.isna(bust_size):
            return None, None
        try:
            band_size = int(''.join(filter(str.isdigit, bust_size)))
            cup_size = ''.join(filter(str.isalpha, bust_size)).upper()
            return band_size, cup_size
        except:
            return None, None

    # Apply the extraction function to bust size
    df[['band_size', 'cup_size']] = df.apply(lambda row: extract_bust_components(row['bust size']), axis=1, result_type='expand')

    return df

In [14]:
file_path = '../data/renttherunway_final_data.json'

# Read in original data
df = json_to_dataframe(file_path)

In [15]:
df = clean_data(df)

# Shuffle
df = df.sample(frac=1).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_id'] = df['user_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['item_id'] = df['item_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataF

In [16]:
# Create popularity column, which is the number of times an item has been rented
df['item_rent_count'] = df.groupby('item_id')['item_id'].transform('count')

print(df.head(5))

   fit user_id bust size  item_id  weight  rating     rented for   
0  fit  875044       32c  2195317   118.0      10           work  \
1  fit   78963       34d  1266469   140.0      10  formal affair   
2  fit  460284       34b   682043   138.0       8          party   
3  fit  717584       36d  1730006   150.0       8          party   
4  fit  269726       36c  1904556   138.0      10       everyday   

                                         review_text          body type   
0  Fit well and looked good with jeans and riding...             petite  \
1  This was a beautiful dress that I rented it to...          full bust   
2  At 5"11", I was worried that this dress would ...  straight & narrow   
3  Very flattering, got tons of compliments. The ...          hourglass   
4  Perfect fit 5'7" . The dress was so pretty I r...           athletic   

                                      review_summary category  height  size   
0  A chic look that kept the Scottish chill at ba...  sweater

In [17]:
print(df.dtypes)

fit                        object
user_id                    object
bust size                  object
item_id                    object
weight                    float64
rating                      int64
rented for                 object
review_text                object
body type                  object
review_summary             object
category                   object
height                    float64
size                        int64
age                       float64
review_date        datetime64[ns]
review_length               int64
band_size                 float64
cup_size                   object
item_rent_count             int64
dtype: object


In [18]:
# Write df to csv
df.to_csv('../data/renttherunway_cleaned_data.csv', index=False)