In [10]:
import pandas as pd

# Load item datasets
# Each entry is (category label, CSV path, extra read_csv options). The label
# is written into the frame's 'main_category' column after loading. Only the
# Sports and Outdoors file is read with low_memory=False.
ITEM_SOURCES = [
    ('CDs and Vinyls', 'Items/Items/CDs_and_Vinyls_items_processed.csv', {}),
    ('Digital Music', 'Items/Items/Digital_Music_items_processed.csv', {}),
    ('Magazine Subscriptions', 'Items/Items/Magazine_Subscriptions_items_processed.csv', {}),
    ('Movies and TV', 'Items/Items/Movies_and_TV_items_processed.csv', {}),
    ('Musical Instruments', 'Items/Items/Musical_Instruments_items_processed.csv', {}),
    ('Sports and Outdoors', 'Items/Items/Sports_and_Outdoors_items_processed.csv', {'low_memory': False}),
    ('Toys and Games', 'Items/Items/Toys_and_Games_items_processed.csv', {}),
    ('Video Games', 'Items/Items/Video_Games_items_processed.csv', {}),
]

# Columns that are not carried into the cleaned item table.
UNUSED_ITEM_COLUMNS = [
    'description', 'store', 'details', 'features',
    'item_average_rating', 'item_rating_number',
]

item_frames = []
for category, csv_path, read_options in ITEM_SOURCES:
    frame = pd.read_csv(csv_path, **read_options)
    # Update 'main_category' column so every row carries its source category
    frame['main_category'] = category
    item_frames.append(frame)

# Keep the per-category names available for any cell that still refers to them.
df1, df2, df3, df4, df5, df6, df7, df8 = item_frames

# Combine item datasets and discard the unused columns *before* cleaning, so
# that missing-value and duplicate checks only look at columns that are kept.
# (Dropping NaNs first would throw away items merely because an unused field
# such as 'description' is empty.)
items_df = pd.concat(item_frames, ignore_index=True).drop(columns=UNUSED_ITEM_COLUMNS)

# Ensure correct data types: unparseable prices become NaN and are removed by
# the dropna() below.
items_df['price'] = pd.to_numeric(items_df['price'], errors='coerce').astype(float)

# Handle missing values, then drop exact duplicate rows
items_df = items_df.dropna().drop_duplicates()

# Check validity of data: only strictly positive prices are kept
items_df = items_df[items_df['price'] > 0]



In [11]:
# Load review datasets
# Each entry is (CSV path, extra read_csv options). Only the Sports and
# Outdoors file is read with low_memory=False.
REVIEW_SOURCES = [
    ('Reviews/Reviews/CDs_and_Vinyls_reviews_processed.csv', {}),
    ('Reviews/Reviews/Digital_Music_reviews_processed.csv', {}),
    ('Reviews/Reviews/Magazine_Subscriptions_reviews_processed.csv', {}),
    ('Reviews/Reviews/Movies_and_TV__reviews_processed.csv', {}),
    ('Reviews/Reviews/Musical_Instruments_reviews_processed.csv', {}),
    ('Reviews/Reviews/Sports_and_Outdoors_reviews_processed.csv', {'low_memory': False}),
    ('Reviews/Reviews/Toys_and_Games_reviews_processed.csv', {}),
    ('Reviews/Reviews/Video_Games_reviews_processed.csv', {}),
]

# Columns that are not carried into the cleaned review table.
UNUSED_REVIEW_COLUMNS = ['asin', 'review_text', 'main_category']

review_frames = [pd.read_csv(csv_path, **read_options) for csv_path, read_options in REVIEW_SOURCES]

# Keep the per-category names available for any cell that still refers to them.
(reviews_df1, reviews_df2, reviews_df3, reviews_df4,
 reviews_df5, reviews_df6, reviews_df7, reviews_df8) = review_frames

# Combine review datasets
reviews_df = pd.concat(review_frames, ignore_index=True)

# Report missing values per column on the full, unfiltered frame
missing_values = reviews_df.isnull().sum()
print("Missing values:\n", missing_values)

# Discard the unused columns *before* dropping incomplete rows, so a review is
# not thrown away just because an unused field (e.g. 'review_text') is empty.
reviews_df = reviews_df.drop(columns=UNUSED_REVIEW_COLUMNS).dropna()

# Rating must be between 0 and 5 (inclusive on both ends)
reviews_df = reviews_df[reviews_df['rating'].between(0, 5)]

# Review title must be a non-empty string. The explicit bool cast keeps the
# mask a valid boolean indexer even when no rows are left.
has_text_title = reviews_df['review_title'].map(
    lambda title: isinstance(title, str) and len(title) > 0
).astype(bool)
reviews_df = reviews_df[has_text_title]

# Remove exact duplicate rows, then ensure correct data types
reviews_df = reviews_df.drop_duplicates().astype({
    'rating': float,
    'helpful_vote': int,
    'verified_purchase': bool,
    'review_title': str,
})

# Convert date to datetime format
# NOTE(review): assumes 'date' holds parseable date strings; if it holds epoch
# integers a `unit=` argument would be needed — confirm against the CSVs.
reviews_df['date'] = pd.to_datetime(reviews_df['date'])


Missing values:
 rating                 0
review_title         285
review_text          357
asin                   0
parent_asin            0
user_id                0
date                   0
helpful_vote           0
verified_purchase      0
main_category          0
dtype: int64


In [12]:
# Persist both cleaned tables as CSV (row index omitted), items first.
for cleaned_frame, out_path in (
    (items_df, 'items_processed.csv'),
    (reviews_df, 'reviews_processed.csv'),
):
    cleaned_frame.to_csv(out_path, index=False)

In [13]:
import pandas as pd
# Merge the two datasets: attach item columns to every review that has a
# matching item, joined on 'parent_asin'.
# An inner join is used because reviews without a matching item would only be
# padded with NaN and then removed by the dropna() below; joining inner avoids
# building those rows and the dtype upcasting the NaN padding would cause.
merged_df = pd.merge(reviews_df, items_df, on='parent_asin', how='inner')

# Guard: drop any row that still has a missing value in either side's columns.
merged_df = merged_df.dropna()

merged_df.to_csv('merged_data.csv', index=False)