# Reduce Data


In [9]:
import pandas as pd
import matplotlib as plt

# Notebook display settings: allow up to 1000 characters per cell so long
# string values are not truncated when a frame is rendered.
pd.set_option("display.max_colwidth", 1000)

# Silence all warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

In [10]:
from google.colab import drive

# Location where Google Drive is attached inside the Colab VM.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

# Build the CSV path from the mount point so the read does not depend on the
# kernel's current working directory (the former relative path
# "drive/MyDrive/..." only resolved when the cwd was /content).
RAW_CSV_PATH = f"{DRIVE_MOUNT_POINT}/MyDrive/COGS 109 Amazon Project/Data/amazon_products_raw.csv"

# Raw product listings; every later cell in this notebook reads from `df`.
df = pd.read_csv(RAW_CSV_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:

# Dimensions of the raw dataset as (rows, columns)
df.shape

(396210, 9)

In [12]:
# Distinct category labels present in the raw data
df.loc[:, "main_category"].unique()

array(['appliances', 'car & motorbike', 'tv, audio & cameras',
       'sports & fitness', 'grocery & gourmet foods', 'home & kitchen',
       'pet supplies', 'stores', 'toys & baby products', "kids' fashion",
       'bags & luggage', 'accessories', "women's shoes",
       'beauty & health', "men's shoes", "women's clothing",
       'industrial supplies', "men's clothing", 'music',
       'home, kitchen, pets'], dtype=object)

In [13]:
# Average number of rows per main category
rows_per_category = df["main_category"].value_counts()
rows_per_category.mean()

19810.5

Given that our dataset has 300K+ rows and is computationally expensive to process with our resources, we're employing a stratified random sampling method to extract a subset of rows from each main category.

In [14]:
# Stratified sampling: split the raw data by main category and keep at most
# `sample_size` rows from each category.
grouped = df.groupby("main_category")

# Maximum number of rows kept per category
sample_size = 4_500

# Fixed seed so that re-running the notebook reproduces the same sample
# (without it, every run writes a different reduced dataset).
random_seed = 42

# Build one frame per category and concatenate once at the end; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
group_samples = [
    # Categories at or above the cap are down-sampled without replacement;
    # categories below the cap are kept whole.
    group_df.sample(n=sample_size, random_state=random_seed)
    if len(group_df) >= sample_size
    else group_df
    for _, group_df in grouped
]

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when there are no groups at all.
sampled_data = pd.concat(group_samples) if group_samples else df.iloc[:0]

In [15]:
# Preview the stratified sample
sampled_data

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price
39269,Amazon Brand - Solimo Polyester Duffel Gym Bag with Shoe Compartment,accessories,Bags & Luggage,https://m.media-amazon.com/images/W/IMAGERENDERING_521856-T2/images/I/91BdtB6z8uL._AC_UL320_.jpg,https://www.amazon.in/Amazon-Brand-Solimo-so_flingyrqba_4-Gym/dp/B084DPRRRR/ref=sr_1_7348?qid=1679144136&s=luggage&sr=1-7348,4.2,100,₹549,₹900
134976,NAMDEV GEMS 7.25 Ratti / 6.45 Carat Ceylon White Sapphire/Safed Pukhraj 5.25 Ratti Certified Natural Gemstone,accessories,Fashion & Silver Jewellery,https://m.media-amazon.com/images/I/61S2UNRgL1L._AC_UL320_.jpg,https://www.amazon.in/NAMDEV-GEMS-Sapphire-Certified-Gemstone/dp/B07QZC211J/ref=sr_1_16303?qid=1679160512&s=jewelry&sr=1-16303,3.9,3,"₹1,590","₹5,999"
128082,Mahi Valentine Gift Blue Crystals and Artificial Pearls Paisley Necklace Set for Women (NL1103769RBlu),accessories,Fashion & Silver Jewellery,https://m.media-amazon.com/images/I/61+MF9gLe7L._AC_UL320_.jpg,https://www.amazon.in/Mahi-Valentine-Crystals-Artificial-NL1103769RBlu/dp/B08TRHMR2R/ref=sr_1_8741?qid=1679160240&s=jewelry&sr=1-8741,3.6,19,₹300,"₹1,499"
381411,NEUTRON Gift Analog White and Grey Color Dial Girls Watch - G462-(13-L-10) (Pack of 2),accessories,Watches,https://m.media-amazon.com/images/I/71Z5WuoZrdL._AC_UL320_.jpg,https://www.amazon.in/NEUTRON-Analog-White-Color-Girls/dp/B0BMW3PJ8K/ref=sr_1_16147?qid=1679156199&s=watches&sr=1-16147,,,₹286,₹699
168596,"Fashion Girls Fashion Cute Stylish Leather Backpack and Sling Bag Set for Women, School & College Girls/Leather Bagpack Se...",accessories,Handbags & Clutches,https://m.media-amazon.com/images/I/414WeHN8pAL._AC_UL320_.jpg,https://www.amazon.in/Fashion-Stylish-Leather-Backpack-Bag/dp/B0BVZHHP2H/ref=sr_1_12041?qid=1679158994&s=shoes&sr=1-12041,,,,
...,...,...,...,...,...,...,...,...,...
313180,Max Womens Su20wfba2001a Sneakers,women's shoes,Shoes,https://m.media-amazon.com/images/I/41+Wq8kCE8L._AC_UL320_.jpg,https://www.amazon.in/Max-Womens-Black-Sandal-SU20WFBA2001ABLACK/dp/B09H2X7TDG/ref=sr_1_10268?qid=1679211574&s=shoes&sr=1-10268,4.0,1,₹461,₹599
49670,FASHIMO Stylish Bellies for Women's and Girls 723,women's shoes,Ballerinas,https://m.media-amazon.com/images/I/718pK43EgpL._AC_UL320_.jpg,https://www.amazon.in/FASHIMO-Stylish-Bellies-Womens-Girls/dp/B08WCDNTJR/ref=sr_1_2429?qid=1679211851&s=shoes&sr=1-2429,4.3,9,₹599,₹999
313014,Khadim's Women Black Casual Mule Sandal,women's shoes,Shoes,https://m.media-amazon.com/images/I/51h4MhuHWfL._AC_UL320_.jpg,https://www.amazon.in/Khadims-Women-Black-Casual-Sandal/dp/B0BDVBSG58/ref=sr_1_7923?qid=1679211558&s=shoes&sr=1-7923,,,₹685,₹699
312970,Brauch Women's Stylish Suede Patterned Flats,women's shoes,Shoes,https://m.media-amazon.com/images/I/71R+K5Joj5L._AC_UL320_.jpg,https://www.amazon.in/Brauch-Womens-Black-Stylish-Patterned/dp/B08MYTPWX9/ref=sr_1_7540?qid=1679211554&s=shoes&sr=1-7540,4.1,31,₹299,₹599


In [16]:
# Persist the reduced dataset in the same Drive folder as the raw file. The
# absolute path under the Drive mount point keeps the write independent of the
# current working directory; index=False leaves the DataFrame index out of the CSV.
sampled_data.to_csv(
    "/content/drive/MyDrive/COGS 109 Amazon Project/Data/amazon_products_sampled_raw.csv",
    index=False,
)