In [1]:
from pathlib import Path

# Directory the kernel is currently running in.
project = Path.cwd()
print("Notebook working directory:", project)

# Locate the raw-data folder: use the one beside the working directory
# (one level up) when it exists, otherwise the one inside it.
if (project.parent / "data_raw").exists():
    data_raw = project.parent / "data_raw"
else:
    data_raw = project / "data_raw"
print("Looking for data in:", data_raw, "\n")

# Gather every CSV directly inside data_raw, ordered by path.
files = list(data_raw.glob("*.csv"))
files.sort()

if files:
    print("CSV files found (name - size in MB):")
    for idx, csv_path in enumerate(files, start=1):
        # File size converted from bytes to megabytes, rounded to 2 decimals.
        size_mb = round(csv_path.stat().st_size / 1024**2, 2)
        print(f"{idx}. {csv_path.name} - {size_mb} MB")
else:
    print("No CSV files found in data_raw.")

# Bare final expression so the notebook front end displays the list.
files


Notebook working directory: c:\Users\DELL\Desktop\amazon_project\notebooks
Looking for data in: c:\Users\DELL\Desktop\amazon_project\data_raw 

CSV files found (name - size in MB):
1. amazon_india_2015.csv - 8.7 MB
2. amazon_india_2016.csv - 14.5 MB
3. amazon_india_2017.csv - 20.23 MB
4. amazon_india_2018.csv - 26.01 MB
5. amazon_india_2019.csv - 31.83 MB
6. amazon_india_2020.csv - 37.66 MB
7. amazon_india_2021.csv - 36.15 MB
8. amazon_india_2022.csv - 34.67 MB
9. amazon_india_2023.csv - 33.18 MB
10. amazon_india_2024.csv - 31.73 MB
11. amazon_india_2025.csv - 20.17 MB


[WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2015.csv'),
 WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2016.csv'),
 WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2017.csv'),
 WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2018.csv'),
 WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2019.csv'),
 WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2020.csv'),
 WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2021.csv'),
 WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2022.csv'),
 WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2023.csv'),
 WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2024.csv'),
 WindowsPath('c:/Users/DELL/Desktop/amazon_project/data_raw/amazon_india_2025.csv')]

In [2]:
import pandas as pd

# Path of the 2015 extract inside the raw-data folder.
file_2015 = data_raw.joinpath("amazon_india_2015.csv")

print("Loading sample rows from:", file_2015)

# Read only the first 10,000 rows so this exploratory load stays quick.
df = pd.read_csv(
    file_2015,
    nrows=10_000,
    low_memory=False,
)

# Bare final expression: displays the (rows, columns) tuple.
df.shape


Loading sample rows from: c:\Users\DELL\Desktop\amazon_project\data_raw\amazon_india_2015.csv


(10000, 34)

In [3]:
# Preview the first five rows of the loaded sample.
df.head(n=5)


Unnamed: 0,transaction_id,order_date,customer_id,product_id,product_name,category,subcategory,brand,original_price_inr,discount_percent,...,is_festival_sale,festival_name,customer_rating,return_status,order_month,order_year,order_quarter,product_weight_kg,is_prime_eligible,product_rating
0,TXN_2015_00000001,2015-01-25,CUST_2015_00003884,PROD_000021,Samsung Galaxy S6 16GB Black,Electronics,Smartphones,Samsung,123614.29,27.91,...,True,Republic Day Sale,5.0,Delivered,1,2015,1,0.19,True,4.7
1,TXN_2015_00000002,2015-01-05,CUST_2015_00011709,PROD_000055,OnePlus OnePlus 2 16GB White,Electronics,Smartphones,OnePlus,54731.86,0.0,...,False,,4.5,Delivered,1,2015,1,0.2,True,4.1
2,TXN_2015_00000003,2015-01-24,CUST_2015_00004782,PROD_000039,Samsung Galaxy Note 5 64GB Black,Electronics,Smartphones,Samsung,97644.25,46.93,...,True,Republic Day Sale,,Delivered,1,2015,1,0.17,True,3.3
3,TXN_2015_00000004,2015-01-28,CUST_2015_00008105,PROD_000085,Motorola Moto G (3rd Gen) 16GB Black,Electronics,Smartphones,Motorola,21947.26,0.0,...,False,,3.0,Delivered,1,2015,1,0.22,True,3.5
4,TXN_2015_00000005,2015-01-31,CUST_2015_00002955,PROD_000055,OnePlus OnePlus 2 16GB White,Electronics,Smartphones,OnePlus,54731.86,0.0,...,False,,4.0,Delivered,1,2015,1,0.2,True,4.1


In [4]:
# Print a structural summary of the sample: index range, and for each column
# its non-null count and dtype.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 34 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   transaction_id          10000 non-null  object 
 1   order_date              10000 non-null  object 
 2   customer_id             10000 non-null  object 
 3   product_id              10000 non-null  object 
 4   product_name            10000 non-null  object 
 5   category                10000 non-null  object 
 6   subcategory             10000 non-null  object 
 7   brand                   10000 non-null  object 
 8   original_price_inr      10000 non-null  object 
 9   discount_percent        10000 non-null  float64
 10  discounted_price_inr    10000 non-null  float64
 11  quantity                10000 non-null  int64  
 12  subtotal_inr            10000 non-null  float64
 13  delivery_charges        9171 non-null   float64
 14  final_amount_inr        10000 non-null 

In [5]:
# Count missing values per column, order the counts from highest to lowest,
# and show the 15 columns at the top of that ranking.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
    .head(15)
)


festival_name           7571
customer_rating         3018
customer_age_group      1225
delivery_charges         829
order_date                 0
transaction_id             0
subcategory                0
customer_id                0
product_name               0
product_id                 0
discount_percent           0
discounted_price_inr       0
quantity                   0
subtotal_inr               0
final_amount_inr           0
dtype: int64