In [71]:
import pandas as pd
import numpy as np

# load csv file into DataFrame
df = pd.read_csv("sales_v2.csv")
df

Unnamed: 0,customer_id,customer_name,region,sales,unit_price,date
0,101,Alice,North,10.0,25.5,2025-09-01
1,102,Bob,South,5.0,15.0,2025-09-02
2,103,Charlie,East,,20.0,2025-09-03
3,104,David,West,12.0,,2025-09-04
4,105,Eva,North,7.0,18.0,2025-09-05
5,106,Frank,South,8.0,22.0,2025-09-06
6,107,Grace,East,6.0,19.5,2025-09-07
7,108,Helen,West,,,2025-09-08
8,109,Ian,North,15.0,30.0,2025-09-09
9,110,Jack,South,9.0,17.0,2025-09-10


In [72]:
# Quick look at data
print(df.head())
print(df.info())
print(df.describe())

   customer_id customer_name  region sales unit_price         date
0          101        Alice    North    10       25.5   2025-09-01
1          102           Bob   South     5       15.0   2025-09-02
2          103       Charlie    East             20.0   2025-09-03
3          104         David    West    12        NaN   2025-09-04
4          105           Eva   North     7       18.0   2025-09-05
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   customer_id    16 non-null     int64 
 1   customer_name  16 non-null     object
 2   region         16 non-null     object
 3   sales          16 non-null     object
 4   unit_price     14 non-null     object
 5   date           16 non-null     object
dtypes: int64(1), object(5)
memory usage: 896.0+ bytes
None
       customer_id
count    16.000000
mean    108.500000
std       4.760952
min     101.0000

In [73]:
# Clean the data
# Standardize column names
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
print(df.columns.tolist())

['customer_id', 'customer_name', 'region', 'sales', 'unit_price', 'date']


In [74]:
# check numric and categorical columns
numric_cols = df.select_dtypes(include=["number"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns
print("Numeric columns:", numric_cols.tolist())
print("Categorical columns:", cat_cols.tolist())

Numeric columns: ['customer_id']
Categorical columns: ['customer_name', 'region', 'sales', 'unit_price', 'date']


In [75]:
# Step 4: Convert date column with multiple formats
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d", errors="coerce") # this line tries first format
mask = df["date"].isna() # this line creates a mask for rows where conversion failed
df.loc[mask, "date"] = pd.to_datetime(df.loc[mask, "date"], format="%m/%d/%Y", errors="coerce")  # this line tries second format for those rows
df

Unnamed: 0,customer_id,customer_name,region,sales,unit_price,date
0,101,Alice,North,10.0,25.5,NaT
1,102,Bob,South,5.0,15.0,2025-09-02
2,103,Charlie,East,,20.0,2025-09-03
3,104,David,West,12.0,,2025-09-04
4,105,Eva,North,7.0,18.0,2025-09-05
5,106,Frank,South,8.0,22.0,2025-09-06
6,107,Grace,East,6.0,19.5,2025-09-07
7,108,Helen,West,,,2025-09-08
8,109,Ian,North,15.0,30.0,2025-09-09
9,110,Jack,South,9.0,17.0,2025-09-10


In [76]:
# Handling missing values
# - Drop rows where critical info is missing (region or sales)
df.dropna(subset=["region", "sales"],inplace=True)
df

Unnamed: 0,customer_id,customer_name,region,sales,unit_price,date
0,101,Alice,North,10.0,25.5,NaT
1,102,Bob,South,5.0,15.0,2025-09-02
2,103,Charlie,East,,20.0,2025-09-03
3,104,David,West,12.0,,2025-09-04
4,105,Eva,North,7.0,18.0,2025-09-05
5,106,Frank,South,8.0,22.0,2025-09-06
6,107,Grace,East,6.0,19.5,2025-09-07
7,108,Helen,West,,,2025-09-08
8,109,Ian,North,15.0,30.0,2025-09-09
9,110,Jack,South,9.0,17.0,2025-09-10


In [77]:
# Clean text columns
df["customer_name"] = df["customer_name"].str.strip().str.title() 
df["region"] = df["region"].str.strip().str.capitalize()
df

Unnamed: 0,customer_id,customer_name,region,sales,unit_price,date
0,101,Alice,North,10.0,25.5,NaT
1,102,Bob,South,5.0,15.0,2025-09-02
2,103,Charlie,East,,20.0,2025-09-03
3,104,David,West,12.0,,2025-09-04
4,105,Eva,North,7.0,18.0,2025-09-05
5,106,Frank,South,8.0,22.0,2025-09-06
6,107,Grace,East,6.0,19.5,2025-09-07
7,108,Helen,West,,,2025-09-08
8,109,Ian,North,15.0,30.0,2025-09-09
9,110,Jack,South,9.0,17.0,2025-09-10


In [79]:
# Strip whitespace and convert to numeric
df["sales"] = pd.to_numeric(df["sales"].astype(str).str.strip(), errors="coerce")
df["unit_price"] = pd.to_numeric(df["unit_price"].astype(str).str.strip(), errors="coerce")

df



Unnamed: 0,customer_id,customer_name,region,sales,unit_price,date
0,101,Alice,North,10.0,25.5,NaT
1,102,Bob,South,5.0,15.0,2025-09-02
2,103,Charlie,East,,20.0,2025-09-03
3,104,David,West,12.0,,2025-09-04
4,105,Eva,North,7.0,18.0,2025-09-05
5,106,Frank,South,8.0,22.0,2025-09-06
6,107,Grace,East,6.0,19.5,2025-09-07
7,108,Helen,West,,,2025-09-08
8,109,Ian,North,15.0,30.0,2025-09-09
9,110,Jack,South,9.0,17.0,2025-09-10


In [None]:
# Create revenue column
df["revenue"] = df["sales"] * df["unit_price"].fillna(1)  # use 1 if unit_price missing
df

Unnamed: 0,customer_id,customer_name,region,sales,unit_price,date,revenue
0,101,Alice,North,10.0,25.5,NaT,255.0
1,102,Bob,South,5.0,15.0,2025-09-02,75.0
2,103,Charlie,East,,20.0,2025-09-03,
3,104,David,West,12.0,,2025-09-04,12.0
4,105,Eva,North,7.0,18.0,2025-09-05,126.0
5,106,Frank,South,8.0,22.0,2025-09-06,176.0
6,107,Grace,East,6.0,19.5,2025-09-07,117.0
7,108,Helen,West,,,2025-09-08,
8,109,Ian,North,15.0,30.0,2025-09-09,450.0
9,110,Jack,South,9.0,17.0,2025-09-10,153.0


In [83]:
# Drop rows where sales is still NaN
df = df.dropna(subset=["sales"])
df

Unnamed: 0,customer_id,customer_name,region,sales,unit_price,date,revenue
0,101,Alice,North,10.0,25.5,NaT,255.0
1,102,Bob,South,5.0,15.0,2025-09-02,75.0
3,104,David,West,12.0,,2025-09-04,12.0
4,105,Eva,North,7.0,18.0,2025-09-05,126.0
5,106,Frank,South,8.0,22.0,2025-09-06,176.0
6,107,Grace,East,6.0,19.5,2025-09-07,117.0
8,109,Ian,North,15.0,30.0,2025-09-09,450.0
9,110,Jack,South,9.0,17.0,2025-09-10,153.0
10,111,Kate,East,4.0,12.5,2025-09-11,50.0
11,112,Liam,West,11.0,24.0,2025-09-12,264.0


In [84]:
#  Aggregation
# Total revenue by region
revenue_by_region = df.groupby("region")["revenue"].sum().reset_index()

# Sort by highest revenue
revenue_by_region = revenue_by_region.sort_values(by="revenue", ascending=False)

print(revenue_by_region)

  region  revenue
1  North    831.0
2  South    677.0
3   West    461.0
0   East    174.0


In [85]:
# Save to CSV
revenue_by_region.to_csv("revenue_by_region.csv", index=False)