In [1]:
import os

# Make sure the "data" folder exists; exist_ok=True makes re-running this cell safe
os.makedirs("data", exist_ok=True)

# Sample sales dataset: one header line followed by 10 order rows
sample = """order_id,customer_id,order_date,product,quantity,unit_price
1,1001,2024-01-01,Widget A,2,9.99
2,1002,2024-01-03,Widget B,1,19.99
3,1001,2024-01-07,Widget C,5,4.50
4,1003,2024-02-10,Widget A,3,9.99
5,1004,2024-02-15,Widget B,2,19.99
6,1002,2024-03-01,Widget D,1,29.99
7,1005,2024-03-05,Widget A,10,9.99
8,1001,2024-03-20,Widget B,4,19.99
9,1006,2024-04-02,Widget C,2,4.50
10,1007,2024-04-15,Widget D,3,29.99
"""

# Write the dataset to disk, overwriting any previous copy.
# The encoding is pinned to UTF-8 because open() otherwise falls back to a
# platform-dependent default, while pandas.read_csv (used to load this file)
# decodes UTF-8 by default; pinning it keeps writer and reader in agreement
# if non-ASCII text is ever added to the sample.
with open("data/sales.csv", "w", encoding="utf-8") as f:
    f.write(sample)

print("✅ data/sales.csv created")


✅ data/sales.csv created


In [2]:
import pandas as pd

# Read the sales CSV into a DataFrame, converting order_date to datetime on load
df = pd.read_csv(
    "data/sales.csv",
    parse_dates=["order_date"],
)

# Report the table dimensions as (rows, columns)
print("Shape:", df.shape)

# Display the whole table (the sample has only 10 rows)
df


Shape: (10, 6)


Unnamed: 0,order_id,customer_id,order_date,product,quantity,unit_price
0,1,1001,2024-01-01,Widget A,2,9.99
1,2,1002,2024-01-03,Widget B,1,19.99
2,3,1001,2024-01-07,Widget C,5,4.5
3,4,1003,2024-02-10,Widget A,3,9.99
4,5,1004,2024-02-15,Widget B,2,19.99
5,6,1002,2024-03-01,Widget D,1,29.99
6,7,1005,2024-03-05,Widget A,10,9.99
7,8,1001,2024-03-20,Widget B,4,19.99
8,9,1006,2024-04-02,Widget C,2,4.5
9,10,1007,2024-04-15,Widget D,3,29.99


In [3]:
# Column dtypes and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

# Number of missing (NaN/NaT) entries in each column
print("\nMissing values per column:")
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   order_id     10 non-null     int64         
 1   customer_id  10 non-null     int64         
 2   order_date   10 non-null     datetime64[ns]
 3   product      10 non-null     object        
 4   quantity     10 non-null     int64         
 5   unit_price   10 non-null     float64       
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 612.0+ bytes
None

Missing values per column:
order_id       0
customer_id    0
order_date     0
product        0
quantity       0
unit_price     0
dtype: int64


In [5]:
# Remove exact duplicate rows
df = df.drop_duplicates()

# Parse each column; entries that cannot be parsed become NaN / NaT
parsed_quantity = pd.to_numeric(df['quantity'], errors='coerce')
parsed_unit_price = pd.to_numeric(df['unit_price'], errors='coerce')
parsed_order_date = pd.to_datetime(df['order_date'], errors='coerce')

# Report values that were present but unparseable, so the zero / NaT fallback
# applied below never silently distorts the revenue figures computed later.
for column, parsed in [
    ('quantity', parsed_quantity),
    ('unit_price', parsed_unit_price),
    ('order_date', parsed_order_date),
]:
    n_bad = int((parsed.isna() & df[column].notna()).sum())
    if n_bad:
        print(f"WARNING: {n_bad} unparseable value(s) in '{column}' were coerced")

# Build the cleaned frame in one step instead of mutating the de-duplicated
# frame column by column. Same fallbacks as before: missing or invalid
# quantity -> 0 (as int), missing or invalid unit_price -> 0.0, invalid
# order_date -> NaT.
df = df.assign(
    quantity=parsed_quantity.fillna(0).astype(int),
    unit_price=parsed_unit_price.fillna(0.0),
    order_date=parsed_order_date,
)

print("✅")

✅


In [6]:
# Line-item revenue: price per unit times units sold
df['revenue'] = df['unit_price'] * df['quantity']

# Calendar month (monthly Period) in which each order was placed
df['order_month'] = df['order_date'].dt.to_period(freq='M')

# Preview the first 10 rows, now including the two derived columns
df.head(n=10)


Unnamed: 0,order_id,customer_id,order_date,product,quantity,unit_price,revenue,order_month
0,1,1001,2024-01-01,Widget A,2,9.99,19.98,2024-01
1,2,1002,2024-01-03,Widget B,1,19.99,19.99,2024-01
2,3,1001,2024-01-07,Widget C,5,4.5,22.5,2024-01
3,4,1003,2024-02-10,Widget A,3,9.99,29.97,2024-02
4,5,1004,2024-02-15,Widget B,2,19.99,39.98,2024-02
5,6,1002,2024-03-01,Widget D,1,29.99,29.99,2024-03
6,7,1005,2024-03-05,Widget A,10,9.99,99.9,2024-03
7,8,1001,2024-03-20,Widget B,4,19.99,79.96,2024-03
8,9,1006,2024-04-02,Widget C,2,4.5,9.0,2024-04
9,10,1007,2024-04-15,Widget D,3,29.99,89.97,2024-04


In [7]:
# Revenue summed over every order
total_revenue = df['revenue'].sum()

# Revenue per customer, highest first, limited to the five largest
top_customers = (
    df.groupby('customer_id')['revenue']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)

# Revenue per calendar month
monthly_revenue = (
    df.groupby('order_month')['revenue']
    .sum()
)

# Print the three summaries, each preceded by a blank line and its heading
print("\nTotal revenue:", total_revenue)
print("\nTop customers by revenue:", top_customers, sep="\n")
print("\nMonthly revenue:", monthly_revenue, sep="\n")



Total revenue: 441.24

Top customers by revenue:
customer_id
1001    122.44
1005     99.90
1007     89.97
1002     49.98
1004     39.98
Name: revenue, dtype: float64

Monthly revenue:
order_month
2024-01     62.47
2024-02     69.95
2024-03    209.85
2024-04     98.97
Freq: M, Name: revenue, dtype: float64
