In [142]:
# importing data from csv file
import pandas as pd
import numpy as np

df = pd.read_csv("sales.csv")
df

Unnamed: 0,Customer ID,Customer Name,REGION,Sales($),Date
0,1,john smith,east,500,2025/01/15
1,2,MARY ANN,,700,2025-02-10
2,3,,West,300,15-03-2025
3,4,Alex,north,,2025-04-20
4,5,Sara,south,450,2025.05.22
5,6,David,EAST,600,
6,7,Kate,West,$550,2025-07-11


In [143]:
# Show the raw column names
print("Before cleaning:", df.columns.tolist())


Before cleaning: ['Customer ID ', ' Customer Name ', 'REGION ', ' Sales($) ', ' Date ']


In [144]:
# Clean them
df.columns = (
    df.columns
      .str.strip()                 # remove leading/trailing spaces
      .str.lower()                 # make lowercase
      .str.replace(" ", "_")       # spaces â†’ underscores
      .str.replace(r"[\(\)\$]", "", regex=True)  # drop ($) or ()
)

print("After cleaning:", df.columns.tolist())

After cleaning: ['customer_id', 'customer_name', 'region', 'sales', 'date']


In [145]:
# Drop rows with critical nulls (Customer Name or Region missing)
df = df.dropna(subset=["customer_name", "region"])
df




Unnamed: 0,customer_id,customer_name,region,sales,date
0,1,john smith,east,500,2025/01/15
1,2,MARY ANN,,700,2025-02-10
2,3,,West,300,15-03-2025
3,4,Alex,north,,2025-04-20
4,5,Sara,south,450,2025.05.22
5,6,David,EAST,600,
6,7,Kate,West,$550,2025-07-11


In [None]:
# Standardize text (title case for names, capitalize region)
# str.title() Converts each word in the string so that the first letter is uppercase and the rest are lowercase.
df["customer_name"] = df["customer_name"].str.strip().str.title() 
df["region"] = df["region"].str.strip().str.capitalize()

df

Unnamed: 0,customer_id,customer_name,region,sales,date
0,1,John Smith,East,500,2025/01/15
1,2,Mary Ann,,700,2025-02-10
2,3,,West,300,15-03-2025
3,4,Alex,North,,2025-04-20
4,5,Sara,South,450,2025.05.22
5,6,David,East,600,
6,7,Kate,West,$550,2025-07-11


In [147]:
df.rename(columns={"sales": "sales_amount"}, inplace=True)
df

Unnamed: 0,customer_id,customer_name,region,sales_amount,date
0,1,John Smith,East,500,2025/01/15
1,2,Mary Ann,,700,2025-02-10
2,3,,West,300,15-03-2025
3,4,Alex,North,,2025-04-20
4,5,Sara,South,450,2025.05.22
5,6,David,East,600,
6,7,Kate,West,$550,2025-07-11


In [148]:
# Clean Sales_Amount (remove $ and convert to numeric)
df["sales_amount"] = (
    df["sales_amount"].astype(str).str.replace(r"[^0-9.]", "", regex=True) # remove non-numeric characters
)
df["sales_amount"] = pd.to_numeric(df["sales_amount"], errors="coerce") # convert to numeric, set errors to NaN
df

Unnamed: 0,customer_id,customer_name,region,sales_amount,date
0,1,John Smith,East,500.0,2025/01/15
1,2,Mary Ann,,700.0,2025-02-10
2,3,,West,300.0,15-03-2025
3,4,Alex,North,,2025-04-20
4,5,Sara,South,450.0,2025.05.22
5,6,David,East,600.0,
6,7,Kate,West,550.0,2025-07-11


#### Python  Error

| Keyword | Type          | Use Case                  | Works for Non-Numeric? | Behavior in Arithmetic |
| ------- | ------------- | ------------------------- | ---------------------- | ---------------------- |
| `NaN`   | float         | Missing numeric data      | No                     | Returns `NaN`          |
| `None`  | NoneType      | General Python null       | Yes                    | Error in arithmetic    |
| `pd.NA` | pandas NAType | Missing data in any dtype | Yes                    | Returns `pd.NA`        |


In [None]:
# Handling Date column (handle multiple formats)

# Step 1: Clean date separators
df["date_clean"] = df["date"].astype(str).str.replace(r"[./]", "-", regex=True) # unify separators to '-'

# Step 2: Convert to datetime
df["date"] = pd.to_datetime(df["date_clean"], errors="coerce", dayfirst=True) # try parsing with day first

# Step 3: Drop intermediate column (date_clean)
df.drop(columns=["date_clean"], inplace=True) # drop intermediate column

df

  df["date"] = pd.to_datetime(df["date_clean"], errors="coerce", dayfirst=True) # try parsing with day first


Unnamed: 0,customer_id,customer_name,region,sales_amount,date
0,1,John Smith,East,500.0,2025-01-15
1,2,Mary Ann,,700.0,2025-02-10
2,3,,West,300.0,NaT
3,4,Alex,North,,2025-04-20
4,5,Sara,South,450.0,2025-05-22
5,6,David,East,600.0,NaT
6,7,Kate,West,550.0,2025-07-11


In [None]:
# Drop rows where Sales_Amount or Date is missing
df = df.dropna(subset=["sales_amount", "date"])
df

Unnamed: 0,customer_id,customer_name,region,sales_amount,date
0,1,John Smith,East,500.0,2025-01-15
1,2,Mary Ann,,700.0,2025-02-10
4,5,Sara,South,450.0,2025-05-22
6,7,Kate,West,550.0,2025-07-11


In [153]:
# Sort by Sales_Amount (descending)
df = df.sort_values(by="sales_amount", ascending=False)

print("\nCleaned DataFrame:")
print(df)


Cleaned DataFrame:
   customer_id customer_name region  sales_amount       date
1            2      Mary Ann                700.0 2025-02-10
6            7          Kate   West         550.0 2025-07-11
0            1    John Smith   East         500.0 2025-01-15
4            5          Sara  South         450.0 2025-05-22


In [154]:
# Step 9: Save cleaned CSV
df.to_csv("sales_cleaned.csv", index=False)