In [1]:
import pandas as pd
import numpy as np
import os 

In [2]:
# Read the raw orders export into a DataFrame.
raw_orders_path = "List of Orders.csv"
orders = pd.read_csv(raw_orders_path)

In [3]:
# Preview the first rows to sanity-check how the CSV columns were parsed.
orders.head()

Unnamed: 0,Order ID,Order Date,CustomerName,State,City
0,B-25601,01-04-2018,Bharat,Gujarat,Ahmedabad
1,B-25602,01-04-2018,Pearl,Maharashtra,Pune
2,B-25603,03-04-2018,Jahan,Madhya Pradesh,Bhopal
3,B-25604,03-04-2018,Divsha,Rajasthan,Jaipur
4,B-25605,05-04-2018,Kasheen,West Bengal,Kolkata


In [4]:
# (row count, column count) of the frame as loaded, before any cleaning.
orders.shape

(560, 5)

In [5]:
# Per-column dtype and non-null count; a non-null count below the total
# number of entries means that column has missing values.
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Order ID      500 non-null    object
 1   Order Date    500 non-null    object
 2   CustomerName  500 non-null    object
 3   State         500 non-null    object
 4   City          500 non-null    object
dtypes: object(5)
memory usage: 22.0+ KB


In [6]:
# Number of missing values in each column.
orders.isna().sum()

Order ID        60
Order Date      60
CustomerName    60
State           60
City            60
dtype: int64

In [7]:
# Normalise the header names step by step:
#   1. trim surrounding whitespace,
#   2. lower-case everything,
#   3. turn inner spaces into underscores.
trimmed_names = orders.columns.str.strip()
lowercase_names = trimmed_names.str.lower()
orders.columns = lowercase_names.str.replace(" ", "_")

# Display the resulting header names as a check.
orders.columns


Index(['order_id', 'order_date', 'customername', 'state', 'city'], dtype='object')

In [8]:
# Parse order dates written as day-month-year (e.g. 01-04-2018).
#
# errors="coerce" converts anything that does not match the format into NaT
# instead of raising. Rows with a NaT order_date are removed by the later
# dropna step, so a malformed (but non-empty) date would otherwise vanish
# without a trace. Count those cases here and report them.
raw_order_date = orders["order_date"]
parsed_order_date = pd.to_datetime(
    raw_order_date,
    format="%d-%m-%Y",
    errors="coerce",
)

# A value that was populated before parsing but is NaT afterwards failed to parse.
n_unparseable = int((raw_order_date.notna() & parsed_order_date.isna()).sum())
if n_unparseable:
    print(
        f"WARNING: {n_unparseable} non-empty order_date value(s) did not match "
        "DD-MM-YYYY and were set to NaT"
    )

orders["order_date"] = parsed_order_date

# Verify conversion
orders.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560 entries, 0 to 559
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   order_id      500 non-null    object        
 1   order_date    500 non-null    datetime64[ns]
 2   customername  500 non-null    object        
 3   state         500 non-null    object        
 4   city          500 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 22.0+ KB


In [9]:
# Keep only rows where both the order identifier and the order date are present.

required_columns = ["order_id", "order_date"]
orders = orders.dropna(subset=required_columns)

In [10]:
# Count rows whose order_id repeats an earlier row's order_id.

orders.duplicated(subset=["order_id"]).sum()

np.int64(0)

In [11]:
# De-duplicate on order_id, keeping the first occurrence of each id.

orders = orders.drop_duplicates(subset=["order_id"], keep="first")

In [12]:
# Trim leading/trailing whitespace from the free-text columns.
#
# The .str accessor is applied directly, without casting through astype(str)
# first: on object columns that cast turns missing values into the literal
# text "nan", which would hide them from later isna() checks and leak "nan"
# into the exported file. .str.strip() leaves missing values missing.
#
# assign() builds a new frame rather than writing columns into the result of
# the preceding filter steps, which avoids SettingWithCopyWarning.

text_cols = ["customername", "state", "city"]

orders = orders.assign(**{col: orders[col].str.strip() for col in text_cols})


In [13]:
# Final dataset overview: dtypes and non-null counts after the cleaning steps
# above. Every column should now report the same non-null count as the number
# of entries.

orders.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   order_id      500 non-null    object        
 1   order_date    500 non-null    datetime64[ns]
 2   customername  500 non-null    object        
 3   state         500 non-null    object        
 4   city          500 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 23.4+ KB


In [14]:
# Primary key validation: the number of distinct order_id values must equal
# the number of rows.

len(orders) == orders["order_id"].nunique()

True

In [15]:
# Write the cleaned table to disk, creating the target folder if it is absent.
cleaned_csv_path = "cleaned_data/orders_cleaned.csv"
os.makedirs(os.path.dirname(cleaned_csv_path), exist_ok=True)

orders.to_csv(cleaned_csv_path, index=False)