In [2]:
import pandas as pd
# Load the raw sales records; every later cell reads from this frame.
df = pd.read_csv(filepath_or_buffer="sales_data.csv")


In [3]:
# First look at the raw frame: full contents, dimensions, per-column null counts.
print("Original dataset:", df, sep="\n")

raw_shape = df.shape
print(f"\nShape: {raw_shape}")

null_counts = df.isnull().sum()
print(f"\nMissing values:\n{null_counts}")


Original dataset:
   order_id region   product  quantity  price
0      1001   East  Keyboard       2.0   1500
1      1002   West     Mouse       5.0    500
2      1003   East   Monitor       NaN  12000
3      1004  South  Keyboard       1.0   1500
4      1005   West   Monitor       2.0  12000

Shape: (5, 5)

Missing values:
order_id    0
region      0
product     0
quantity    1
price       0
dtype: int64


In [5]:
# Keep only West-region orders with more than one unit. A NaN quantity fails
# the `> 1` comparison, so such rows are excluded by the mask as well.
in_west = df["region"] == "West"
multi_unit = df["quantity"] > 1
filtered_df = df.loc[in_west & multi_unit]

# Print the result so the filter can be checked by eye.
print("Filtered dataset:", filtered_df, sep="\n")


Filtered dataset:
   order_id region  product  quantity  price
1      1002   West    Mouse       5.0    500
4      1005   West  Monitor       2.0  12000


In [6]:
# Drop rows that have no quantity, reporting per-column null counts on both
# sides of the operation plus the change in row count.
nulls_before = df.isnull().sum()
print("Missing values before cleaning:", nulls_before, sep="\n")

clean_df = df.dropna(subset=["quantity"])

nulls_after = clean_df.isnull().sum()
print("Missing values after cleaning:", nulls_after, sep="\n")

rows_before, rows_after = len(df), len(clean_df)
print(f"\nRows before: {rows_before}, Rows after: {rows_after}")


Missing values before cleaning:
order_id    0
region      0
product     0
quantity    1
price       0
dtype: int64
Missing values after cleaning:
order_id    0
region      0
product     0
quantity    0
price       0
dtype: int64

Rows before: 5, Rows after: 4


In [None]:
# Flag rows that repeat an earlier row across every column, then show and count them.
dup_mask = clean_df.duplicated()
print("Duplicate rows:")
print(clean_df[dup_mask])

dup_total = dup_mask.sum()
print(f"\nTotal duplicates: {dup_total}")

# Discard the repeated rows and report how many rows remain.
clean_df = clean_df.drop_duplicates()
remaining = len(clean_df)
print(f"Rows after removing duplicates: {remaining}")


Duplicate rows:

Total duplicates: 0
Rows after removing duplicates: 4


In [9]:
# Persist the cleaned frame as CSV, leaving the index column out of the file.
clean_df.to_csv(path_or_buf="clean_sales_data.csv", index=False)

# Report where the data went and the dimensions of what was written
# (this only prints; it does not check the file on disk).
final_shape = clean_df.shape
print("Clean dataset saved to: clean_sales_data.csv")
print(f"Final shape: {final_shape}")


Clean dataset saved to: clean_sales_data.csv
Final shape: (4, 5)


In [11]:
#complete cleaning function 
def clean_sales_data(input_file, output_file, region="West", min_quantity=1):
    """
    Clean sales data by handling missing values, filtering, and removing duplicates.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read. It must contain "region" and "quantity" columns.
    output_file : str or path-like
        Destination CSV; written without the index column.
    region : str, default "West"
        Only rows whose "region" equals this value are kept.
    min_quantity : int or float, default 1
        Exclusive lower bound: a row is kept only when quantity > min_quantity.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, identical to what was written to ``output_file``.
    """
    # Load data
    raw_df = pd.read_csv(input_file)

    # Handle missing values first: a NaN quantity can never satisfy the
    # threshold comparison below, so dropping it up front makes that explicit
    # instead of relying on the comparison to silently discard it.
    complete_df = raw_df.dropna(subset=["quantity"])

    # Filter on the requested region and quantity threshold
    keep = (complete_df["region"] == region) & (complete_df["quantity"] > min_quantity)
    filtered_df = complete_df.loc[keep]

    # Remove fully duplicated rows
    clean_df = filtered_df.drop_duplicates()

    # Save cleaned data
    clean_df.to_csv(output_file, index=False)

    print("Cleaning complete!")
    print(f"Original rows: {len(raw_df)}")
    print(f"Cleaned rows: {len(clean_df)}")
    print(f"Output saved to: {output_file}")

    return clean_df


In [12]:

# Run the full pipeline on the raw CSV; this rewrites clean_sales_data.csv
# and rebinds clean_df to the frame the function returns.
clean_df = clean_sales_data(
    input_file="sales_data.csv",
    output_file="clean_sales_data.csv",
)


Cleaning complete!
Original rows: 5
Cleaned rows: 2
Output saved to: clean_sales_data.csv
