**Importing libraries for exploratory data analysis**

In [2]:
import pandas as pd
import numpy as np

**Loading and understanding the dataset**

In [3]:
# Read the retail sales CSV into a DataFrame.
# NOTE(review): absolute path, presumably a Colab upload location; adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/business.retailsales.csv')

In [4]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(1775, 6)

In [5]:
# Preview the first six rows to inspect the columns and sample values
df.head(6)

Unnamed: 0,Product Type,Net Quantity,Gross Sales,Discounts,Returns,Total Net Sales
0,Art & Sculpture,34,14935.0,-594.0,-1609.0,12732.0
1,Basket,13,3744.0,-316.8,0.0,3427.2
2,Basket,12,3825.0,-201.6,-288.0,3335.4
3,Basket,17,3035.0,-63.25,0.0,2971.75
4,Art & Sculpture,47,2696.8,-44.16,0.0,2652.64
5,Basket,17,2695.0,-52.5,-110.0,2532.5


**Performing data cleaning activities**

In [6]:
# Count the null entries in every column and report them
missing_data = df.isna().sum()
print("Missing data in each column:", missing_data, sep="\n")

Missing data in each column:
Product Type       8
Net Quantity       0
Gross Sales        0
Discounts          0
Returns            0
Total Net Sales    0
dtype: int64


In [7]:
# Prefix the monetary columns with "$" and fix them to two decimal places.
# NOTE: after this cell these columns hold strings, not numbers, so any
# numeric work on them (sums, means, comparisons) has to happen before it.
columns_to_format = ['Gross Sales', 'Discounts', 'Returns', 'Total Net Sales']
for column in columns_to_format:
    # Format only columns that exist and are still numeric. On a re-run the
    # values are already "$..." strings, and applying the ".2f" format spec
    # to a str raises ValueError; the dtype guard makes the cell safe to re-run.
    if column in df.columns and pd.api.types.is_numeric_dtype(df[column]):
        # Nulls are passed through untouched so later missing-data checks still see them.
        df[column] = df[column].apply(lambda x: f"${x:.2f}" if pd.notnull(x) else x)


In [8]:
# Flag every row that exactly repeats an earlier row (all columns compared)
duplicate_mask = df.duplicated()
duplicates_count = duplicate_mask.sum()
print(f"\nNumber of duplicate rows: {duplicates_count}")

# Keep only the first occurrence of each row
df = df[~duplicate_mask]


Number of duplicate rows: 512


In [12]:
# Re-count nulls per column now that duplicate rows are gone
missing_data = df.isna().sum()
print("\nMissing data after removing duplicates:", missing_data, sep="\n")

# Discard every row that still contains at least one null value
df_cleaned = df.dropna(axis=0, how="any")


Missing data after removing duplicates:
Product Type       7
Net Quantity       0
Gross Sales        0
Discounts          0
Returns            0
Total Net Sales    0
dtype: int64


In [13]:
# Write the cleaned frame to a CSV file, leaving the row index out of the file
cleaned_file_path = 'cleaned_dataset.csv'
df_cleaned.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"\nCleaned dataset saved to {cleaned_file_path}")


Cleaned dataset saved to cleaned_dataset.csv
