**Importing libraries for exploratory data analysis**

In [1]:
import pandas as pd
import numpy as np

**Loading and understanding the dataset**

In [3]:
# Read the raw CSV (path is relative to the working directory) into a DataFrame
df = pd.read_csv('dangote-biz-analytics.csv')

In [4]:
# Show the (rows, columns) dimensions of the loaded frame
df.shape

(1775, 7)

In [5]:
# Preview the first six rows
df.head(6)

Unnamed: 0,DATE,Product Type,Production Capacity Per day/(Liters),Production Volume per day/(Liters),Total Revenue,Operating Costs (USD),Net Profit (USD)
0,1/12/2023,Aviation Fuel,90000,240000,7533000.0,86000,7447000.0
1,1/12/2023,Aviation Fuel,630000,310000,52731000.0,86000,52645000.0
2,1/12/2023,Aviation Fuel,90000,185000,7533000.0,86000,7447000.0
3,1/12/2023,Black oil,540000,240000,39690000.0,82000,39608000.0
4,1/12/2023,Black oil,630000,310000,46305000.0,82000,46223000.0
5,1/12/2023,Black oil,90000,185000,6615000.0,82000,6533000.0


**Performing data cleaning activities**

In [6]:
# Tally null entries per column and show the counts
missing_data = df.isna().sum()
print("Missing data in each column:", missing_data, sep="\n")

Missing data in each column:
DATE                                    0
Product Type                            8
Production Capacity Per day/(Liters)    0
Production Volume per day/(Liters)      0
Total Revenue                           0
Operating Costs (USD)                   0
Net Profit (USD)                        0
dtype: int64


In [7]:
# Render the listed money columns as "$"-prefixed strings with two decimals.
# NOTE(review): none of these names appear in the column list printed by the
# missing-data check for this dataset, so every column is skipped here and df
# is left unchanged -- confirm which columns were meant.
columns_to_format = ['Gross Sales', 'Discounts', 'Returns', 'Total Net Sales']


def _with_dollar_sign(x):
    """Return x formatted as "$" plus x to two decimals, or x itself when it is null."""
    if pd.notnull(x):
        return f"${x:.2f}"
    return x


for column in columns_to_format:
    if column not in df.columns:
        continue  # column absent from this frame: nothing to format
    df[column] = df[column].apply(_with_dollar_sign)


In [8]:
# Flag rows that exactly repeat an earlier row (compared across all columns)
is_repeat = df.duplicated()
duplicates_count = is_repeat.sum()
print(f"\nNumber of duplicate rows: {duplicates_count}")

# Keep only the first occurrence of each row
df = df[~is_repeat]


Number of duplicate rows: 383


In [9]:
# Re-count nulls per column on the de-duplicated frame
missing_data = df.isna().sum()
print("\nMissing data after removing duplicates:", missing_data, sep="\n")

# Build the cleaned frame by discarding every row that still contains a null
df_cleaned = df.dropna()


Missing data after removing duplicates:
DATE                                    0
Product Type                            8
Production Capacity Per day/(Liters)    0
Production Volume per day/(Liters)      0
Total Revenue                           0
Operating Costs (USD)                   0
Net Profit (USD)                        0
dtype: int64


In [11]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file
cleaned_file_path = 'cleaned_dataset.csv'
df_cleaned.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"\nCleaned dataset saved to {cleaned_file_path}")


Cleaned dataset saved to cleaned_dataset.csv
