In [6]:
import pandas as pd

# Load datasets
mine_df = pd.read_csv('Mine.csv')
sales_df = pd.read_csv('Sales.csv')

In [7]:
# Display initial information about datasets
print("Mine Dataset Info:")
print(mine_df.info())
print("\nSales Dataset Info:")
print(sales_df.info())

Mine Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  31 non-null     int64  
 1   Date      29 non-null     object 
 2   Pulse     30 non-null     float64
 3   Maxpulse  29 non-null     float64
 4   Calories  27 non-null     float64
dtypes: float64(3), int64(1), object(1)
memory usage: 1.3+ KB
None

Sales Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Order ID       8 non-null      int64  
 1   Customer Name  7 non-null      object 
 2   Order Date     8 non-null      object 
 3   Product        8 non-null      object 
 4   Quantity       7 non-null      float64
 5   Unit Price     7 non-null      float64
 6   Total Revenue  7 non-null      float64
dtypes: float64(3), in

In [8]:
# Data Cleaning Process for Mine.csv
# 1. Drop rows with Missing Values
mine_df.dropna(inplace=True)

In [10]:
# 2. Standardize Date Formats
mine_df['Date'] = pd.to_datetime(mine_df['Date'], errors='coerce')  # Coerce errors to NaT for invalid formats

In [11]:
# 3. Remove Duplicate Rows
mine_df.drop_duplicates(inplace=True)

In [12]:
# 4. Handling Wrong Data
# Example: Correcting 'Date' column entries that are not in the correct format
# In this dataset, there are incorrect dates like '20231018' which should be '2023/10/18'
mine_df['Date'] = pd.to_datetime(mine_df['Date'], errors='coerce', format='%Y/%m/%d')

In [13]:
# 5. Drop Unnecessary Columns
columns_to_drop = ['unnecessary_column1', 'unnecessary_column2']
mine_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')  # errors='ignore' to handle if column doesn't exist

In [15]:
# Save cleaned dataset
mine_df.to_csv('cleaned_mine.csv', index=False)

In [16]:
print("\nCleaned Sales Dataset Info:")
print(sales_df.info())


Cleaned Sales Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Order ID       8 non-null      int64  
 1   Customer Name  7 non-null      object 
 2   Order Date     8 non-null      object 
 3   Product        8 non-null      object 
 4   Quantity       7 non-null      float64
 5   Unit Price     7 non-null      float64
 6   Total Revenue  7 non-null      float64
dtypes: float64(3), int64(1), object(3)
memory usage: 580.0+ bytes
None


In [18]:
# Data Cleaning Process for Mine.csv
# Convert relevant columns to numeric
numeric_columns = ['Quantity', 'Unit Price', 'Total Revenue']
sales_df[numeric_columns] = sales_df[numeric_columns].apply(pd.to_numeric, errors='coerce')

In [19]:
# Drop rows with any NaN values
sales_df.dropna(inplace=True)

In [20]:
# Standardize Date Formats if needed
# Example: Convert 'Order Date' to datetime format
sales_df['Order Date'] = pd.to_datetime(sales_df['Order Date'], errors='coerce', format='%d/%m/%Y')

In [21]:
# Remove Duplicate Rows
sales_df.drop_duplicates(inplace=True)

In [22]:
# Correcting Wrong Data if needed
# Example: Ensure Quantity values are non-negative
sales_df['Quantity'] = sales_df['Quantity'].apply(lambda x: abs(x) if x < 0 else x)

In [23]:
# Save cleaned dataset
sales_df.to_csv('cleaned_sales.csv', index=False)

In [24]:
# Display cleaned dataset info
print("\nCleaned Sales Dataset Info:")
print(sales_df.info())


Cleaned Sales Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 0 to 7
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Order ID       5 non-null      int64         
 1   Customer Name  5 non-null      object        
 2   Order Date     4 non-null      datetime64[ns]
 3   Product        5 non-null      object        
 4   Quantity       5 non-null      float64       
 5   Unit Price     5 non-null      float64       
 6   Total Revenue  5 non-null      float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(2)
memory usage: 320.0+ bytes
None
