# Steps Explained:
1. Set the Working Directory: Set and verify the working directory.
2. Load the Dataset: Load the dataset and display the first few rows.
3. Basic Information: Display basic information about the dataset.
4. Summary Statistics: Display summary statistics of the dataset.
5. Handling Missing Values: Display the number of missing values for each column; visualize missing values using a heatmap; fill or drop missing values.
6. Handling Duplicate Rows: Display the number of duplicate rows; drop duplicate rows.
7. Handling Outliers: Visualize numerical features to detect outliers; remove outliers using Z-score or other methods.
8. Converting Data Types: Display the data types of each column; convert data types if necessary.
9. Handling Categorical Data: Display unique values for categorical features; encode categorical variables using one-hot encoding or label encoding.
10. Feature Scaling: Normalize or standardize numerical features.
11. Save Cleaned Dataset: Save the cleaned dataset to a new file.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set the working directory
working_directory = 'your_directory_path'  # Replace with your desired directory path
# os.chdir raises OSError (e.g. FileNotFoundError) if the path is not a usable directory
os.chdir(working_directory)

# Verify the change by echoing the directory the process is now running in
print(f"Working directory: {os.getcwd()}")

In [None]:
# Load the dataset
# NOTE: both loaders below are commented out, so nothing in the visible cells
# defines `df`. Enable exactly one of them (pointing at your own file) before
# running this cell; otherwise the statements below fail with NameError.
# df = pd.read_csv('your_dataset.csv')  # Load your dataset here
# df = pd.read_excel('your_dataset.xlsx')  # Alternatively, load an Excel file

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

# Display basic information about the dataset
print("\nBasic Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Display summary statistics of the dataset
print("\nSummary Statistics:")
print(df.describe())

In [None]:
# Handling Missing Values
# Report how many entries are missing in every column.
print("\nMissing Values:")
missing_mask = df.isnull()  # True wherever a cell is missing
print(missing_mask.sum())

# Draw the same mask as a heatmap: one row per record, one column per feature.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(missing_mask, cbar=False, cmap='viridis', ax=heatmap_ax)
heatmap_ax.set_title('Missing Values Heatmap')
plt.show()

# Handle missing values: pick ONE of the strategies below and uncomment it.
# (On pandas >= 2.0, mean()/median() raise on non-numeric columns unless
# numeric_only=True is passed.)

# Strategy: replace missing values with the column mean
# df.fillna(df.mean(), inplace=True)

# Strategy: replace missing values with the column median
# df.fillna(df.median(), inplace=True)

# Strategy: replace missing values with the column mode (most frequent value)
# df.fillna(df.mode().iloc[0], inplace=True)

# Strategy: discard every row that contains a missing value
# df.dropna(inplace=True)


In [None]:
# Handling Duplicate Rows
# Report how many rows repeat an earlier row across every column.
print("\nDuplicate Rows:")
duplicate_count = df.duplicated().sum()
print(duplicate_count)

# Remove the repeats in place; the first occurrence of each row is kept.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Handling Outliers
# Visualize numerical features to detect outliers: one box plot per numeric column
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
for feature in numerical_features:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=df[feature])
    plt.title(f'Box Plot of {feature}')
    plt.show()

# Handle outliers
# Example: Remove outliers using Z-score
from scipy import stats

# nan_policy='omit' keeps missing values from poisoning a column: with the
# default ('propagate') a single NaN turns every z-score in that column into
# NaN, and the filter below would then discard every row of the dataset.
z_scores = np.abs(
    np.asarray(stats.zscore(df[numerical_features], nan_policy='omit'), dtype=float)
)

# Keep a row when each of its z-scores is below 3. A NaN z-score (missing
# value, or a constant column whose standard deviation is 0) carries no
# evidence of an outlier, so it does not disqualify the row.
within_threshold = (z_scores < 3) | np.isnan(z_scores)

# .copy() makes the result an independent frame, so later column assignments
# (e.g. scaling) do not operate on a slice of the unfiltered data.
df = df[within_threshold.all(axis=1)].copy()

In [None]:
# Converting Data Types
# Show the dtype pandas currently assigns to every column.
column_dtypes = df.dtypes
print("\nData Types:")
print(column_dtypes)

# Adjust dtypes where the listing above shows a mismatch; uncomment and adapt
# the templates below as needed.

# Template: parse a column as datetime
# df['date_column'] = pd.to_datetime(df['date_column'])

# Template: cast a column to a specific type
# df['numeric_column'] = df['numeric_column'].astype(float)

In [None]:
# Handling Categorical Data
# Display unique values for categorical features.
# Both plain `object` columns and pandas `category` columns are treated as
# categorical, so columns converted with .astype('category') are not skipped.
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
for feature in categorical_features:
    print(f"\nUnique values in {feature}:")
    print(df[feature].unique())

# Encoding categorical variables (uncomment ONE approach)
# Example: One-hot encoding
# df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Example: Label encoding
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# for feature in categorical_features:
#     df[feature] = le.fit_transform(df[feature])

# Feature Scaling
# Normalize or standardize numerical features
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Example: Standardization
# scaler = StandardScaler()
# df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Example: Normalization
# scaler = MinMaxScaler()
# df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Save the cleaned dataset to a new file
# df.to_csv('cleaned_dataset.csv', index=False)

# Printed unconditionally: nothing is written to disk unless the to_csv line
# above has been uncommented.
print("\nData cleaning complete. Cleaned dataset saved (if applicable).")