In [None]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Setting working directory
#working_directory = 'your_directory_path'
#os.chdir(working_directory)

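# Load the dataset: uncomment exactly one of the lines below and set the path.
# Every later cell expects the loaded DataFrame to be named `df`.
# df = pd.read_csv('your_dataset.csv')  # Load your dataset here
# df = pd.read_excel('your_dataset.xlsx')  # Alternatively, load an Excel file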

In [None]:
# Quick overview of the dataset: first rows, structural info, summary statistics.
print("First few rows of the dataset:")
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nBasic Information:")
df.info()

# describe() returns a DataFrame, so this one does need print().
print("\nSummary Statistics:")
print(df.describe())

In [None]:
# Build the boolean null mask once; it feeds both the per-column counts and
# the heatmap below.
missing_mask = df.isnull()

# Per-column count of missing entries
print("\nMissing Values:")
print(missing_mask.sum())

# Heatmap of the mask: each highlighted cell marks a missing entry
plt.figure(figsize=(10, 6))
sns.heatmap(missing_mask, cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()

# Handling missing values is left to the user.
# Example: fill numeric gaps with the column mean
# df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Report the dtype pandas assigned to every column
column_types = df.dtypes
print("\nData Types:")
print(column_types)

# Cast columns here when a dtype is wrong.
# Example: parse a text column as datetimes
# df['date_column'] = pd.to_datetime(df['date_column'])

In [None]:
# Collect the names of all numeric columns; later cells reuse this list.
numeric_columns = df.select_dtypes(include=[np.number]).columns
numerical_features = numeric_columns.tolist()

# One histogram (with a KDE overlay) per numeric column
for feature in numerical_features:
    column_values = df[feature]
    plt.figure(figsize=(10, 6))
    sns.histplot(column_values, kde=True)
    plt.title(f'Distribution of {feature}')
    plt.show()

In [None]:
# Visualize the distribution of categorical features.
# Besides plain object (text) columns, also pick up columns that use pandas'
# dedicated 'category' dtype; selecting only `object` would skip them.
# Later cells reuse this list.
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

# One horizontal count plot per categorical column
for feature in categorical_features:
    plt.figure(figsize=(10, 6))
    sns.countplot(y=df[feature])
    plt.title(f'Distribution of {feature}')
    plt.show()

In [None]:
# Correlation matrix to understand relationships between numerical features.
# The correlation is computed on the numeric columns only: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# when the frame contains text columns.
plt.figure(figsize=(12, 8))
corr_matrix = df[numerical_features].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# Pair plot to visualize pairwise relationships between numerical features
sns.pairplot(df[numerical_features])
plt.show()

In [None]:
# Box plots to detect outliers in numerical features
for feature in numerical_features:
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=df[feature])
    plt.title(f'Box Plot of {feature}')
    plt.show()

# Identify and handle outliers
# Example: remove rows where any numeric column lies 3 or more standard
# deviations away from its column mean (Z-score rule).
Z_THRESHOLD = 3

# Z-scores are computed with pandas so that missing values are skipped rather
# than propagated. ddof=0 (population standard deviation) matches the default
# of scipy.stats.zscore.
numeric_values = df[numerical_features]
z_scores = ((numeric_values - numeric_values.mean()) / numeric_values.std(ddof=0)).abs()

# A z-score is NaN where the value itself is missing or the column is constant
# (zero standard deviation). NaN < threshold is False, which would discard the
# row, so undefined z-scores are explicitly treated as "not an outlier".
within_threshold = (z_scores < Z_THRESHOLD) | z_scores.isna()
df = df[within_threshold.all(axis=1)]

In [None]:
# Group by and aggregation to understand categorical features better.
# The mean is taken over the numeric columns only: since pandas 2.0,
# GroupBy.mean() raises on non-numeric columns instead of dropping them, which
# happens as soon as the frame has more than one text column.
for feature in categorical_features:
    print(f"\nAggregation for {feature}:")
    print(df.groupby(feature)[numerical_features].mean())

# Visualize the relationship between categorical and numerical features:
# one box plot per (categorical, numerical) column pair
for cat_feature in categorical_features:
    for num_feature in numerical_features:
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=df[cat_feature], y=df[num_feature])
        plt.title(f'{num_feature} by {cat_feature}')
        plt.show()

In [None]:
# Persist the cleaned frame to disk (uncomment to enable)
# df.to_csv('cleaned_dataset.csv', index=False)

# Final status line for the notebook run
completion_message = "\nEDA complete. Cleaned dataset saved (if applicable)."
print(completion_message)