In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the scraped Telegram export into a DataFrame
df = pd.read_csv('../data/telegram_scraped_data.csv')

# Peek at the leading records
print("First 5 rows of the dataset:")
print(df.head())

# Summary of columns, dtypes and non-null counts
print("\nDataset Information:")
df.info()

# Boolean mask of missing cells, reused for both the counts and the heatmap
null_mask = df.isnull()
missing_values = null_mask.sum()
print("\nMissing Values in Each Column:")
print(missing_values)

# Draw the missing-data heatmap only when at least one cell is missing
if not null_mask.values.any():
    print("\nNo missing values to plot.")
else:
    plt.figure(figsize=(10,6))
    sns.heatmap(null_mask, cbar=False, cmap='viridis')
    plt.title('Missing Data Heatmap')
    plt.show()

# Summary statistics as reported by DataFrame.describe()
print("\nDescriptive Statistics of Numerical Columns:")
print(df.describe())

# Checking column types and converting to appropriate formats if needed.
# Only columns that are not already numeric or datetime are candidates:
# pd.to_datetime() accepts plain numbers (it reads them as epoch offsets), so
# running it on every column would silently turn numeric data into
# 1970-era timestamps.
for col in df.columns:
    already_typed = (
        pd.api.types.is_numeric_dtype(df[col])
        or pd.api.types.is_datetime64_any_dtype(df[col])
    )
    if already_typed:
        continue
    try:
        converted = pd.to_numeric(df[col])
    except (ValueError, TypeError):
        # Not numeric: fall back to datetime parsing. Both exception types are
        # caught because pandas may raise either for unconvertible input.
        try:
            converted = pd.to_datetime(df[col])
        except (ValueError, TypeError):
            continue  # Neither numeric nor datetime: leave the column as-is
    df[col] = converted

# Columns selected for plotting: numeric and datetime dtypes
numerical_columns = df.select_dtypes(include=['number', 'datetime']).columns
# Explicit length check: the truth value of a pandas Index is ambiguous
has_plottable_columns = len(numerical_columns) > 0

# Histograms of the selected columns
if not has_plottable_columns:
    print("\nNo numerical or datetime columns available for histogram.")
else:
    df[numerical_columns].hist(figsize=(10,10), bins=20, color='blue')
    plt.suptitle('Distribution of Numerical Columns')
    plt.show()

# Annotated heatmap of the pairwise correlations between the selected columns
if not has_plottable_columns:
    print("\nNo numerical columns to plot correlation matrix.")
else:
    plt.figure(figsize=(12, 8))
    correlation = df[numerical_columns].corr()
    sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix')
    plt.show()

# Collect the rows flagged by DataFrame.duplicated() and report how many there are
duplicate_mask = df.duplicated()
duplicate_rows = df[duplicate_mask]
print(f"\nNumber of duplicate rows: {len(duplicate_rows)}")

# Remove the duplicates in place so `df` keeps referring to the same object
df.drop_duplicates(inplace=True)

# Write the de-duplicated frame to disk without the index column
cleaned_path = '../data/telegram_scrapped_data_cleaned.csv'
df.to_csv(cleaned_path, index=False)
print("\nCleaned data saved as 'telegram_scrapped_data_cleaned.csv'")

# Final report
print("\nFinal dataset shape after cleaning:")
print(df.shape)
print("\nEDA Completed.")
