# 01 — EDA CNH-PSX (Mendeley)
Exploratory Data Analysis of the main dataset: categorized PSX news headlines.
**Goal**: understand the structure, quality, and distribution of the data before moving to preprocessing.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the two CNH-PSX releases published on Mendeley Data.
# The CSVs are not fetched automatically: download them from
# https://data.mendeley.com/datasets/mc4s7zvx9c/1 and drop them into data/raw/
# (paths below are relative to this notebook's directory).
df_v1 = pd.read_csv('../data/raw/CNH-PSX_Ver1.csv')
df_v2 = pd.read_csv('../data/raw/CNH-PSX_Ver2.csv')

# Quick size check: (rows, columns) for each release
for tag, frame in (('V1', df_v1), ('V2', df_v2)):
    print(f'{tag} shape:', frame.shape)

In [None]:
# First rows and column dtypes of each release, printed under a banner.
# The V2 banner carries a leading newline so the two sections are separated.
for banner, frame in (('--- V1 ---', df_v1), ('\n--- V2 ---', df_v2)):
    print(banner)
    print(frame.head())
    print(frame.dtypes)

In [None]:
# Per-column count of missing values in each release
for tag, frame in (('V1', df_v1), ('V2', df_v2)):
    print(f'Nulls {tag}:\n', frame.isna().sum())

In [None]:
# Category distribution (adjust column name if needed)
cat_col = 'category'  # adjust to actual column name
if cat_col in df_v2.columns:
    # value_counts() orders categories by frequency and leaves out NaN by default
    df_v2[cat_col].value_counts().plot(kind='bar', figsize=(10, 4), title='Category Distribution (V2)')
    plt.tight_layout()
    plt.show()
else:
    # Say so explicitly instead of silently producing no output when the
    # placeholder column name does not match the dataset.
    print(f"Column '{cat_col}' not found in V2; available columns: {list(df_v2.columns)}")

In [None]:
# Temporal distribution: number of articles per calendar month
date_col = 'date'  # adjust if needed
if date_col in df_v2.columns:
    # Parse in place; values that cannot be parsed become NaT
    df_v2[date_col] = pd.to_datetime(df_v2[date_col], errors='coerce')

    valid_dates = df_v2[date_col].dropna()
    n_excluded = len(df_v2) - len(valid_dates)
    if n_excluded:
        # Surface the rows that cannot be placed on the timeline
        print(f'{n_excluded} rows with missing/unparseable dates excluded from the monthly plot')

    if valid_dates.empty:
        print(f"Column '{date_col}' contains no parseable dates; nothing to plot")
    else:
        # Count by monthly Period rather than resample('M'): the 'M' offset alias
        # is deprecated since pandas 2.2 in favour of 'ME', which older versions
        # do not recognise. The Period alias 'M' is valid on both sides.
        months = valid_dates.dt.to_period('M')
        all_months = pd.period_range(months.min(), months.max(), freq='M')
        # Reindex over the full range so months without articles show up as 0
        monthly_counts = months.value_counts().reindex(all_months, fill_value=0)
        monthly_counts.plot(figsize=(12, 4), title='Articles per Month')
        plt.tight_layout()
        plt.show()
else:
    print(f"Column '{date_col}' not found in V2; available columns: {list(df_v2.columns)}")

In [None]:
# Distribution of headline length, measured in whitespace-separated words
text_col = 'headline'  # adjust if needed
if text_col in df_v2.columns:
    # Split each headline on whitespace and count the resulting tokens
    word_counts = df_v2[text_col].str.split().str.len()
    df_v2['headline_len'] = word_counts

    # Series.hist returns the Axes it drew on; title it directly
    ax = df_v2['headline_len'].hist(bins=30, figsize=(8, 4))
    ax.set_title('Headline Length Distribution (words)')
    plt.show()

    # Summary statistics (count, mean, quartiles, ...) of the word counts
    print(df_v2['headline_len'].describe())

In [None]:
# Count headlines that repeat an earlier row's text (first occurrence not counted).
# Relies on text_col being set by the headline-length cell.
if text_col in df_v2.columns:
    repeated_mask = df_v2[text_col].duplicated()
    dupes = repeated_mask.sum()
    print(f'Duplicate headlines in V2: {dupes}')

## EDA Conclusions
- [ ] Available columns: ...
- [ ] Date range: ...
- [ ] Main categories: ...
- [ ] Duplicates to clean: ...
- [ ] Average headline length: ...
- [ ] Key points to address in preprocessing: ...