# Initial exploratory data analysis

## 1. Imports and configuration

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.config import DATA_FILES

# Set plotting style: apply seaborn's "whitegrid" style for the plots drawn below.
sns.set(style="whitegrid")
# IPython magics: load the autoreload extension and switch it to mode 2.
# NOTE(review): presumably enabled so edits to local modules such as
# `src.config` are picked up without restarting the kernel — confirm.
%load_ext autoreload
%autoreload 2


## 2. Load data

In [None]:
# Load training and test datasets.
# DATA_FILES (imported from src.config) is indexed by dataset key; the values
# are handed straight to pandas.read_csv.
train_path = DATA_FILES["train_raw"]
test_path = DATA_FILES["test_raw"]

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# The check mark is spelled as a \N{...} escape: the literal emoji had been
# corrupted into mojibake by an encoding round trip, and the escape is ASCII-safe.
print("\N{WHITE HEAVY CHECK MARK} Data loaded successfully.")


## 3. Basic info

In [None]:
# Print the info() report of each raw frame, followed by its first rows.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in display() rendered a stray "None" after each report.
# The book emoji are spelled as \N{...} escapes because the literal characters
# had been corrupted into mojibake; the escapes are ASCII-safe.

# Train set info
print("\N{BLUE BOOK} Training Set Info")
train_df.info()
display(train_df.head())

# Test set info
print("\N{GREEN BOOK} Test Set Info")
test_df.info()
display(test_df.head())


## 4. Data shape


In [None]:
# Report the (rows, columns) tuple of each frame, one frame per output line.
print(
    f"Training set shape: {train_df.shape}",
    f"Test set shape: {test_df.shape}",
    sep="\n",
)


## 5. Missing values


In [None]:
# For each frame, print a heading and then show the per-column count of null
# entries. The second heading starts with a newline to separate the two reports.
for heading, frame in (
    ("Missing values (Train):", train_df),
    ("\nMissing values (Test):", test_df),
):
    print(heading)
    display(frame.isnull().sum())


## 6. Target column distribution


In [None]:
# Plot class balance: one bar per distinct value of the "label" column.
# countplot returns the Axes it drew on, so the title and axis labels are set
# on that object rather than through the pyplot current-axes helpers.
balance_ax = sns.countplot(data=train_df, x="label")
balance_ax.set_title("Class Distribution in Training Set")
balance_ax.set_xlabel("Label (0 = Real, 1 = Fake)")
balance_ax.set_ylabel("Count")
plt.show()

# Same information as relative frequencies (fractions summing to 1).
label_share = train_df["label"].value_counts(normalize=True)
print(label_share)


## 7. Text length distribution

In [None]:
# Text length analysis: number of characters in each "text" entry.
# Missing entries are filled with "" first. A bare astype(str) turns NaN into
# the string "nan", which would report a length of 3 for rows with no text
# and skew the histogram and summary statistics below.
# NOTE: this adds a "text_length" column to train_df in place.
train_df["text_length"] = train_df["text"].fillna("").astype(str).str.len()

sns.histplot(train_df["text_length"], bins=50)
plt.title("Text Length Distribution")
plt.xlabel("Number of Characters")
plt.ylabel("Frequency")
plt.show()

# Last expression of the cell, so the notebook renders the summary statistics.
train_df["text_length"].describe()


## 8. Sample examples

In [None]:
# View sample real and fake news: the first two rows of each class, showing
# only the "title" and "text" columns (label 0 is shown as real, 1 as fake).
# The emoji are spelled as \N{...} escapes because the literal characters had
# been corrupted into mojibake; the escapes are ASCII-safe.
print("\N{NEWSPAPER} Example Real News")
display(train_df[train_df["label"] == 0][["title", "text"]].head(2))

print("\N{RECEIPT} Example Fake News")
display(train_df[train_df["label"] == 1][["title", "text"]].head(2))
