# Preprocessing
This notebook contains the preprocessing steps for the news articles dataset.

## Load the data

In [None]:
import pandas as pd

# Read the raw dataset from a relative path into a DataFrame
news_articles = pd.read_csv("../dataset/data.csv")

# Print the column dtypes and the (rows, columns) shape, then preview the first rows
print(news_articles.dtypes, news_articles.shape)
news_articles.head()

## Deeper data evaluation

In [None]:
# Count missing (NaN) entries per column
print(news_articles.isna().sum())

# Inspect the distinct values of the label and subject columns
print("Unique label values:", news_articles["label"].unique())
print("Unique subject values:", news_articles["subject"].unique())

# For each string column, collect the rows whose value is empty or whitespace-only
empty_title, empty_text, empty_subject, empty_date = (
    news_articles[news_articles[column].str.strip().eq('')]
    for column in ('title', 'text', 'subject', 'date')
)

print(f"Empty titles: {len(empty_title)}")
print(f"Empty texts: {len(empty_text)}")
print(f"Empty subjects: {len(empty_subject)}")
print(f"Empty dates: {len(empty_date)}")


# Count rows that repeat an earlier (label, title, text) combination
print(
    "Duplicated entries:",
    news_articles.duplicated(subset=["label", "title", "text"]).sum(),
)


## Data cleaning
### Basic data cleaning

In [None]:
from sklearn.preprocessing import LabelEncoder

# Remove rows that repeat an earlier (label, title, text) combination; the first occurrence is kept
news_articles = news_articles.drop_duplicates(subset=["label", "title", "text"])

# Remove articles whose text is missing or whitespace-only.
# fillna("") makes NaN texts count as empty instead of slipping through the comparison, and
# .copy() detaches the filtered frame so the column assignments below do not raise
# SettingWithCopyWarning.
has_text = news_articles["text"].fillna("").str.strip() != ""
news_articles = news_articles[has_text].copy()

# Encode the subject column as integer ids (pd.get_dummies would be the one-hot alternative)
le = LabelEncoder()
news_articles["subject"] = le.fit_transform(news_articles["subject"])

# Combine both text columns into one.
# The space separator keeps the last word of the title and the first word of the text from
# fusing into a single token; a missing title contributes an empty string instead of turning
# the whole combined value into NaN.
news_articles["title_text"] = news_articles["title"].fillna("") + " " + news_articles["text"]
news_articles = news_articles.drop(columns=["title", "text"])

news_articles.shape

### Handle dates

In [None]:
from datetime import datetime

# Strip leading and trailing whitespace from the date values so that padding does not
# prevent the exact format matching done in parse_date below
news_articles['date'] = news_articles['date'].str.strip()

def parse_date(date_text, formats=("%B %d, %Y", "%b %d, %Y", "%d-%b-%y")):
    """
    Attempts to parse a date string into a datetime object using multiple formats.
    The formats are tried in order and the first one that matches wins.
    If none of the formats match, returns pd.NaT to indicate a missing datetime.

    Parameters:
        date_text (str): The date string to parse. Non-string values (e.g. None or
            a float NaN coming from a missing cell) are treated as unparsable.
        formats (iterable of str): strptime format strings to try, in order.
            Defaults to the three formats handled by this notebook.

    Returns:
        datetime or pd.NaT (Not a time)
    """
    for date_format in formats:
        try:
            return datetime.strptime(date_text, date_format)
        except (ValueError, TypeError):
            # ValueError: the text does not match this format.
            # TypeError: the value is not a string at all.
            # Either way, try the next format.
            continue
    return pd.NaT

# Convert the date column to datetime values; unparsable entries become NaT
news_articles["date"] = news_articles["date"].apply(parse_date)

# Keep only the rows whose date could be parsed.
# .copy() detaches the filtered frame so the column assignments below do not raise
# SettingWithCopyWarning.
news_articles = news_articles[news_articles["date"].notna()].copy()

# Replace the date column with separate numeric year / month / day columns
news_articles["year"] = news_articles["date"].dt.year
news_articles["month"] = news_articles["date"].dt.month
news_articles["day"] = news_articles["date"].dt.day

news_articles = news_articles.drop(columns=["date"])

news_articles.head()

### Correlation Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation of the numeric columns only
correlation_matrix = news_articles.corr(numeric_only=True)

# Visualize the correlations as an annotated heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title("Correlation matrix")
plt.show()

In [None]:
# Number of articles per year, split by label
sns.countplot(x='year', hue='label', data=news_articles)
plt.title('Labels per year')
plt.show()

# Number of articles per subject, split by label
# (this plot has no year axis, so the title names the subject axis instead)
sns.countplot(x='subject', hue='label', data=news_articles)
plt.title('Labels per subject')
plt.show()

In [None]:
# According to the count plot, every subject occurs under exactly one label.
# The correlation matrix did not reveal this relationship, but the count plot did,
# so the subject column is removed.
news_articles.drop(columns="subject", inplace=True)

# Keep only the articles published in one of the accepted years
valid_years = [2016, 2017]
news_articles = news_articles.loc[news_articles["year"].isin(valid_years)]

news_articles.shape

## Train / Test Split
Perform train / test split before text preprocessing to avoid data leakage.<br />
This ensures that no information from the test set influences the preprocessing steps.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature columns from the target column
X = news_articles.drop(columns="label")
y = news_articles["label"]

# Hold out 20 % of the rows for testing; the fixed seed makes the shuffled split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

## Preprocess text columns

In [47]:
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()

# Lookup tables that do not depend on the input text. They are built once here instead of
# on every call, because the pipeline is applied to each row of the dataset.
_STOP_WORDS = frozenset(stopwords.words("english"))
_PUNCTUATION = frozenset(string.punctuation)


def text_preprocessing_pipeline(text):
    """
    Preprocesses a text string by applying standard NLP cleaning steps:
    lower-casing, tokenization, stop word removal, punctuation removal, and lemmatization.

    Parameters:
        text (str): The input text string to preprocess.

    Returns:
        str: A cleaned and lemmatized string with tokens joined by spaces.
    """
    # Tokenize the lower-cased text
    tokens = word_tokenize(text.lower())

    # Drop stop words and tokens made up entirely of punctuation characters.
    # The per-character check is deliberate: a substring test against string.punctuation
    # would miss multi-character punctuation tokens such as "..." or "--".
    kept_tokens = [
        token
        for token in tokens
        if token not in _STOP_WORDS
        and not all(char in _PUNCTUATION for char in token)
    ]

    # Lemmatize the remaining tokens and join them back into one string
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)

# Work on independent copies so that assigning the cleaned column does not raise
# SettingWithCopyWarning on the frames returned by the split.
X_train = X_train.copy()
X_test = X_test.copy()

# The cleaning function uses fixed stop word and punctuation lists and learns nothing from
# the data, so it is applied to both splits; otherwise the test texts would stay raw while
# the training texts are cleaned.
X_train["title_text"] = X_train["title_text"].apply(text_preprocessing_pipeline)
X_test["title_text"] = X_test["title_text"].apply(text_preprocessing_pipeline)

## Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the target variable (label)
print(news_articles['label'].value_counts())
sns.countplot(x='label', data=news_articles)
plt.title('Distribution of Labels (Fake vs Real)')
plt.show()

# Length of each combined title/text string, measured in characters (not tokens).
# It is kept in a standalone Series instead of being added to and then dropped from
# news_articles, so the shared frame is never modified just for this plot.
# The Series keeps the name 'text_length' so the x-axis label stays the same.
# news_articles is not reassigned by the text-cleaning cell, so these are the
# lengths of the uncleaned texts.
text_length = news_articles['title_text'].str.len().rename('text_length')

sns.histplot(text_length, bins=50)
plt.title('Distribution of Text Lengths')
plt.show()