In [1]:
import os
import numpy as np
import pandas as pd


In [2]:
%pwd


'c:\\Users\\Achraf\\Desktop\\MASTER\\Python Avancé\\Mini-Projet\\mlops-fakenews-text-classification\\research'

In [3]:
# Step up one directory (per the %pwd outputs: from research/ to the project root)
# so the relative config/artifact paths used later resolve from there.
# NOTE: not idempotent - every re-run of this cell moves one more level up.
os.chdir("../")


In [4]:
%pwd


'c:\\Users\\Achraf\\Desktop\\MASTER\\Python Avancé\\Mini-Projet\\mlops-fakenews-text-classification'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings for the data preprocessing stage.

    Instances are built by ``ConfigurationManager.get_data_preprocessing_config``
    and consumed by ``DataPreprocessing``.
    """
    root_dir: Path  # stage output directory; created by the configuration manager
    fake_news_path: Path  # CSV of fake articles (read with pd.read_csv, labelled 1)
    true_news_path: Path  # CSV of true articles (read with pd.read_csv, labelled 0)
    processed_x_train: Path  # where the training feature matrix is saved
    processed_y_train: Path  # where the training labels are saved
    processed_x_test: Path  # where the test feature matrix is saved
    processed_y_test: Path  # where the test labels are saved
    tfidf_vectoriser: Path  # where the fitted TfidfVectorizer is saved
    test_size: float  # forwarded to train_test_split(test_size=...)
    random_state: int  # seed used for both the shuffle and the train/test split
    stratify: bool  # when truthy, the split is stratified on the labels

In [6]:
from fakeNewsClassifier.constants import *
from fakeNewsClassifier.utils.common import read_yaml, create_directories

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Achraf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Achraf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
class ConfigurationManager:
    """Reads the project's YAML config and params files and builds stage configs."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """
        Load both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_preprocessing_config(self) -> DataPreprocessingConfig:
        """
        Build the settings object for the data preprocessing stage.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            DataPreprocessingConfig: Paths taken from the ``data_preprocessing``
            section of the config file plus the split settings from the params file.
        """
        stage_section = self.config.data_preprocessing
        split_params = self.params

        create_directories([stage_section.root_dir])

        # Path-like entries are copied one-to-one from the config section.
        path_field_names = (
            "root_dir",
            "fake_news_path",
            "true_news_path",
            "processed_x_train",
            "processed_y_train",
            "processed_x_test",
            "processed_y_test",
            "tfidf_vectoriser",
        )
        path_values = {
            field_name: getattr(stage_section, field_name)
            for field_name in path_field_names
        }

        return DataPreprocessingConfig(
            **path_values,
            test_size=split_params.test_size,
            random_state=split_params.random_state,
            stratify=split_params.stratify,
        )

In [8]:
import os
import urllib.request as request
from zipfile import ZipFile
from sklearn.linear_model import LogisticRegression
import joblib
from fakeNewsClassifier.utils.common import *

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from typing import Tuple

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

from fakeNewsClassifier.utils.common import *
import warnings
warnings.filterwarnings('ignore')



In [12]:
class DataPreprocessing:
    """Turn the raw fake/true news CSV files into TF-IDF train/test artifacts.

    ``preprocess_data`` is the public entry point; the underscore-prefixed
    methods are the individual steps it chains together.
    """

    # Upper bound on the TF-IDF vocabulary size (TfidfVectorizer(max_features=...)).
    MAX_TFIDF_FEATURES = 5000

    def __init__(self, config: DataPreprocessingConfig):
        """
        Store the stage configuration.

        Args:
            config (DataPreprocessingConfig): Input/output paths and split settings.
        """
        self.config = config

    def preprocess_data(self) -> None:
        """
        Run the whole stage: load, vectorise, split and save.

        The four splits and the fitted vectoriser are handed to ``save_bin``
        together with the output paths taken from the configuration.

        Raises:
            Exception: Any error raised by a step is logged and re-raised unchanged.
        """
        try:
            # Read raw data
            logger.info("Reading raw data")
            df = self._load_data()

            # Apply preprocessing steps
            logger.info("Applying preprocessing steps")
            X, y, tfidf = self._apply_preprocessing(df)

            # Split data
            logger.info("Splitting data into train and test sets")
            X_train, X_test, y_train, y_test = self._split_data(X, y)

            # Save processed data
            save_bin(X_train, Path(self.config.processed_x_train))
            save_bin(X_test, Path(self.config.processed_x_test))
            save_bin(y_train, Path(self.config.processed_y_train))
            save_bin(y_test, Path(self.config.processed_y_test))
            save_bin(tfidf, Path(self.config.tfidf_vectoriser))

            logger.info("Data preprocessing completed successfully")

        except Exception as e:
            logger.error(f"Error in data preprocessing: {str(e)}")
            raise

    def _load_data(self) -> pd.DataFrame:
        """
        Load and combine fake and true news data

        Fake articles are labelled ``class = 1`` and true articles ``class = 0``.
        Rows without ``text`` are dropped and the result is shuffled with the
        configured ``random_state``.

        Returns:
            pd.DataFrame: Combined, shuffled raw data with a ``class`` label column

        Raises:
            Exception: Any read/combination error is logged and re-raised.
        """
        try:
            logger.info(f"Loading fake news data from {self.config.fake_news_path}")
            fake_df = pd.read_csv(self.config.fake_news_path)
            logger.info(f"Loaded {len(fake_df)} fake news articles")

            # Add label for fake news (1)
            fake_df["class"] = 1

            logger.info(f"Loading true news data from {self.config.true_news_path}")
            true_df = pd.read_csv(self.config.true_news_path)
            logger.info(f"Loaded {len(true_df)} true news articles")

            # Add label for true news (0)
            true_df["class"] = 0

            # Combine datasets once, with a fresh 0..n-1 index
            combined_df = pd.concat([fake_df, true_df], ignore_index=True)
            logger.info(f"Combined dataset with {len(combined_df)} articles")

            # Drop rows with missing text
            combined_df = combined_df.dropna(subset=['text'])
            logger.info("Delete NaN rows")

            # Shuffle data
            shuffled_df = combined_df.sample(frac=1, random_state=self.config.random_state).reset_index(drop=True)
            logger.info("Dataset shuffled")

            return shuffled_df

        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise

    def _apply_preprocessing(self, df: pd.DataFrame) -> Tuple[object, pd.Series, TfidfVectorizer]:
        """
        Clean the article text and turn it into TF-IDF features

        The input frame is not modified; all intermediate text lives in
        separate Series.

        Args:
            df (pd.DataFrame): Raw data with ``title``, ``text`` and ``class`` columns

        Returns:
            Tuple: ``(X, y, tfidf)`` where ``X`` is the matrix returned by
            ``TfidfVectorizer.fit_transform``, ``y`` is the ``class`` column and
            ``tfidf`` is the fitted vectoriser

        Raises:
            Exception: Any error is logged and re-raised.
        """
        try:
            # Fill nulls first so the string operations below never see NaN
            text = df['text'].fillna('')
            title = df['title'].fillna('')

            # Strip the "(Reuters)" marker from true-news rows only (class 0)
            is_true_news = df["class"] == 0
            text.loc[is_true_news] = text.loc[is_true_news].str.replace(r"\(Reuters\)", "", regex=True)

            # Combine title and text, then normalise with the shared clean_text helper
            content = title + ' ' + text
            cleaned_text = content.apply(clean_text)

            # NOTE(review): the vectoriser is fitted on the full corpus before the
            # train/test split, so test documents influence the vocabulary and IDF
            # weights. Consider splitting first and fitting on the training part only.
            tfidf = TfidfVectorizer(max_features=self.MAX_TFIDF_FEATURES)
            X = tfidf.fit_transform(cleaned_text)
            y = df['class']

            # Return processed data
            return X, y, tfidf

        except Exception as e:
            logger.error(f"Error in preprocessing: {str(e)}")
            raise

    def _split_data(self, X , y: pd.Series) -> Tuple[object, object, pd.Series, pd.Series]:
        """
        Split data into train and test sets

        Args:
            X: Feature matrix, one row per article
            y (pd.Series): Labels aligned with the rows of ``X``

        Returns:
            Tuple: ``(X_train, X_test, y_train, y_test)``

        Raises:
            Exception: Any error is logged and re-raised.
        """
        try:
            # stratify=None is train_test_split's default, i.e. a plain random split
            stratify_labels = y if self.config.stratify else None

            X_train, X_test, y_train, y_test = train_test_split(
                X, y,
                test_size=self.config.test_size,
                stratify=stratify_labels,
                random_state=self.config.random_state
            )

            return X_train, X_test, y_train, y_test

        except Exception as e:
            logger.error(f"Error in splitting data: {str(e)}")
            raise

In [13]:
# Run the preprocessing stage end to end.
# No try/except wrapper: a handler that only re-raises adds nothing, and
# failures inside preprocess_data are already logged by DataPreprocessing.
config = ConfigurationManager()
data_preprocessing_config = config.get_data_preprocessing_config()
data_preprocessing = DataPreprocessing(config=data_preprocessing_config)
data_preprocessing.preprocess_data()

[2025-05-12 21:20:18,727: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-12 21:20:18,758: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-12 21:20:18,767: INFO: common: created directory at: artifacts]
[2025-05-12 21:20:18,778: INFO: common: created directory at: artifacts/data_preprocessing]
[2025-05-12 21:20:18,783: INFO: 4076787911: Reading raw data]
[2025-05-12 21:20:18,787: INFO: 4076787911: Loading fake news data from artifacts/data_ingestion/Fake.csv]
[2025-05-12 21:20:20,345: INFO: 4076787911: Loaded 23481 fake news articles]
[2025-05-12 21:20:20,386: INFO: 4076787911: Loading true news data from artifacts/data_ingestion/True.csv]
[2025-05-12 21:20:21,242: INFO: 4076787911: Loaded 21417 true news articles]
[2025-05-12 21:20:21,277: INFO: 4076787911: Combined dataset with 44898 articles]
[2025-05-12 21:20:21,322: INFO: 4076787911: Delete NaN rows]
[2025-05-12 21:20:21,344: INFO: 4076787911: Dataset shuffled]
[2025-05-12 21:20:21,349: