In [4]:
import os




[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
%pwd

'c:\\Users\\ankit.rohilla\\Documents\\fake_news_classification\\research'

In [6]:
os.chdir("../")

In [7]:
%pwd

'c:\\Users\\ankit.rohilla\\Documents\\fake_news_classification'

In [8]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Attributes:
        root_dir: Directory for this stage; the notebook creates it and
            writes the pickled encodings into it.
        data_path: Directory that is joined with ``Fake.csv`` and
            ``True.csv`` when the raw data is loaded.
        tokenizer_name: Value handed to ``BertTokenizer.from_pretrained``.
            NOTE(review): annotated as ``Path`` but presumably a model
            name string coming from the YAML config -- confirm.
    """

    # Output directory of the stage.
    root_dir: Path
    # Input directory holding the raw CSV files.
    data_path: Path
    # Identifier (or path) of the pretrained tokenizer.
    tokenizer_name: Path


In [9]:
from text_classifier.constants import *
from text_classifier.utils.common import read_yaml, create_directories
import os
import pandas as pd
import torch
from tensorflow.python.ops.numpy_ops import np_utils
from transformers import BertModel, BertTokenizer
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
class ConfigurationManager:
    """Loads the YAML config/params files and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The root output directory is created as soon as the manager exists.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` filled from the
            ``data_transformation`` section of the loaded config.
        """
        stage_cfg = self.config.data_transformation

        create_directories([stage_cfg.root_dir])

        return DataTransformationConfig(
            root_dir=stage_cfg.root_dir,
            data_path=stage_cfg.data_path,
            tokenizer_name=stage_cfg.tokenizer_name,
        )


In [11]:
import os
from text_classifier.logging import logger
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import pickle

In [12]:
class DataTransformation:
    """Turns the raw Fake/True news CSVs into tokenized train/test pickles."""

    # Single seed shared by the shuffle and the split so that re-running the
    # notebook produces the same train/test partition.
    RANDOM_STATE = 42

    def __init__(self, config: "DataTransformationConfig"):
        """Store the config and load the tokenizer named in it.

        Args:
            config: Stage settings; ``tokenizer_name`` is passed to
                ``BertTokenizer.from_pretrained``.
        """
        self.config = config
        self.tokenizer = BertTokenizer.from_pretrained(config.tokenizer_name)

    @staticmethod
    def _combine_labelled_frames(fake_df, true_df, seed=42):
        """Label, merge, de-duplicate and shuffle the two raw frames.

        Args:
            fake_df: Frame of fake-news rows; must contain ``text`` and ``title``.
            true_df: Frame of true-news rows; must contain ``text`` and ``title``.
            seed: Seed for the row shuffle.

        Returns:
            A new frame with a fresh 0..n-1 index in which ``text`` is the
            article text followed by a space and the title, and ``label`` is
            0 for fake rows and 1 for true rows. The inputs are not modified.
        """
        # assign() returns copies, so the caller's frames stay untouched.
        labelled = [fake_df.assign(label=0), true_df.assign(label=1)]

        # ignore_index avoids materialising the old per-file index as a column;
        # that extra column made every row unique and turned drop_duplicates()
        # into a no-op.
        df = pd.concat(labelled, ignore_index=True)
        df = df.drop_duplicates().reset_index(drop=True)

        # Missing text/title would make the concatenation NaN, which the
        # tokenizer cannot consume; treat missing values as empty strings.
        df = df.assign(text=df['text'].fillna('') + " " + df['title'].fillna(''))

        # drop() returns a new frame, so the result has to be kept.
        # errors='ignore' tolerates inputs that lack some of these columns.
        df = df.drop(columns=['title', 'subject', 'date'], errors='ignore')

        return df.sample(frac=1, random_state=seed).reset_index(drop=True)

    def transform_data(self):
        """Load ``Fake.csv``/``True.csv`` from ``data_path`` and split them.

        Returns:
            Tuple ``(X_train, X_test, Y_train, Y_test)`` from a stratified
            75/25 ``train_test_split`` over the ``text`` and ``label`` columns.
        """
        fake_df = pd.read_csv(os.path.join(self.config.data_path, 'Fake.csv'), encoding='UTF-8')
        true_df = pd.read_csv(os.path.join(self.config.data_path, 'True.csv'), encoding='UTF-8')

        df = self._combine_labelled_frames(fake_df, true_df, seed=self.RANDOM_STATE)

        X_train, X_test, Y_train, Y_test = train_test_split(
            df['text'],
            df['label'],
            stratify=df['label'],
            test_size=0.25,
            random_state=self.RANDOM_STATE,
        )
        return (X_train, X_test, Y_train, Y_test)

    def convert_examples_to_features(self, X):
        """Tokenize an iterable of texts with the configured tokenizer.

        Args:
            X: Iterable of strings; it is materialised with ``list(X)``.

        Returns:
            The tokenizer's output for the texts, requested as TensorFlow
            tensors padded/truncated to 120 tokens, with an attention mask
            and without token type ids.
        """
        X = self.tokenizer(
            text = list(X),
            add_special_tokens = True,
            max_length = 120,
            truncation = True,
            padding = 'max_length',
            return_tensors = 'tf',
            return_token_type_ids = False,
            return_attention_mask = True,
            verbose = True
            )
        return X

    def convert(self):
        """Run the full stage and pickle the encodings into ``root_dir``.

        Writes ``train_encodings.pkl`` (keys ``X_train``, ``y_train``) and
        ``test_encodings.pkl`` (keys ``X_test``, ``y_test``).
        """
        X_train, X_test, Y_train, Y_test = self.transform_data()
        train_encoding = self.convert_examples_to_features(X_train)
        test_encoding = self.convert_examples_to_features(X_test)
        train_encodings = {
            'X_train': train_encoding,
            'y_train': Y_train
        }
        test_encodings = {
            'X_test': test_encoding,
            'y_test': Y_test
        }

        # Save each dictionary to its own file.
        with open(os.path.join(self.config.root_dir,"train_encodings.pkl"), 'wb') as f:
            pickle.dump(train_encodings, f)
        with open(os.path.join(self.config.root_dir,"test_encodings.pkl"), 'wb') as f:
            pickle.dump(test_encodings, f)


In [13]:
# Run the data-transformation stage end to end.
# No try/except wrapper: catching an exception only to re-raise it unchanged
# adds nothing, and the notebook already shows the full traceback on failure.
config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config=data_transformation_config)
data_transformation.convert()

[2023-09-06 22:00:57,417: INFO: common: yaml file: config\config.yaml loaded successfully]
[2023-09-06 22:00:57,417: INFO: common: yaml file: params.yaml loaded successfully]
[2023-09-06 22:00:57,425: INFO: common: created directory at: artifacts]
[2023-09-06 22:00:57,425: INFO: common: created directory at: artifacts/data_transformation]
