In [1]:
import os

In [2]:
pwd

'd:\\Data Science\\NLP\\SentimentAnalysis\\research'

In [3]:
os.chdir("../")


In [4]:
pwd

'd:\\Data Science\\NLP\\SentimentAnalysis'

In [5]:
import os
from pathlib import Path
from typing import Optional
import yaml
from dataclasses import dataclass

### b.	To be updated entity > constructor file (__init__.py)

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir: Path
    clean_data_path: Path
    tokenizer_path: str
    

### c.	To be updated config > configuration.py file

In [7]:
from SentimentAnalysis.constants import *
from SentimentAnalysis.utils.common import read_yaml
from SentimentAnalysis.utils.common import create_directories

In [8]:
class ConfigurationManager:
    
    def __init__(
                self,
                config_file_path = CONFIG_FILE_PATH,
                params_file_path = PARAMS_FILE_PATH,
                schema_filepath = SCHEMA_FILE_PATH):
            

            self.config = read_yaml(config_file_path)
            self.params = read_yaml(params_file_path)
            self.schema = read_yaml(schema_filepath)

            create_directories([self.config.dataStore_root])

    def get_data_transformation_config(self)-> DataTransformationConfig:
          config = self.config.data_transformation

          create_directories([config.root_dir])
          
          data_transformation_config = DataTransformationConfig(
                root_dir=config.root_dir,
                clean_data_path=config.clean_data_path,
                tokenizer_path=config.tokenizer_path  
          )     
          return data_transformation_config 

In [9]:
import os
from SentimentAnalysis.logging import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [10]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
import joblib

class DataTransformation:
    def __init__(self, config: DataTransformationConfig):
        self.config = config
        self.top_words = 10000
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
        self.max_review_length = 300

    def preprocess_and_pad_sequences(self):
        df = pd.read_csv(self.config.clean_data_path)
        
        ##Split the data into training and test sets ratio 80% and 20%
        train, test, y_train, y_test = train_test_split(df['review'],
                                                        df['sentiment'],
                                                        test_size=0.2,
                                                        random_state=42)
        
        #np.save(os.path.join(self.config.root_dir, "train.npy"), train)
        
        print("train data shape", train.shape, "test data shape", test.shape)
        logger.info(train.head())

        # Building the Tokenizer
        self.tokenizer.fit_on_texts(train.tolist())
        joblib.dump(self.tokenizer, os.path.join(self.config.root_dir, self.config.tokenizer_path))
           

        print("#"*100)

        logger.info(print("token",self.tokenizer.word_index))

        X_train = self.tokenizer.texts_to_sequences(train.tolist())
        X_test = self.tokenizer.texts_to_sequences(test.tolist())

        def find_longest_sequence_length(X):
            # Use list comprehension to calculate the lengths of all sequences
            sequence_lengths = [len(sequence) for sequence in X]

            # Find the maximum length among all the sequences
            max_length = max(sequence_lengths)

            return max_length

        # Call the function to find maximum sequence lengths for train and test data
        max_length_X_train = find_longest_sequence_length(X_train)
        max_length_X_test = find_longest_sequence_length(X_test)

        # Print the result
        logger.info(print("Maximum sequence length for X_train:", max_length_X_train))
        logger.info(print("Maximum sequence length for X_test:", max_length_X_test))

        # Pre Pad training and test reviews
        X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, self.max_review_length, padding='pre')
        X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, self.max_review_length, padding='pre')
        
        # Save preprocessed data as CSV files (optional, you can adjust this based on your requirements)
        np.save(os.path.join(self.config.root_dir, "X_train.npy"), X_train)
        y_train.to_csv(os.path.join(self.config.root_dir, "y_train"), index=False)
        
        np.save(os.path.join(self.config.root_dir, "X_text.npy"), X_test)
        y_test.to_csv(os.path.join(self.config.root_dir, "y_test"), index=False)

        print("transformed data")
        logger.info(X_train[0:1])
               

In [11]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.preprocess_and_pad_sequences()
    
except Exception as e:
    raise e


[2023-08-12 14:22:23,608: INFO: common: YAML file loaded successfully: config\config.yaml]
[2023-08-12 14:22:23,612: INFO: common: YAML file loaded successfully: params.yaml]
[2023-08-12 14:22:23,614: INFO: common: YAML file loaded successfully: schema.yaml]
[2023-08-12 14:22:23,616: INFO: common: Created directory at: dataStore]
[2023-08-12 14:22:23,617: INFO: common: Created directory at: dataStore/data_transformation]
train data shape (24000,) test data shape (6000,)
[2023-08-12 14:22:23,943: INFO: 2567612510: 21753    ever sit movi like one big wtf welcom decoy an...
251      ok bought film woolworth friend joke present b...
22941    hilari great juli pat magnific mr miyagi refer...
618      237 high school student commit suicid shown ta...
17090    film reappear channel 13 1990s seri comedi hol...
Name: review, dtype: object]
####################################################################################################
[2023-08-12 14:22:29,151: INFO: 2567612510: None]
Maximu