In [1]:
import os

In [2]:
pwd

'd:\\Data Science\\NLP\\SentimentAnalysis\\research'

In [3]:
os.chdir("../")


In [4]:
pwd

'd:\\Data Science\\NLP\\SentimentAnalysis'

In [5]:
import os
from pathlib import Path
from typing import Optional
import yaml
from dataclasses import dataclass

### b.	To be updated entity > constructor file (__init__.py)

In [6]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class EDAdataCleanerConfig:
    root_dir: Path
    data_path: Path   

### c.	To be updated config > configuration.py file

In [7]:
from SentimentAnalysis.constants import *
from SentimentAnalysis.utils.common import read_yaml
from SentimentAnalysis.utils.common import create_directories

In [8]:
class ConfigurationManager:
    
    def __init__(
                self,
                config_file_path = CONFIG_FILE_PATH,
                params_file_path = PARAMS_FILE_PATH,
                schema_filepath = SCHEMA_FILE_PATH):
            

            self.config = read_yaml(config_file_path)
            self.params = read_yaml(params_file_path)
            self.schema = read_yaml(schema_filepath)

            create_directories([self.config.dataStore_root])

    def get_data_clean_config(self)-> EDAdataCleanerConfig:
          config = self.config.EDA_DataCleaner

          create_directories([config.root_dir])
          
          data_cleaner_config = EDAdataCleanerConfig(
                root_dir=config.root_dir,
                data_path=config.data_path,
                 
          )     
          return data_cleaner_config 

In [9]:
import os
from SentimentAnalysis.logging import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import SnowballStemmer
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
nltk.download('stopwords')
nltk.download('punkt')

import tensorflow as tf

##Setup the English stopwords
stopwords_list = stopwords.words('english')
nltk.download('wordnet')


class DataCleaner:
    def __init__(self, config: EDAdataCleanerConfig):
        self.config = config
        self.tokenizer = ToktokTokenizer()
        self.stopwords_list = set(stopwords.words('english'))
        self.stemmer = SnowballStemmer(language='english')
        self.lb = LabelBinarizer()

    def html(self, text):
        # Remove HTML tags from the text
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    def deEmojify(self, text):
        # Remove emojis from the text
        regrex_pattern = re.compile(pattern="["
                                      u"\U0001F600-\U0001F64F"  # emoticons
                                      u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                      u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                      u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                      "]+", flags=re.UNICODE)
        return regrex_pattern.sub(r'', text)

    def to_unicode(self, text):
        # Convert text to unicode
        if isinstance(text, float):
            text = str(text)
        if isinstance(text, int):
            text = str(text)
        if not isinstance(text, str):
            text = text.decode('utf-8', 'ignore')
        return text

    def remove_between_square_brackets(self, text):
        # Remove text between square brackets
        return re.sub(r'\[[^]]*\]', '', text)

    def remove_special_characters(self, text, remove_digits=True):
        # Remove special characters and optionally digits
        pattern = r'[^a-zA-z0-9\s]' if remove_digits else r'[^a-zA-z\s]'
        text = re.sub(pattern, '', text)
        return text

    def denoise_text(self, text):
        # Denoise the text using multiple cleaning steps
        text = self.to_unicode(text)
        text = self.html(text)
        text = re.sub(r"http\S+", "", text)
        text = self.deEmojify(text)
        #text = text.encode('ascii', 'ignore')
        #text = text.to_unicode(text)
        text = self.remove_between_square_brackets(text)
        text = self.remove_special_characters(text)
        text = text.lower()
        return text

    def remove_stopwords(self, text, is_lower_case=False):
        # Remove stopwords from the text
        tokens = self.tokenizer.tokenize(text)
        tokens = [token.strip() for token in tokens]
        if is_lower_case:
            filtered_tokens = [token for token in tokens if token not in self.stopwords_list]
        else:
            filtered_tokens = [token for token in tokens if token.lower() not in self.stopwords_list]
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text

    def apply_stemming(self, review):
        # Apply stemming to the text
        return ' '.join([self.stemmer.stem(word) for word in self.tokenizer.tokenize(review)])

    
    def binarize_sentiment(self, sentiment):
        # Binarize sentiment labels (Positive: 1, Negative: 0)
        return self.lb.fit_transform(sentiment)

    def convert_examples_to_features(self):
        data = pd.read_csv(self.config.data_path)
        # Assuming example_batch is a DataFrame with 'review' and 'sentiment' columns
        data['review'] = data['review'].apply(self.denoise_text)
        data['review'] = data['review'].apply(self.remove_stopwords)
        data['review'] = data['review'].apply(self.apply_stemming)
        data['sentiment'] = self.binarize_sentiment(data['sentiment'])

        
        data.to_csv(os.path.join(self.config.root_dir, "data.csv"), index=False)
        logger.info(data.shape)
        print(data.shape)
        logger.info(data.head)
        return data

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shanusingh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shanusingh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shanusingh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
try:
    config = ConfigurationManager()
    data_cleaner_config = config.get_data_clean_config()
    data_cleaner = DataCleaner(config=data_cleaner_config)
    data_cleaner.convert_examples_to_features()
        
except Exception as e:
    raise e


[2023-08-12 09:32:00,134: INFO: common: YAML file loaded successfully: config\config.yaml]
[2023-08-12 09:32:00,134: INFO: common: YAML file loaded successfully: params.yaml]
[2023-08-12 09:32:00,142: INFO: common: YAML file loaded successfully: schema.yaml]
[2023-08-12 09:32:00,142: INFO: common: Created directory at: dataStore]
[2023-08-12 09:32:00,142: INFO: common: Created directory at: dataStore/EDA_DataCleaner]


  soup = BeautifulSoup(text, "html.parser")


[2023-08-12 09:33:10,276: INFO: 1480734748: (30000, 2)]
(30000, 2)
[2023-08-12 09:33:10,276: INFO: 1480734748: <bound method NDFrame.head of                                                   review  sentiment
0      one review mention watch 1 oz episod youll hoo...          1
1      wonder littl product film techniqu unassum old...          1
2      thought wonder way spend time hot summer weeke...          1
3      basic there famili littl boy jake think there ...          0
4      petter mattei love time money visual stun film...          1
...                                                  ...        ...
29995  new york love final make shore 10 short stori ...          1
29996  movi make wish imdb would let vote zero one tw...          0
29997  space camp unfortun luck plan around time chal...          0
29998  octavio paz mexican poet writer diplomat recei...          1
29999  watch 10 minut movi bewild watch 30 minut toe ...          0

[30000 rows x 2 columns]>]
