In [12]:
%run PreProcessConfiguration.py # running config file

In [13]:
from sklearn.feature_extraction import text
import re
import pandas as pd
import string
import re
from num2words import num2words

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from configparser import ConfigParser
import pickle

import enchant

import spacy
# Load the spaCy 'en_core_web_sm' pipeline once, at import time.
# PreProcess.lemmatization reads this module-level `nlp` object.
nlp = spacy.load('en_core_web_sm')


class PreProcess():
    """Clean a CSV of labelled reviews according to a dictionary of switches.

    The CSV named by ``obj_dict['data']`` must contain the columns
    ``sentiment_class`` and ``text``. After construction the cleaned data is
    available as two parallel lists:

    * ``self.classes`` -- the class labels (lower-cased when they are strings
      and any lower-casing step ran);
    * ``self.reviews`` -- the cleaned reviews, as strings, or as token lists
      when ``obj_dict['tokenization'] == 'True'``.

    Every switch in ``obj_dict`` is the *string* ``'True'`` to enable a step;
    any other value disables it.
    """

    def __init__(self,obj_dict):
        """Load the data and run every enabled pre-processing step.

        Args:
            obj_dict: mapping with the keys ``data`` (CSV path), ``appos``
                (contraction -> expansion mapping, only read when negation
                handling is enabled) and the switches ``negation_handling``,
                ``remove_punctuation``, ``number_removal``,
                ``numbers_to_names``, ``stop_words_removal``,
                ``gibberish_word_removal``, ``lemmatization``,
                ``convert_to_lowercase`` and ``tokenization``.

        Raises:
            Exception: whatever ``pd.read_csv`` / the ``obj_dict`` lookup
                raised; a hint is printed first.
        """
        try:
            self.data = pd.read_csv(obj_dict['data'], encoding='latin1', header=0)
        except Exception:
            # Print the hint, then re-raise: continuing without self.data would
            # only fail a line later with an unrelated AttributeError.
            print('Run first pre-processing configuration file (PreProcessConfiguration.py)')
            raise

        print(f"Original:{self.data.shape}")
        dd = self.data.drop_duplicates().reset_index(drop=True)
        print(f"Drop Dupicates:{dd.shape}")
        df = dd.dropna().reset_index(drop=True)
        print(f"Drop Nulls: {df.shape}")

        # Flags that make the individual steps idempotent.
        self.lowercase = False
        self.tokenized = False
        self.punctuations = False
        self.numberRemoval = False
        self.classes = list(df['sentiment_class'])
        self.reviews = list(df['text'])

        # Negation handling needs the contraction table, so it is dispatched
        # separately; it always runs first.
        if obj_dict['negation_handling'] == 'True':  # converts dont to do not
            self.negation_handling(obj_dict['appos'])

        # Remaining optional steps, in the order they must be applied.
        steps = (
            ('remove_punctuation', self.remove_punctuations),          # removes #!~ kind of literals
            ('number_removal', self.number_removal),                   # removes numbers from text
            ('numbers_to_names', self.numbers_to_names),               # converts 10 to ten
            ('stop_words_removal', self.stop_words_removal),           # removes stop words
            ('gibberish_word_removal', self.gibberish_word_removal),   # removes non-english, non-noun words
            ('lemmatization', self.lemmatization),                     # lemmatization of words
            ('convert_to_lowercase', self.convert_to_lowercase),       # converting to lowercase
        )
        for option, step in steps:
            if obj_dict[option] == 'True':
                step()

        # Collapse every run of whitespace into a single space.
        self.reviews = [re.sub(r"\s+", " ", review) for review in self.reviews]

        # Drop one-letter tokens, then go back to plain strings.
        self.tokenize_text()
        self.reviews = [
            ' '.join(token for token in tokens if len(token) > 1)
            for tokens in self.reviews
        ]
        self.tokenized = False

        if obj_dict['tokenization'] == 'True':  # tokenization
            self.tokenize_text()

    def gibberish_word_removal(self):
        """Blank out words that are neither proper nouns nor in the en_US dictionary.

        A word is kept when it is numeric, when the POS tagger labels it
        ``NNP``/``NNPS`` anywhere in the review, or when the enchant en_US
        dictionary accepts it. Every other word is replaced by a space.
        """
        standard_dict = enchant.Dict("en_US")
        tokenizer = nltk.RegexpTokenizer(r"\w+")

        for i, review in enumerate(self.reviews):
            # Collect proper nouns from *every* sentence of the review, so a
            # multi-sentence (or empty) review never relies on stale tags.
            proper_nouns = set()
            for sentence in sent_tokenize(review):
                for token, tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
                    if tag in ('NNP', 'NNPS'):
                        proper_nouns.add(token)

            gibberish = {
                word
                for word in tokenizer.tokenize(review)
                if not word.isnumeric()
                and word not in proper_nouns
                and not standard_dict.check(word)
            }
            if gibberish:
                # Match whole words only: a plain str.replace would also cut
                # the gibberish out of longer, valid words that contain it.
                pattern = re.compile(
                    r'(?<!\w)(?:' + '|'.join(map(re.escape, gibberish)) + r')(?!\w)'
                )
                self.reviews[i] = pattern.sub(' ', review)

        self.tokenized = False

    def number_removal(self):
        """Replace every word that contains a digit with a space (runs once)."""
        if not self.numberRemoval:
            has_digit = re.compile(r'\w*\d\w*')
            self.reviews[:] = [has_digit.sub(' ', review) for review in self.reviews]
            self.numberRemoval = True

    def tokenize_text(self):
        """Turn every review string into a list of NLTK word tokens (runs once
        until another step resets ``self.tokenized``)."""
        if not self.tokenized:
            self.reviews[:] = [word_tokenize(review) for review in self.reviews]
            self.tokenized = True

    def negation_handling(self,appos):
        """Expand contractions, e.g. ``dont`` -> ``do not``.

        The reviews are lower-cased and stripped of punctuation first, so keys
        of ``appos`` only match if they are lower-case and punctuation-free.

        Args:
            appos: mapping from contraction to its expansion.
        """
        self.convert_to_lowercase()
        self.remove_punctuations()

        # Whole-word patterns: "cant" must not fire inside "significant", and
        # must still fire when it is the last word of the review.
        substitutions = [
            (key, re.compile(r'(?<!\w)' + re.escape(key) + r'(?!\w)'), value)
            for key, value in appos.items()
            if key
        ]
        for i, review in enumerate(self.reviews):
            for key, pattern, value in substitutions:
                # Cheap substring test first; the regex only runs on candidates.
                if key in review:
                    # A callable replacement keeps backslashes in `value` literal.
                    review = pattern.sub(lambda _match, value=value: value, review)
            self.reviews[i] = review

    def convert_to_lowercase(self):
        """Lower-case the reviews and the (string) class labels (runs once)."""
        if not self.lowercase:
            # Numeric class labels have no .lower(); leave them untouched.
            self.classes[:] = [
                label.lower() if isinstance(label, str) else label
                for label in self.classes
            ]
            self.reviews[:] = [review.lower() for review in self.reviews]
            self.lowercase = True

    def remove_punctuations(self):
        """Delete every ASCII punctuation character (``string.punctuation``)
        from the reviews (runs once)."""
        if not self.punctuations:
            table = str.maketrans('', '', string.punctuation)
            self.reviews[:] = [review.translate(table) for review in self.reviews]
            self.punctuations = True

    def numbers_to_names(self):
        """Convert each run of digits to its English name, e.g. ``10`` -> ``ten``.

        Skipped when ``number_removal`` already ran. Numbers that num2words
        cannot convert are left unchanged and reported on stdout.
        """
        if self.numberRemoval:
            return

        self.convert_to_lowercase()
        digits = re.compile(r'\d+')
        for i, review in enumerate(self.reviews):
            # Substitute match by match: replacing "1" globally before "10" is
            # handled would corrupt the longer number.
            self.reviews[i] = digits.sub(
                lambda match, i=i: self._number_to_name(match.group(0), i),
                review,
            )

    @staticmethod
    def _number_to_name(number, review_index):
        """Return the spelled-out form of ``number`` or, on failure, ``number`` itself."""
        try:
            return num2words(number, lang='en_IN').replace('-', ' ')
        except Exception:
            # Best effort: keep the digits, but never swallow KeyboardInterrupt.
            print(f'for {review_index} review number exceeded that limit of abs')
            return number

    def stop_words_removal(self):
        """Remove stop words, keeping those that can carry sentiment.

        Uses scikit-learn's English stop-word list plus a few URL fragments,
        minus negations and similar words listed in ``exempt``. Leaves the
        reviews as strings with a trailing space.
        """
        self.convert_to_lowercase()
        self.tokenize_text()

        # exclude the words from stop words which may affect sentiment of a sentence
        exempt = {'against', 'below', 'cannot', 'cant', 'couldnt', 'cry', 'fire',
                  'hasnt', 'never', 'not', 'nobody', 'nor', 'nothing', 'under',
                  'no', 'off', 'than'}
        # include words which may be custom to your dataset
        custom = {'url', 'https', 'http', 'com'}
        # A set gives O(1) membership tests in the per-token filter below.
        stop_words = (set(text.ENGLISH_STOP_WORDS) | custom) - exempt

        self.reviews[:] = [
            ' '.join(w for w in tokens if w not in stop_words and w != ' ') + ' '
            for tokens in self.reviews
        ]
        self.tokenized = False

    def lemmatization(self):
        """Replace every token by its spaCy lemma (lower-cases the text first)."""
        self.convert_to_lowercase()
        for i, review in enumerate(self.reviews):
            self.reviews[i] = " ".join(token.lemma_ for token in nlp(review))

        self.tokenized = False
      
    
        
if __name__ == '__main__':
    # Script entry point: read the switches from PreProcess.ini, run the
    # pre-processing once, and write <data stem>_PreProcessed.csv next to the
    # input. Nothing is recomputed when that output file already exists.
    import os

    config = ConfigParser()
    config.read('PreProcess.ini')

    # NOTE(security): pickle.load can execute arbitrary code; only use an
    # appos.pkl that comes from a trusted source.
    with open('appos.pkl', 'rb') as pfile:
        apos = pickle.load(pfile)

    # Options copied verbatim (as strings) from the [Data] section.
    option_names = (
        "data",
        "negation_handling",
        "remove_punctuation",
        "numbers_to_names",
        "stop_words_removal",
        "lemmatization",
        "convert_to_lowercase",
        "tokenization",
        "number_removal",
        "gibberish_word_removal",
    )
    obj_dict = {name: config['Data'][name] for name in option_names}
    obj_dict['appos'] = apos

    # splitext strips only the final extension, so paths such as
    # "./reviews.csv" keep their directory and stem intact.
    output_path = os.path.splitext(obj_dict["data"])[0] + '_PreProcessed.csv'

    if not os.path.exists(output_path):
        obj = PreProcess(obj_dict)
        # Build the frame column by column: tokenized reviews (lists) then stay
        # in a single 'text' column instead of spreading over many columns.
        x = pd.DataFrame({'sentiment_class': obj.classes, 'text': obj.reviews})
        x.to_csv(output_path, index=False)

Original:(24999, 2)
Drop Dupicates:(24901, 2)
Drop Nulls: (24901, 2)


KeyboardInterrupt: 