In [None]:
!pip install contractions
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer, WordNetLemmatizer
import contractions 
from collections import defaultdict	
import numpy as np
import pandas as pd
import math


Collecting contractions
  Downloading contractions-0.1.66-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 11.7 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.2.tar.gz (321 kB)
[K     |████████████████████████████████| 321 kB 28.1 MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp37-cp37m-linux_x86_64.whl size=85435 sha256=d6aad7f07f7cd45acd8e0dc120d18913372bc59d7e4fd66d634b07bba5684b7d
  Stored in directory: /root/.cache/pip/wheels/25/19/a6/8f363d9939162782bb8439d886469756271abc01f76fbd790f
Successfully built pyahocorasick
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully instal

In [None]:
from sys import base_prefix


class Text_preprocessor:
 """
    A class to preprocess text for NLP Application.

    ...

    Attributes
    ----------
    spec_filtered : list
      list with special character removed
    rm_stopwrds : list
      list with stopwords removed
    clean : list
      list after all text preprocessing
    index_word : dict
      dict of indexed words
    bag_of_words : list
      list of words after preprocessing document
    corpus_words: list
      list of all words in corpus




    Methods
    -------
    expand_contraction(text=""):
        returns the expanded text.

    remove_special_characters(text="")
        returns text with removed emailaddress, special characters and numbers

    tokenize(text=")
        returns list of words from text
    
    removal_stop_words(token=[],language='english')
        stop words are derived from  nltk.corpus
        returns the list with removed stopwords for english language
    
    stem_or_lem(token=[],method="stemm")
        return the list after lemmitization or stemmization depending upon 
        method argument
    
    preprocessed_text(text="")
        returns list of words after performing
        contaraction, removal of special characters, tokenization, removal of
        stop word, stemmization and lemmitization

    bow(documents)
        returns bag of words for a document(sentence)

    get_corpus_words(text)
        return list of all  words of corpus 

    get_doc_list
        return list of document after text-preprocessing

    get_word_dict
        returns key value pair for words in document

    computeTF
        return dict of computed TF for documents

    computeTFIDF
        returns dict of IDFS for words in corpus

    computeTFIDF
        returns dict of computed TFIDF for words in documents
    """

 
 def __init__(self):
   self.spec_filtered = []
   self.rm_stopwrds = []
   self.clean = []
   self.index_word = {}
   self.bag_of_words = []
   self.corpus_words = []

 def expand_contraction(self, text:str)->str:
    '''
    Expands the words in text with contractions module.

        Parameters
        ----------
        text : str,
            text to be expanded

        Returns
        -------
        text with expanded words 
    '''
    # create an empty list
    expanded_words = []    
    for word in text.split():
      # using contractions.fix to expand the shotened words and removes extra spaces
      expanded_words.append(contractions.fix(word))   
    expanded_text = ' '.join(expanded_words)
    return expanded_text


 def remove_special_characters(self, text:str)->str:
      '''
    Removes the email, special character and numbers from text.

        Special character includes ! @ # $ & * () + -.

        Parameters
        ----------
        text : str
            String containing special character

        Returns
        -------
        String without email, special character and numbers
    '''
      # remove email if any
      txt_email = re.compile(r'[A-Za-z0-9]*@[A-Za-z]*\.com')
      cln_txt = txt_email.sub('', text)
      # remove special character and number if any
      self.spec_filtered = re.sub('[^A-Za-z]+', ' ', cln_txt)      
      return self.spec_filtered

 def tokenize(self,text:str)->list:
   '''
     Tokenize the text  to form list.
        Use nltk.word_tokenize.

        Parameters
        ----------
        text : str, 
        text to  tokenize
            
        Returns
        -------
        list of tokenized words
    '''

   nltk_tokens = nltk.word_tokenize(text)
   return nltk_tokens


 def removal_stop_words(self,tokens:list, language:str='english')->list:
   '''
    Removes the stop words from list.

        Use stopwords from nltk.corpus.

        Parameters
        ----------
        token : list
            words token
        language : str, optional 
        Language of the words (default is english) 

        Returns
        -------
        list of words without stop words
    '''
   stopword_list = nltk.corpus.stopwords.words(language)
   self.rm_stopwrds = [word for word in tokens if not word in stopword_list]
   return self.rm_stopwrds

 def stem_or_lem(self, tokens:list,method:str)->list:
   '''
    Perform Stemming or lemmatization.
    If the argument method is 'stemm' then performs stemmization, performs
    lemmitization if 'lemm' and return tokens for mismatched strings
    PorterStemmer  from nltk for stemming
    WordNetLemmatizer from nltk for lemmatization

        Parameters
        ----------
        tokens : list
           list of tokenized words

        Returns
        -------
        return words after Stemming or Lemmatization
    '''
   #instance of PorterStemmer 
   ps = PorterStemmer()
   stemmed=[]
   lemmed=[]
   if method =='stemm':
    for w in tokens:
        rootWord=ps.stem(w)
        stemmed.append(rootWord)
    return stemmed
   elif method =='lemm':
     wordnet_lemmatizer = WordNetLemmatizer()
     for w in tokens:
        lemm = wordnet_lemmatizer.lemmatize(w)
        lemmed.append(lemm)
     return lemmed
   else:
      return tokens

 def preprocessed_text(self,text:str)->list:
    '''
    Perfoms all the operation of text preprocessing.


        Parameters
        ----------
        text : str, 
            string to be preprocessed
        Returns
        -------
        returns list of words after performing
        contaraction, removal of special characters, tokenization, removal of
        stop word, stemmization and lemmitization

    '''
    exp_text=self.expand_contraction(text)
    prune_special=self.remove_special_characters(exp_text)
    tokenize_words=self.tokenize(prune_special)
    remove_stopwords=self.removal_stop_words(tokenize_words,'english')
    stemmed =self.stem_or_lem(remove_stopwords,'stemm')
    self.clean =self.stem_or_lem(stemmed,'lemm')
    return self.clean

 def bow(self,sentence:str):
   ''' 
   Create bag of words for a sentence

      Parameters
      ----------
      sentence: str
        sentence to form bag of words

    returns  bags of words for a sentence
      
   '''
   self.bag_of_words = self.preprocessed_text(sentence)
   return self.bag_of_words
 
 def get_corpus_words(self,text:list)->dict:
    ''' 
    Creates list with all words from corpus

        Parameter
        --------
        text : list
          list of strings

        Returns
        -------
         returns list  of words in corpus
    '''
    i = 0
    # iterate through list of sentences
    for sent in text:
      clean_text = self.preprocessed_text(sent)
      # iterate through words in sentence
      for word in clean_text:
        # unique words
        if word not in self.corpus_words:
          self.corpus_words.append(word)
    return self.corpus_words

 def get_doc_list(self, text:list):
    '''
    Returns the list of all documents in the corpus

      Parameter
      ---------
      text : list 
        list of documents before preprocessing

      Returns
      -------
        list of documents after preprocessing
   '''
    arr = []
    for sent in text:
      bow = self.bow(sent)
      # create dict for bag of words
      word_dict = self.get_word_dict(text, bow)
      # add the bow to list
      arr.append(word_dict)
    return arr

 def get_word_dict(self,text, bow):
   '''
    Returns the dict for bow

      Parameter
      ---------
      text : list 
        list of documents before preprocessing

      bow : list 
         list of words of a documents

      Returns
      -------
        (key ,value) 
        Key : word
        value : frequency of words in document
   '''
   
   all_words = self.get_corpus_words(text)
  #  words dictionary with value 0
   wordDict = dict.fromkeys(all_words, 0) 
   for word in bow:
    # count occurence of each words 
    wordDict[word] += 1
   return  wordDict
   
 def computeTF(self, wordDict,bow):
      '''
      Computes TF for each word in  BOW

      Parameter
      ---------
        WordDict : dict 
          key : words
          value: occurence of words in document

          bow: list
           Bag of words for the document

      Returns
      --------
        dict
        key : word
        value : term frequency of the word

      '''
      tfDict = {}
      bowCount = len(bow)
      print("bowCount",bowCount)
      for word, count in wordDict.items():
        # term frequency for words in a sentence
          tfDict[word] = count/float(bowCount)
      print(tfDict)
      return tfDict

 def computeIDF(self, docList):
    '''
      Computes IDFs for all word in doclist

      Parameter
      ---------
        doclist : list  
          list of pre-processed documents

      Returns
      --------
        dict
        key : word
        value : IDFS for each word

      '''
    idfDict = {}
    N = len(docList)
    # dict with all words with value 0(template)
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                # increase the value if the word exist in doc
                idfDict[word] += 1
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / float(val))
    return idfDict

 def computeTFIDF(self, tfBOW, idfs):
    '''
    Computes TFIDFs for all word in documents

      Parameter
      ---------
        tfBow : dict  
          Key: word
          value: tf of word in document

        idfs: dict
          key : Word
          value: idf of word in document list 
        

      Returns
      --------
        dict
        key : word
        value : TfIDFS for each word in document

    
    '''
    tfidf = {}
    for word, val in tfBOW.items():
        tfidf[word] = val*idfs[word]
    return tfidf
 
 def dict_to_df(self, text):
   '''
   Performs all above operations to 
   Generates pandas dafarame
   

      Parameter
      ---------
        text : list of documents (corpus)  
                  

      Returns
      --------
        Pandas dataframe
    '''
   arr = []
   doclist = self.get_doc_list(text)
   idfs = self.computeIDF(doclist)
   for sent in text:
     bow = self.bow(sent)
     word_dict = self.get_word_dict(text,bow)
     tfBOW = self.computeTF(word_dict,bow)
    #  print("tfBOW",tfBOW)
     tfidf = self.computeTFIDF(tfBOW,idfs)
     arr.append(tfidf)
   df = pd.DataFrame(arr)
   return df
  
text2 =['This is a good movie.',
      'It is a good movie, but you know good is relative.',
      'Movie is fun to watch.',
      'I had a good relaxing time.',
      'The whole cinema experience was good.',
      'This is a good cinema.']
 
  
obj = Text_preprocessor()
print(obj.dict_to_df(text2))
obj.dict_to_df(text2)






bowCount 3
{'thi': 0.3333333333333333, 'good': 0.3333333333333333, 'movi': 0.3333333333333333, 'It': 0.0, 'know': 0.0, 'rel': 0.0, 'fun': 0.0, 'watch': 0.0, 'I': 0.0, 'relax': 0.0, 'time': 0.0, 'the': 0.0, 'whole': 0.0, 'cinema': 0.0, 'experi': 0.0}
bowCount 6
{'thi': 0.0, 'good': 0.3333333333333333, 'movi': 0.16666666666666666, 'It': 0.16666666666666666, 'know': 0.16666666666666666, 'rel': 0.16666666666666666, 'fun': 0.0, 'watch': 0.0, 'I': 0.0, 'relax': 0.0, 'time': 0.0, 'the': 0.0, 'whole': 0.0, 'cinema': 0.0, 'experi': 0.0}
bowCount 3
{'thi': 0.0, 'good': 0.0, 'movi': 0.3333333333333333, 'It': 0.0, 'know': 0.0, 'rel': 0.0, 'fun': 0.3333333333333333, 'watch': 0.3333333333333333, 'I': 0.0, 'relax': 0.0, 'time': 0.0, 'the': 0.0, 'whole': 0.0, 'cinema': 0.0, 'experi': 0.0}
bowCount 4
{'thi': 0.0, 'good': 0.25, 'movi': 0.0, 'It': 0.0, 'know': 0.0, 'rel': 0.0, 'fun': 0.0, 'watch': 0.0, 'I': 0.25, 'relax': 0.25, 'time': 0.25, 'the': 0.0, 'whole': 0.0, 'cinema': 0.0, 'experi': 0.0}
bowCoun

Unnamed: 0,thi,good,movi,It,know,rel,fun,watch,I,relax,time,the,whole,cinema,experi
0,0.15904,0.026394,0.100343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.026394,0.050172,0.129692,0.129692,0.129692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.100343,0.0,0.0,0.0,0.259384,0.259384,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.019795,0.0,0.0,0.0,0.0,0.0,0.0,0.194538,0.194538,0.194538,0.0,0.0,0.0,0.0
4,0.0,0.015836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15563,0.15563,0.095424,0.15563
5,0.15904,0.026394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15904,0.0


In [None]:

bow = obj.bow(text2[0])
bow


['thi', 'good', 'movi']

In [None]:
word_dict = obj.get_word_dict(text2,bow)
word_dict


{'I': 0,
 'It': 0,
 'cinema': 0,
 'experi': 0,
 'fun': 0,
 'good': 1,
 'know': 0,
 'movi': 1,
 'rel': 0,
 'relax': 0,
 'the': 0,
 'thi': 1,
 'time': 0,
 'watch': 0,
 'whole': 0}

In [None]:
tfBOW = obj.computeTF(word_dict,bow)


bowCount 3
{'thi': 0.3333333333333333, 'good': 0.3333333333333333, 'movi': 0.3333333333333333, 'It': 0.0, 'know': 0.0, 'rel': 0.0, 'fun': 0.0, 'watch': 0.0, 'I': 0.0, 'relax': 0.0, 'time': 0.0, 'the': 0.0, 'whole': 0.0, 'cinema': 0.0, 'experi': 0.0}


In [None]:
doclist = obj.get_doc_list(text2)
idfs = obj.computeIDF(doclist)
idfs

{'I': 0.7781512503836436,
 'It': 0.7781512503836436,
 'cinema': 0.47712125471966244,
 'experi': 0.7781512503836436,
 'fun': 0.7781512503836436,
 'good': 0.07918124604762482,
 'know': 0.7781512503836436,
 'movi': 0.3010299956639812,
 'rel': 0.7781512503836436,
 'relax': 0.7781512503836436,
 'the': 0.7781512503836436,
 'thi': 0.47712125471966244,
 'time': 0.7781512503836436,
 'watch': 0.7781512503836436,
 'whole': 0.7781512503836436}

In [None]:
tfidf = obj.computeTFIDF(tfBOW,idfs)
tfidf

{'I': 0.0,
 'It': 0.0,
 'cinema': 0.0,
 'experi': 0.0,
 'fun': 0.0,
 'good': 0.026393748682541605,
 'know': 0.0,
 'movi': 0.10034333188799373,
 'rel': 0.0,
 'relax': 0.0,
 'the': 0.0,
 'thi': 0.15904041823988746,
 'time': 0.0,
 'watch': 0.0,
 'whole': 0.0}