## Import the necessary libraries

In [2]:
# Perform standard imports
from bs4 import BeautifulSoup
import re
from unidecode import unidecode
from word2number import w2n
import string
from itertools import groupby 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import spacy
import contractions
from nltk.stem.snowball import SnowballStemmer
from collections import Counter
import pandas as pd
from itertools import chain
import textdistance
from tqdm import tqdm
from nltk import WordPunctTokenizer
from nltk.corpus import stopwords
import nltk
import csv
from ast import literal_eval as make_tuple


In [3]:
# Load the raw test split. Quote characters are treated as ordinary text
# (QUOTE_NONE) and a backslash is used as the escape character.
data = pd.read_csv(
    'dataset/test.csv',
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)

# Text Preprocessing

## What is text preprocessing?

### To preprocess your text simply means to bring your text into a form that is predictable and analyzable for your task. A task here is a combination of approach and domain.

## Instruction for setup

### Virtual environment
- First, open a command line/terminal and create a virtual environment using virtualenv.
- Create the virtual environment: `virtualenv name`
- Activate the virtual environment: Linux/macOS: `source name/bin/activate`; Windows: `name\Scripts\activate`
- Deactivate the virtual environment: `deactivate`


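### Install the following Python libraries:
- jupyter
- beautifulsoup4
- nltk
- pandas
- tqdm
- Unidecode
- word2number
- spacy
- textdistance
- a `contractions` module that exposes `CONTRACTION_MAP` (used by `expand_contractions`)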


## Text Preprocessing which will be performed

- Remove duplicates
- Remove null values in titles
- Randomly select 100 rows for each browse node id, after dropping browse node ids that have 100 or fewer rows.
- Converting the text to lowercase.
- Removal of any html markups.
- Removal of line breaks.
- Removal of website links.
- Convert non-ascii characters to ascii.
- Expand contractions.
- Remove digits from strings (optional).
- Remove all emojis.
- Reduce runs of repeated characters to 2 or 1 (optional).
- Collapse repeated punctuation to a single character.
- Remove all punctuation except . , ! ' ?
- Remove all punctuation (optional).
- Remove extra whitespace in between characters.
- Remove extra whitespace at the ends.
- Stemming(optional)
- Stop words removal(optional)

## Import Libraries

## Remove duplicate punctuation

In [7]:
def removeDups(s):
    """Collapse each run of one repeated punctuation character into a single character.

    Runs of any other character (letters, digits, whitespace) are kept at
    their full length.
    """
    punctuation_chars = set(string.punctuation)
    # groupby yields (character, run-of-that-character) pairs in order.
    pieces = [
        char if char in punctuation_chars else ''.join(run)
        for char, run in groupby(s)
    ]
    return ''.join(pieces)

## Remove emojis

In [8]:
def remove_emojis(text, remove=True):
    """Delete characters that fall in four emoji code-point ranges.

    When ``remove`` is true, matching characters are removed and the result
    is also stripped of leading/trailing whitespace. Otherwise ``text`` is
    returned exactly as given.
    """
    if not remove==True:
        return text

    # Code-point ranges joined into a single character class.
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
    )
    emoji_pattern = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text).strip()

## Remove certain punctuations

In [9]:
def remove_certain_punctuation(text, remove=True):
    """Replace every character with a space except letters, digits, space, newline and . , ! ' ?

    When ``remove`` is not true the text is returned unchanged.
    """
    return re.sub("[^.,!'?a-zA-Z0-9 \n]", ' ', text) if remove==True else text

## Remove all punctuations

In [10]:
def remove_all_punctuation(text, remove=False):
    """Replace every character that is not a letter, digit, space or newline with a space.

    This is off by default: unless ``remove`` is true the text is returned
    unchanged.
    """
    return re.sub("[^a-zA-Z0-9 \n]", ' ', text) if remove==True else text

## Stemming

In [11]:
def stemming(text, apply=False, stemmer=None):
    """Stem every whitespace-separated word of ``text``.

    Parameters
    ----------
    text : str
        Text to process.
    apply : bool, default False
        When not true, ``text`` is returned unchanged.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to use. When omitted, the notebook-level ``s_stemmer`` is used
        if a cell has defined it; otherwise an English ``SnowballStemmer`` is
        created, so the function also works on a fresh kernel.

    Returns
    -------
    str
        The stemmed words joined by single spaces (runs of whitespace in the
        input are therefore collapsed), or ``text`` itself when ``apply`` is
        not true.
    """
    if not apply==True:
        return text

    if stemmer is None:
        try:
            stemmer = s_stemmer
        except NameError:
            # No visible cell defines `s_stemmer`; fall back to the stemmer
            # class imported at the top of the notebook.
            # NOTE(review): English is assumed here -- confirm.
            stemmer = SnowballStemmer(language='english')

    return ' '.join(stemmer.stem(word) for word in text.split())

## Remove redundant characters to 2 or 1

In [12]:
def remove_redun_to_2_or_1(text,choice=2,apply=True):
    """Shorten runs of repeated characters in ``text``.

    Parameters
    ----------
    text : str
        Text to process.
    choice : {1, 2}, default 2
        1 -- collapse every run of the same word character (``\\w``: letters,
             digits, underscore) to a single character.
        2 -- collapse runs of three or more of the same lowercase ASCII
             letter to exactly two; uppercase letters, digits and other
             characters are left alone.
    apply : bool, default True
        When not true, ``text`` is returned unchanged.

    Returns
    -------
    str

    Raises
    ------
    ValueError
        If ``apply`` is true and ``choice`` is neither 1 nor 2.
    """
    if not apply==True:
        return text

    if choice==1:
        return re.sub(r'(\w)\1*', r'\1', text)

    if choice==2:
        # One backreference pattern does the work of a separate regex per
        # letter: a letter followed by two or more copies of itself.
        return re.sub(r'([a-z])\1{2,}', r'\1\1', text)

    raise ValueError("choice must be 1 or 2, got {!r}".format(choice))

## Word to number

In [13]:
def word_to_num(text, apply=False):
    """Replace spelled-out cardinal numbers in ``text`` with digits.

    When ``apply`` is true, the text is parsed with ``nlp``, every entity
    labelled ``CARDINAL`` is collected sentence by sentence, and each
    non-digit entity is converted with ``w2n.word_to_num``. Entities that
    raise ``ValueError`` during conversion are left as they are. Each entity
    string is then substituted throughout ``text`` with ``str.replace``.
    When ``apply`` is not true, ``text`` is returned unchanged.

    NOTE(review): ``nlp`` is not defined in any visible cell; it is
    presumably a loaded spaCy pipeline -- confirm.
    """
    if apply==True:
        doc = nlp(text)
        cardinals = [
            str(ent)
            for sent in doc.sents
            for ent in sent.ents
            if ent.label_=='CARDINAL'
        ]
        replacements = list(cardinals)

        for idx, phrase in enumerate(cardinals):
            if phrase.isdigit():
                continue
            try:
                replacements[idx] = str(w2n.word_to_num(phrase))
            except ValueError:
                # Not convertible: keep the original wording for this entity.
                pass

        for phrase, number in zip(cardinals, replacements):
            text = text.replace(phrase, number)
    return text

## Expand Contractions

In [14]:
def expand_contractions(text, contraction_mapping=contractions.CONTRACTION_MAP):
    """Replace each contraction found in ``text`` with its expanded form.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict
        Maps a contraction (for example ``"can't"``) to its expansion.
        Matching is case-insensitive.

    Returns
    -------
    str
        ``text`` with every matched contraction expanded. A match with no
        usable expansion is left in place.
    """
    if not contraction_mapping:
        return text

    # Longest keys first, so the alternation tries a long contraction before
    # a shorter key that is its prefix. Keys are escaped so they are matched
    # literally rather than interpreted as regex syntax.
    ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE|re.DOTALL,
    )
    # Case-insensitive fallback for text whose casing differs from the keys
    # (for example lowercased text against a capitalised key).
    lowercase_lookup = {key.lower(): value for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (
            contraction_mapping.get(match)
            or contraction_mapping.get(match.lower())
            or lowercase_lookup.get(match.lower())
        )
        # Fall back to the matched text so a missed lookup cannot erase it
        # from the output.
        return expanded_contraction if expanded_contraction else match

    return contractions_pattern.sub(expand_match, text)

## Remove stopwords

In [25]:
def stop_words_removal(text, apply=False):
    """Drop whitespace-separated words that ``nlp.vocab`` flags as stop words.

    When ``apply`` is true, the remaining words are returned joined by single
    spaces. Otherwise ``text`` is returned unchanged.

    NOTE(review): ``nlp`` is not defined in any visible cell; it is
    presumably a loaded spaCy pipeline -- confirm.
    """
    if not apply==True:
        return(text)

    kept_words = [word for word in text.split() if nlp.vocab[word].is_stop==False]
    return ' '.join(kept_words)

## Final preprocessing

In [26]:
def preprocess(text):
    """Run the full cleaning pipeline on one piece of text and return the cleaned string.

    Steps, in order: lowercase; strip HTML markup; turn line breaks, tabs and
    non-breaking spaces into spaces; drop website links; convert to ASCII;
    expand contractions; remove emojis; shorten repeated letters; collapse
    repeated punctuation; replace all but a small set of punctuation with
    spaces; collapse repeated spaces; strip the ends.

    ``remove_all_punctuation``, ``word_to_num``, ``stemming`` and
    ``stop_words_removal`` are called with their defaults, which leave the
    text unchanged.

    Parameters
    ----------
    text : any
        The raw text; it is coerced with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """

    # coerce to str and lowercase the text
    # NOTE(review): a missing value is turned into its string form here
    # (presumably 'nan' for a NaN title) -- confirm nulls are filtered upstream
    text=str(text).lower()
    # remove html markup, putting a space where tags separated text
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text(separator=" ")
    # replace line breaks, tabs and non-breaking spaces with a plain space
    regex=re.compile(r'[\n\r\t\xa0]')
    text=regex.sub(" ",text)
    # remove website links
    text=re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''','',text)
    # convert the non-ascii characters to ascii
    text = unidecode(text)
    # first contraction pass, while apostrophes are still intact
    text=expand_contractions(text)
    # disabled step: remove strings that contain digits
    #text=re.sub(r'\w*\d\w*',' ',text)
    # remove all emojis
    # NOTE(review): unidecode has already run at this point, so emoji
    # characters have presumably been converted or dropped -- confirm
    text=remove_emojis(text)
    # shorten runs of 3+ repeated lowercase letters to 2 (default choice=2)
    text=remove_redun_to_2_or_1(text)
    # collapse runs of one repeated punctuation character into a single one
    text=removeDups(text)
    # replace everything except letters, digits, space, newline and . , ! ' ? with a space
    text=remove_certain_punctuation(text)
    # second contraction pass, after the punctuation clean-up
    text=expand_contractions(text)
    # no-op with the default remove=False; pass remove=True to also drop . , ! ' ?
    text=remove_all_punctuation(text)
    # collapse runs of spaces into a single space
    text=re.sub(' +', ' ',text)



    # no-op with the default apply=False; would convert number words to digits
    text=word_to_num(text)
    # remove leading and trailing whitespace
    text=text.strip()

    # both are no-ops with their default apply=False
    text=stemming(text)
    text=stop_words_removal(text)

    # disabled step: spell correction
    #text=spell_correction(text)
    return text
  
   

In [27]:
# Pull the raw titles and their product ids out of the frame as plain lists.
l1 = data.loc[:, 'TITLE'].tolist()
p_id = data.loc[:, 'PRODUCT_ID'].tolist()

In [28]:
# Clean every title; tqdm reports progress over the whole list.
lists = [preprocess(title) for title in tqdm(l1, desc="progress")]

progress: 100%|██████████████████████████████████████████████████████████████| 110775/110775 [01:21<00:00, 1362.60it/s]


In [30]:
# Pair each product id with its cleaned title.
svm_data = pd.DataFrame(dict(Product_id=p_id, Titles=lists))

In [27]:
# Count the distinct browse node ids.
# NOTE(review): `svm_data` as built in the cell above only has the columns
# 'Product_id' and 'Titles', so this lookup raises KeyError on a fresh
# top-to-bottom run. The recorded output presumably comes from a frame that
# had a 'Browse_node' column -- confirm, then move or drop this cell.
len(svm_data['Browse_node'].unique())

9919

In [31]:
# Persist the cleaned test titles (the frame's index is written as well).
svm_data.to_csv(path_or_buf='Preprocessed test data for classification.csv')

In [6]:
# Load previously preprocessed data from disk; this rebinds `svm_data`.
svm_data = pd.read_csv(
    filepath_or_buffer='dataset/Preprocessed data for classification.csv',
)

In [7]:
# Titles as a plain list so they can be rewritten element by element.
tits = svm_data.loc[:, 'Titles'].tolist()

In [8]:
# Build the stop-word set once. It does not change between titles, so
# rebuilding it inside the loop repeated the same work for every row.
STOPWORDS = set(stopwords.words('english'))

# Drop English stop words from every title, rewriting `tits` in place.
for i in tqdm(range(len(tits)),desc='progress'):
    tits[i] = ' '.join(word for word in str(tits[i]).split() if word not in STOPWORDS)

progress: 100%|████████████████████████████████████████████████████████████| 2902953/2902953 [12:11<00:00, 3968.39it/s]


In [9]:
# Browse node labels, in the same row order as the titles.
browse_id = svm_data.loc[:, 'Browse_node'].tolist()

In [12]:
# Stop-word-free titles alongside their browse node labels.
class_data = pd.DataFrame(dict(Titles=tits, Browse_node=browse_id))

In [13]:
# Persist the classification data (the frame's index is written as well).
class_data.to_csv(path_or_buf='Preprocessed data for classification.csv')

In [14]:
# keep=False drops every row whose title occurs more than once, so no copy of
# a repeated title survives. The deduplicated frame then overwrites the file.
class_data = class_data.drop_duplicates(subset='Titles', keep=False)
class_data.to_csv(path_or_buf='Preprocessed data for classification.csv')

In [32]:
# Reload the classification data from disk.
class_data = pd.read_csv(
    filepath_or_buffer='Preprocessed data for classification.csv',
)

In [14]:
# Number of titles per browse node, most frequent first.
de = (
    class_data
    .groupby(by='Browse_node', as_index=False)['Titles']
    .count()
    .rename(columns={"Titles": "frequency"})
    .sort_values('frequency', ascending=False)
)

In [19]:
# Keep only browse nodes with strictly more than 100 titles, so each one can
# supply a 100-row sample.
finder = de.loc[de['frequency'] > 100]
li = finder['Browse_node'].tolist()

In [51]:
# Rows that belong to one of the sufficiently frequent browse nodes.
df = class_data.loc[class_data['Browse_node'].isin(li)]

In [57]:
# Draw a fixed-size random sample from every browse node. The sample is
# seeded so that re-running the notebook selects the same rows.
SAMPLES_PER_NODE = 100
SAMPLE_SEED = 42

der = (
    df.groupby('Browse_node', as_index=False)
    .apply(lambda x: x.sample(SAMPLES_PER_NODE, random_state=SAMPLE_SEED))
    # apply() adds a per-group index level; drop it to get the row index back.
    .reset_index(level=0, drop=True)
)

In [60]:
# Sanity check: count the sampled titles per browse node.
(
    der
    .groupby(by='Browse_node', as_index=False)['Titles']
    .count()
    .rename(columns={"Titles": "frequency"})
    .sort_values('frequency', ascending=False)
)

Unnamed: 0,Browse_node,frequency
0,0,100
1654,9580,100
1647,9502,100
1648,9512,100
1649,9520,100
...,...,...
827,2131,100
828,2137,100
829,2140,100
830,2143,100


## Observations:

After removing the browse nodes that could lead to overfitting, we are left with around 2,478 browse node ids and approximately 247,800 documents (100 per browse node).