In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime


from nltk.corpus import wordnet
from nltk.corpus import stopwords
import nltk
import nltk.data
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import string
from langdetect import detect
import re
import spacy
from spacy.symbols import ORTH, NORM
from langdetect import detect_langs
import langid
from bs4 import BeautifulSoup

from wordcloud import WordCloud, STOPWORDS

import warnings
# Silence every UserWarning and every pandas PerformanceWarning for the session.
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Widen the DataFrame display limits: up to 200 columns and 100 rows are shown,
# and max_colwidth=None removes the truncation of long cell contents.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)


In [2]:
# Load at most the first 10,000 rows of the comma-separated raw dataset.
# NOTE(review): the cleaning functions below use 'title', 'body' and 'tags'
# columns, so the CSV presumably provides exactly those three - confirm.
data = pd.read_csv('./dataset/dataset.csv',sep=",",nrows=10000)

In [3]:
def logIt(caller,s):
    """Print a timestamped log line for one pipeline step.

    The line has the form ``YYYY-MM-DD HH:MM:SS - <caller> <s>`` where
    ``caller`` is left-aligned and padded with spaces to 25 characters.

    Args:
        caller: Label identifying the step that emits the message.
        s: Message text (must be a string, it is concatenated as-is).
    """
    timestamp = datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    padded_caller = format(caller, ' <25')
    message = padded_caller + ' ' + s
    print(' - '.join((timestamp, message)))

In [4]:
def removeNonEngRecords(df,caller):
    """Keep only the rows whose title is classified as English.

    The language of every ``title`` value is detected with
    ``langid.classify``; rows whose detected language code is not ``"en"``
    are dropped from ``df`` in place.

    Args:
        df: DataFrame with a ``title`` column. It is modified in place.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object, restricted to the English-titled rows.
    """
    # Classify the titles of the frame that was passed in, not those of the
    # notebook-level ``data`` global, so the function works on any frame.
    languages = [langid.classify(title)[0] for title in df['title']]
    non_english = np.array([lang != "en" for lang in languages], dtype=bool)

    # Drop by index label and in place, exactly as the previous implementation
    # did, but without adding and removing a temporary 'lang' column.
    df.drop(df.index[non_english], inplace=True)

    return df

In [5]:
def mergeTitle(df, caller):
    """Prefix every question body with its title.

    The ``body`` column is replaced by ``title + ' ' + body``. The frame is
    modified in place and the rebuilt ``body`` column ends up as the last
    column; all other columns keep their names and relative order.

    Args:
        df: DataFrame with ``title`` and ``body`` columns.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object with the merged ``body`` column.
    """
    # ``pop`` removes the old column in place and the assignment appends the
    # merged text as the last column. Columns are addressed by name, so extra
    # columns or a different column order can no longer be mislabelled.
    df['body'] = df['title'] + ' ' + df.pop('body')

    return df

In [6]:
def extractText(df, caller):
    """Replace the HTML ``body`` column with its plain text.

    Each body is parsed with BeautifulSoup. Every ``pre``, ``code`` and ``a``
    element is removed together with its content, and the remaining text is
    kept. The frame is modified in place and the rebuilt ``body`` column ends
    up as the last column.

    Args:
        df: DataFrame with a ``body`` column holding HTML strings.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object with ``body`` reduced to plain text.
    """
    removed_tags = ('pre', 'code', 'a')
    texts = []

    # Read the bodies of the frame that was passed in, not those of the
    # notebook-level ``data`` global.
    for body in df['body']:
        if not isinstance(body, str):
            # A missing body (NaN/None) has no text to extract.
            texts.append('')
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(body, 'html.parser')

        for tag_name in removed_tags:
            # Search again after each removal: removing an element also
            # removes anything nested inside it, so a list collected up front
            # could contain elements that are already destroyed.
            tag = soup.find(tag_name)
            while tag is not None:
                tag.decompose()
                tag = soup.find(tag_name)

        texts.append(soup.get_text())

    # Replace the column by name; the new 'body' becomes the last column.
    df.drop(['body'], inplace=True, axis=1)
    df['body'] = texts

    return df
    

In [7]:
def removeLineFeed(df, caller):
    """Replace every line feed in the ``body`` column with a space.

    The frame is modified in place and the rebuilt ``body`` column ends up as
    the last column.

    Args:
        df: DataFrame with a ``body`` column of strings.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object with line feeds removed from ``body``.
    """
    # Work on the frame that was passed in (not the notebook-level ``data``
    # global). ``pop`` removes the old column in place and the assignment
    # appends the cleaned one, so no positional column renaming is needed.
    df['body'] = df.pop('body').str.replace('\n', ' ', regex=False)

    return df
    

In [8]:
def removeUselessChars(df, caller):
    """Blank out every character of ``body`` that is not a letter, space or '+'.

    Each character outside ``A-Z``, ``a-z``, space and ``+`` is replaced by a
    single space (so the text length is unchanged). The frame is modified in
    place and the rebuilt ``body`` column ends up as the last column.

    Args:
        df: DataFrame with a ``body`` column of strings.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object with the cleaned ``body`` column.
    """
    # One substitution pass over the frame that was passed in; the previous
    # version read the notebook-level ``data`` global and substituted twice.
    df['body'] = df.pop('body').str.replace(r'[^A-Za-z +]', ' ', regex=True)

    return df
    

In [9]:
def toLowerCase(df, caller):
    """Convert the ``body`` column to lower case.

    The frame is modified in place and the rebuilt ``body`` column ends up as
    the last column.

    Args:
        df: DataFrame with a ``body`` column of strings.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object with a lower-cased ``body`` column.
    """
    # Use the frame that was passed in rather than the notebook-level
    # ``data`` global, and replace the column by name instead of by position.
    df['body'] = df.pop('body').str.lower()

    return df

In [10]:
def remove_stopwords(text):
    """Return ``text`` without its English stop words.

    The text is split with ``word_tokenize``; tokens found in NLTK's English
    stop-word list are discarded and the remaining tokens are joined with a
    single space.

    Args:
        text: The string to filter.

    Returns:
        The filtered text as one space-separated string.
    """
    english_stopwords = set(stopwords.words('english'))

    kept_tokens = []
    for token in word_tokenize(text):
        if token not in english_stopwords:
            kept_tokens.append(token)

    return ' '.join(kept_tokens)
    

In [11]:
def removeStopwords(df, caller):
    """Remove English stop words from every value of the ``body`` column.

    ``remove_stopwords`` is applied to each body. The frame is modified in
    place and the rebuilt ``body`` column ends up as the last column.

    Args:
        df: DataFrame with a ``body`` column of strings.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object with stop words removed from ``body``.
    """
    # Replace the column by name; the previous positional renaming assumed
    # the frame had exactly three columns in a fixed order.
    df['body'] = df.pop('body').apply(remove_stopwords)

    return df
  

In [12]:
def lemmatize_stem_text(text):
    """Lemmatize then stem every whitespace-separated token of ``text``.

    Each token is lemmatized as a verb (``pos='v'``) with
    ``WordNetLemmatizer`` and the lemma is then stemmed with
    ``EnglishStemmer``.

    Args:
        text: The string to normalize.

    Returns:
        The processed tokens joined with a single space.
    """
    tokenizer = nltk.tokenize.WhitespaceTokenizer()
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    stemmer = EnglishStemmer()

    return ' '.join(
        stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
        for token in tokenizer.tokenize(text)
    )


In [13]:
def normalize(df, caller):
    """Lemmatize and stem every value of the ``body`` column.

    ``lemmatize_stem_text`` is applied to each body. The frame is modified in
    place and the rebuilt ``body`` column ends up as the last column.

    Args:
        df: DataFrame with a ``body`` column of strings.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object with a normalized ``body`` column.
    """
    # Replace the column by name; the previous positional renaming assumed
    # the frame had exactly three columns in a fixed order.
    df['body'] = df.pop('body').apply(lemmatize_stem_text)

    return df

In [14]:
def removeNanBodies(df, caller):
    """Drop the rows whose ``body`` is missing or an empty string.

    Rows are dropped from ``df`` in place; no column is added or reordered.

    Args:
        df: DataFrame with a ``body`` column.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object without the rows that have no body text.
    """
    bodies = df['body']
    # A length comparison alone misses NaN (NaN == 0 is False), so missing
    # values are tested explicitly alongside the empty string.
    missing_or_empty = (bodies.isna() | (bodies == '')).to_numpy()

    df.drop(df.index[missing_or_empty], inplace=True)

    return df

In [15]:
def removeNanTags(df, caller):
    """Drop the rows whose ``tags`` value is missing or an empty string.

    Rows are dropped from ``df`` in place; no column is added or reordered.

    Args:
        df: DataFrame with a ``tags`` column.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object without the rows that have no tags.
    """
    tags = df['tags']
    # A length comparison alone misses NaN (NaN == 0 is False), so missing
    # values are tested explicitly alongside the empty string.
    missing_or_empty = (tags.isna() | (tags == '')).to_numpy()

    df.drop(df.index[missing_or_empty], inplace=True)

    return df

In [16]:
def setupTags(df, caller):
    """Turn the ``tags`` column into comma-separated tag lists.

    A value such as ``'<python><pandas>'`` becomes ``'python,pandas'``: every
    ``<`` is turned into a comma, every ``>`` is removed and the resulting
    leading comma is stripped. The frame is modified in place and the rebuilt
    ``tags`` column ends up as the last column.

    Args:
        df: DataFrame with a ``tags`` column of strings.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object with the reformatted ``tags`` column.
    """
    # Read the tags of the frame that was passed in, not those of the
    # notebook-level ``data`` global.
    raw_tags = df.pop('tags')

    df['tags'] = (
        raw_tags
        .str.replace('<', ',', regex=False)
        .str.replace('>', '', regex=False)
        # Strip only a leading comma, so a value that does not start with
        # '<' does not lose the first character of its first tag.
        .str.replace(r'^,', '', regex=True)
    )

    return df

In [17]:
def dropDuplicates(df, caller):
    """Remove repeated tokens from every value of the ``body`` column.

    Each body is split on whitespace and only the first occurrence of every
    token is kept, in its original position; the tokens are joined back with
    a single space. The frame is modified in place and the rebuilt ``body``
    column ends up as the last column.

    Args:
        df: DataFrame with a ``body`` column of strings.
        caller: Label of the calling step; not used by this function.

    Returns:
        The same ``df`` object with de-duplicated ``body`` text.
    """
    def _unique_tokens(text):
        # dict keys keep insertion order, so the output is deterministic,
        # unlike a set whose iteration order for strings varies between runs.
        return ' '.join(dict.fromkeys(text.split()))

    # Read the bodies of the frame that was passed in, not those of the
    # notebook-level ``data`` global.
    df['body'] = df.pop('body').map(_unique_tokens)

    return df

In [18]:
def preprocessing_pipeline(data):
    """Run the whole pre-processing pipeline and save the result as CSV.

    The steps are applied in this order, each one framed by a 'Starting' and
    a 'Completed' log line: non-English record removal, HTML text extraction,
    line-feed removal, useless-character removal, lower-casing, stop-word
    removal, lemmatization/stemming, empty-body removal, empty-tag removal
    and tag formatting. The ``title`` column is then dropped and the frame is
    written to './dataset/preprocessed_dataset.csv' without its index.

    Args:
        data: DataFrame to pre-process; the steps modify it in place.

    Returns:
        None.
    """
    # (log label, step function) pairs, in execution order. mergeTitle (before
    # the HTML extraction) and dropDuplicates (after the tag setup) are
    # defined in the notebook but are not part of the pipeline.
    steps = [
        ('[Remove non eng. rec]', removeNonEngRecords),
        ('[Extract text from HTML]', extractText),
        ('[Remove line feed]', removeLineFeed),
        ('[Remove useless chars]', removeUselessChars),
        ('[Lower]', toLowerCase),
        ('[Remove Stop words]', removeStopwords),
        ('[Lemmatization]', normalize),
        ('[Remove nan bodies]', removeNanBodies),
        ('[Remove nan tags]', removeNanTags),
        ('[Setup Tags]', setupTags),
    ]

    for label, step in steps:
        logIt(label, 'Starting')
        data = step(data, 'test')
        logIt(label, 'Completed')

    drop_label = '[Drop title]'
    logIt(drop_label, 'Starting')
    data.drop(columns=['title'], inplace=True)
    logIt(drop_label, 'Completed')

    save_label = '[Save preprocessed dataset]'
    logIt(save_label, 'Starting')
    data.to_csv('./dataset/preprocessed_dataset.csv', index=False)
    logIt(save_label, 'Completed')
    

In [19]:
# Run the full pipeline on the loaded frame. The call returns nothing: its
# results are the in-place changes to `data` and the CSV file written to
# ./dataset/preprocessed_dataset.csv.
preprocessing_pipeline(data)

2021-05-01 17:54:22 - [Remove non eng. rec]     Starting
2021-05-01 17:54:35 - [Remove non eng. rec]     Completed
2021-05-01 17:54:35 - [Extract text from HTML]  Starting
2021-05-01 17:54:38 - [Extract text from HTML]  Completed
2021-05-01 17:54:38 - [Remove line feed]        Starting
2021-05-01 17:54:38 - [Remove line feed]        Completed
2021-05-01 17:54:38 - [Remove useless chars]    Starting
2021-05-01 17:54:38 - [Remove useless chars]    Completed
2021-05-01 17:54:38 - [Lower]                   Starting
2021-05-01 17:54:38 - [Lower]                   Completed
2021-05-01 17:54:38 - [Remove Stop words]       Starting
2021-05-01 17:54:42 - [Remove Stop words]       Completed
2021-05-01 17:54:42 - [Lemmatization]           Starting
2021-05-01 17:54:49 - [Lemmatization]           Completed
2021-05-01 17:54:49 - [Remove nan bodies]       Starting
2021-05-01 17:54:49 - [Remove nan bodies]       Completed
2021-05-01 17:54:49 - [Remove nan tags]         Starting
2021-05-01 17:54:49 - [