<a href="https://colab.research.google.com/github/ahmettalhasen/Text-Similarity-LDA/blob/master/projectCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
import time

import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')

from scipy.stats import entropy
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style("darkgrid")

!pip install pyspellchecker # For correcting the spell mistakes
from spellchecker import SpellChecker

!pip install langdetect
from langdetect import detect 
from langdetect.lang_detect_exception import LangDetectException

In [0]:
from google.colab import drive

# Mount Google Drive at a fixed, absolute location inside the Colab VM.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

# Importing the dataset.
# The path is built from the absolute mount point, so the read does not depend
# on the kernel's current working directory being /content.
# sep="|" because the fields of this file are separated by pipes.
df = pd.read_csv(GDRIVE_MOUNT_POINT + '/My Drive/summer2019/support_forum_questions.csv', sep="|")

In [0]:
# Dropping unnecessary columns and removing entries with NA.
# The unused columns are dropped *first*, so that a missing value in a column
# that is about to be discarded cannot remove an otherwise valid question.
df = df.drop(columns=['login', 'added'])
df = df.dropna(axis=0)

# Shuffle the rows; a fixed seed keeps the row order identical between runs.
SHUFFLE_SEED = 42
df = df.sample(frac=1.0, random_state=SHUFFLE_SEED)
df = df.reset_index(drop=True)

# Sanity check, shown as the cell output: number of nulls left per column.
df.isnull().sum()

In [0]:
#Clearing Html function
def clearhtml(raw_html):
    """
    Function that cleans html code.

    Every HTML tag, every line feed / carriage return and every
    non-breaking-space entity is replaced by a single space.

    Parameters
    ----------
    raw_html : str
        Text that may contain HTML markup.

    Returns
    -------
    str
        The text with tags, line breaks and non-breaking-space entities
        replaced by spaces. Whitespace is not collapsed, so runs of spaces
        may remain.
    """
    # A character class (unlike '.') also matches newlines, so a tag whose
    # attributes wrap over several lines is removed as a whole. Excluding '<'
    # keeps a stray '<' in the text from swallowing everything up to the
    # next real tag.
    cleantext = re.sub(r'<[^<>]*>', ' ', raw_html)
    cleantext = re.sub(r'[\n\r]', ' ', cleantext)
    # The entity is normally written with a trailing ';' -- consume it too so
    # it does not survive as a stray character.
    cleantext = re.sub(r'&nbsp;?', ' ', cleantext)
    return cleantext

# (pattern, replacement) pairs applied in order by initial_clean.
# Order matters: the specific contractions must run before the generic
# "n't" and "'s" rules, otherwise "can't" becomes "ca not" and "won't" "wo not".
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"can't", "can not "),
    (r"won't", "will not "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
)

def initial_clean(text):
    """
    Function that cleans emails, websites and any symbols/punctuations.

    Steps, in order:
      1. lower-case the text;
      2. replace every whitespace-delimited token that contains 'http',
         'www', '@' or '.com' with a space;
      3. expand common English contractions;
      4. replace the product name 'jotform' with the generic word 'form';
      5. replace every character that is not a letter or a space with a space.

    Parameters
    ----------
    text : str
        Raw (already HTML-free) text.

    Returns
    -------
    str
        Lower-case text containing only ASCII letters and spaces. Whitespace
        is not collapsed.
    """
    # Lower-case first so the URL patterns also catch 'HTTP://...' or 'WWW...'.
    text = text.lower()
    # The dot in '\.com' is escaped on purpose: unescaped it matches *any*
    # character followed by 'com', which deletes ordinary words such as
    # 'welcome' or 'computer'.
    text = re.sub(r"((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)|((\S+)?(\.com)(\S+)?)", " ", text)
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    text = re.sub(r"jotform", " form ", text)
    text = re.sub("[^a-zA-Z ]", " ", text)
    return text

def first_preprocess(text):
    """
    First cleaning stage: strip the HTML with clearhtml, then pass the
    result through initial_clean and return it.
    """
    without_html = clearhtml(text)
    return initial_clean(without_html)

In [0]:
#Preprocess I
t1 = time.time()
# Both free-text columns go through the same first cleaning stage.
for column in ('question', 'details'):
    df[column] = df[column].apply(first_preprocess)
# Title and details are merged into one text field per entry.
df['quest'] = df['question'] + ' ' + df['details']
#Deleting the questions less than 30 character size --> They are all test entries or spams
# (a row is kept only when the merged text is strictly longer than 30 characters)
is_long_enough = df['quest'].str.len() > 30
df = df[is_long_enough]
t2 = time.time()
print("Time to clean Html", len(df), "articles:", (t2-t1)/60, "min")

In [0]:
#Filtering the non-english questions out

def filter_language(text):
    """
    Detect the language of ``text``.

    Returns whatever ``detect`` returns for the text, or None when
    detection raises LangDetectException.
    """
    # NOTE(review): langdetect's README describes detection as non-deterministic
    # unless DetectorFactory.seed is set -- consider seeding it if the language
    # filter has to be reproducible.
    try:
        language = detect(text)
    except LangDetectException:
        # No language could be determined for this text.
        language = None
    return language
      
#Preprocess II
t3 = time.time()
# Keep only the rows detected as English.
# .copy() makes the result an independent frame, so the columns added to it in
# later cells do not trigger pandas' SettingWithCopyWarning.
is_english = df['quest'].apply(filter_language) == 'en'
df = df[is_english].copy()
t4 = time.time()
print("Time to filtering non-english questions took ", (t4-t3)/60, "min")

In [0]:

def tokenizer(text):
    """
    Split ``text`` into tokens with nltk.word_tokenize and return them.
    """
    return nltk.word_tokenize(text)
  
stop_words = stopwords.words('english')

# Extra words removed on top of the NLTK stop words: greetings, pleasantries
# and contractions written without their apostrophe.
_UNDESIRED_WORDS = frozenset([
    'would', 'hi', 'hello', 'thank', 'ive', 'havent', 'hasnt',
    'hadnt', 'arent', 'isnt', 'wouldnt', 'dont', 'werent',
    'couldnt', 'wont', 'cant', 'didnt', "doesnt", 'without',
    'please', 'thanks', 'could',
])

# One combined set, built once: a set membership test is constant-time, whereas
# testing against the stop_words list scans that list for every single token.
# Note: changes made to stop_words after this cell has run are not picked up.
_WORDS_TO_REMOVE = frozenset(stop_words) | _UNDESIRED_WORDS

def remove_stop_words_and_junk(text):
    """
    Function that removes all stopwords and undesired ones from text.

    Parameters
    ----------
    text : iterable of str
        The tokens of one document.

    Returns
    -------
    list of str
        The tokens, in their original order, that are neither in the stop-word
        list nor in the undesired-word list.
    """
    return [word for word in text if word not in _WORDS_TO_REMOVE]

def second_preprocess(text):
    """
    Second cleaning stage: tokenize the text with tokenizer, then filter the
    tokens with remove_stop_words_and_junk.
    """
    tokens = tokenizer(text)
    return remove_stop_words_and_junk(tokens)

In [0]:
#Preprocess III
t5 = time.time()
# One list of filtered tokens per merged question text.
tokenized_questions = df['quest'].apply(second_preprocess)
df['tokenized'] = tokenized_questions
t6 = time.time()
print("Time to tokenize and perfom the removals for", len(df), "questions took ", (t6-t5)/60, "min")

In [0]:
# Flatten the per-question token lists into one list of all tokens.
words = []
for row in df['tokenized']:
    words.extend(row)
# Frequency distribution over every token in the data set.
freqDist = FreqDist(words)

In [0]:
#Spell Checker is not used since this operation is too expensive
#Also this method corrupts some valuable words such as 'css'
spell = SpellChecker()
def spelling_mistake_corrector(word):
    """
    Function that corrects the spelling mistake.

    The spell checker's suggestion replaces the word only when the suggestion
    occurs in the corpus (per the global ``freqDist``) at least as often as the
    original word. This prevents frequent words from being "corrected" into
    rarer ones.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    str
        The suggested correction, or ``word`` itself when there is no
        suggestion or the suggestion is rarer than the original.
    """
    checkedWord = spell.correction(word)
    # Newer pyspellchecker releases return None when they have no candidate;
    # without this guard None would replace the word and break later stages.
    if checkedWord is None:
        return word
    if freqDist[checkedWord] >= freqDist[word]:
        return checkedWord
    return word
  
def correctorForAll(text):
    """
    Run spelling_mistake_corrector on every word of ``text`` and return the
    results as a new list, in the same order.
    """
    return list(map(spelling_mistake_corrector, text))

#Option 1
#Since our purpose is to get a good topic distribution, every form of a word
#should lead to the same topic. Stemming therefore gives better results.
stemmer = PorterStemmer()
def stem_words(text, min_length=2):
    """
    Function to stem words, so all forms of a word are treated in the same way.

    Parameters
    ----------
    text : iterable of str
        The tokens of one document.
    min_length : int, optional
        Minimum number of characters a stem must have to be kept. The default
        of 2 drops only one-letter stems.

    Returns
    -------
    list of str
        The stems, in order, that are at least ``min_length`` characters long.
    """
    stemmed = []
    for word in text:
        try:
            stem = stemmer.stem(word)
        except IndexError:
            # Keep this one token unstemmed instead of abandoning stemming and
            # length filtering for the whole text.
            stem = word
        if len(stem) >= min_length:
            stemmed.append(stem)
    return stemmed

#Option2
# NOTE(review): WordNetLemmatizer needs NLTK's 'wordnet' corpus at call time,
# but the import cell only downloads 'punkt' and 'stopwords' -- confirm it is
# available before switching to this option.
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """
    Lemmatize every word of ``text`` and return, in order, the lemmas that are
    longer than two characters.
    """
    lemmas = map(lemmatizer.lemmatize, text)
    return [lemma for lemma in lemmas if len(lemma) > 2]

def apply_corrector_and_lemmatizer(text):
    """
    Spell-correct every word with correctorForAll, then lemmatize and
    length-filter the result with lemmatize_words.
    """
    corrected = correctorForAll(text)
    return lemmatize_words(corrected)

In [0]:
#Preprocess IV
t7 = time.time()
# Replace each token list by its stemmed version (Option 1).
stemmed_tokens = df['tokenized'].apply(stem_words)
df['tokenized'] = stemmed_tokens
t8 = time.time()
print("Time to stem words for ", len(df), " questions took", (t8-t7)/60, "min")

In [0]:
#Dropping the questions that contain too few tokens
t9 = time.time()
MIN_TOKEN_NUMBER = 9
# Number of tokens left in each question after the cleaning stages.
df['quest_len'] = df['tokenized'].apply(len)
# Strict comparison: a question is kept only with more than MIN_TOKEN_NUMBER tokens.
has_enough_tokens = df['quest_len'] > MIN_TOKEN_NUMBER
df = df[has_enough_tokens]
t10 = time.time()
print("Time to drop out the questions with few words took ", (t10-t9)/60, "min")

In [0]:
# Renumber the rows from 0 after the filtering steps; drop=True discards the
# old index instead of keeping it as a column.
df = df.reset_index(drop=True)

In [0]:
def csv_formater(line):
    """
    Function to convert an array of words into a single string in which every
    word is followed by a comma (a non-empty result therefore ends with a
    trailing comma; an empty array gives an empty string).
    """
    return "".join(word + "," for word in line)

In [0]:
t11 = time.time()
# Comma-joined string version of each token list, ready to be stored as text.
comma_joined = df['tokenized'].apply(csv_formater)
df['words'] = comma_joined
t12 = time.time()
print("Time to make the file ready to store took ", (t12-t11)/60, "min")

Time to make the file ready to store took  0.019699708620707194 min


In [0]:
# Preview the first three rows of the cleaned frame.
df.head(3)

Unnamed: 0,id,question,details,quest,tokenized,quest_len,words
0,1262197,i upgraded to my account but form is still sho...,i used jot form from to and then we...,i upgraded to my account but form is still sho...,"[upgrad, account, form, still, show, quota, er...",29,"upgrad,account,form,still,show,quota,error,mes..."
1,180811,why does captcha not work first time,whenever i go to my website on which i have a...,why does captcha not work first time whenev...,"[captcha, work, first, time, whenev, go, websi...",25,"captcha,work,first,time,whenev,go,websit,form,..."
2,147439,is it possible to fix this formatting issue,hi we have successfuly created this logo ...,is it possible to fix this formatting issue ...,"[possibl, fix, format, issu, successfuli, crea...",20,"possibl,fix,format,issu,successfuli,creat,logo..."


In [0]:
# Saving the latest version as csv for practical use in modelling.
# - The path is absolute (Drive is mounted at /content/gdrive), so the write
#   does not depend on the kernel's current working directory.
# - index=False states explicitly that the row index is not written.
# - to_csv returns None when it is given a path; export_csv is kept only so
#   the name still exists for any later cell that refers to it.
# - The list-valued 'tokenized' column is stored as the text form of each list;
#   the comma-joined 'words' column is the one meant to be parsed on reload.
export_csv = df.to_csv('/content/gdrive/My Drive/summer2019/cleanData.csv', index=False, header=True)

In [0]:
# Read the exported file back to check what the modelling step will see.
# The path is absolute (Drive is mounted at /content/gdrive), so the read does
# not depend on the kernel's current working directory.
d = pd.read_csv('/content/gdrive/My Drive/summer2019/cleanData.csv')


In [0]:
# Preview the first three rows of the reloaded file.
d.head(3)

Unnamed: 0,id,question,details,quest,tokenized,quest_len,words
0,1262197,i upgraded to my account but form is still sho...,i used jot form from to and then we...,i upgraded to my account but form is still sho...,"['upgrad', 'account', 'form', 'still', 'show',...",29,"upgrad,account,form,still,show,quota,error,mes..."
1,180811,why does captcha not work first time,whenever i go to my website on which i have a...,why does captcha not work first time whenev...,"['captcha', 'work', 'first', 'time', 'whenev',...",25,"captcha,work,first,time,whenev,go,websit,form,..."
2,147439,is it possible to fix this formatting issue,hi we have successfuly created this logo ...,is it possible to fix this formatting issue ...,"['possibl', 'fix', 'format', 'issu', 'successf...",20,"possibl,fix,format,issu,successfuli,creat,logo..."
