<a href="https://colab.research.google.com/github/ahmettalhasen/Text-Similarity-LDA/blob/master/projectSupportCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%matplotlib inline
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import gensim
from gensim.models import LdaModel
from gensim import models, corpora, similarities
import re
from nltk.stem.porter import PorterStemmer
import time
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer 
from scipy.stats import entropy
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
nltk.download('punkt')
nltk.download('stopwords')

!pip install pyspellchecker # For correcting the spell mistakes
from spellchecker import SpellChecker
!pip install langdetect
from langdetect import detect 
from langdetect.lang_detect_exception import LangDetectException

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

# Importing the dataset
df = pd.read_csv('gdrive/My Drive/summer2019/support_forum_questions.csv',sep="|")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
#Replacing Nans in 'tags' with 
# df.loc[df.tags.isnull()]['tags'] = ' '

In [0]:
df.dropna(axis = 0, inplace = True)
df.isnull().sum()

df = df.sample(frac=1.0)
df = df.reset_index(drop=True)

df = df.drop('login', axis=1)
df = df.drop('added', axis=1)

In [0]:
import re

#Clearing Html function
def clearhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', raw_html)
    cleantext = re.sub('\n', ' ', cleantext)
    cleantext = re.sub('\r', ' ', cleantext)
    cleantext = re.sub('&nbsp', ' ', cleantext)
    return cleantext

def initial_clean(text):
    """
    Function that cleans emails, websites and any symbols/punctuations
    """
    text = re.sub("((\S+)?(http(s)?)(\S+))|((\S+)?(www)(\S+))|((\S+)?(\@)(\S+)?)", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"jotform", " form ", text)
    text = re.sub("[^a-zA-Z ]", " ", text)
    text = text.lower() 
    return text

def first_preprocess(text):
    """
    Function that applies clearhtml and initial_clean
    """
    return initial_clean(clearhtml(text))

In [0]:
#Preprocessing
t1 = time.time()
df['question'] = df['question'].apply(first_preprocess)
df['details'] = df['details'].apply(first_preprocess)
df['quest'] = df['question'] + ' ' + df['details']
#Deleting the questions less than 30 character size --> They are all test entries or spams
df = df[df.quest.str.len() > 30 ]
t2 = time.time()
print("Time to clean Html", len(df), "articles:", (t2-t1)/60, "min")

Time to clean Html 227526 articles: 28.61263000567754 min


In [0]:
#Filtering the non-english questions out

def filter_language(text):
    """
    Function that applies all three functions abov
    """
    try:
        return detect(text)
    except LangDetectException:
        pass
      
t3 = time.time()     
df = df[df.quest.apply(filter_language) == 'en']
t4 = time.time()
print("Time to filtering non-english questions took ", (t4-t3)/60, "min")

Time to filtering non-english questions took  21.599160651365917 min


In [0]:
### Getting text tokenized and ready to process ###

def tokenizer(text):
    text = nltk.word_tokenize(text)
    return text  
  
stop_words = stopwords.words('english')
def remove_stop_words_and_junk(text):
    """
    Function that removes all stopwords and undesired ones from text
    """
    #Deleting undesired words
    undesiredWords = ['would', 'hi', 'hello', 'thank', 'ive', 'havent', 'hasnt', 
                  'hadnt', 'arent', 'isnt', 'wouldnt', 'dont', 'werent', 
                  'couldnt', 'wont', 'cant', 'didnt', "doesnt", 'without',
                  'please','thanks', 'could']
    undesiredWords = set(undesiredWords)
    
    return [word for word in text if word not in stop_words and word not in undesiredWords]

def second_preprocess(text):
    """
    Function that tokenizes and removes stop words and junk some words
    """
    return remove_stop_words_and_junk(tokenizer(text))

In [0]:
t5 = time.time()
df['tokenized'] = df['quest'].apply(second_preprocess)
t6 = time.time()
print("Time to tokenize and perfom the removals for", len(df), "questions took ", (t6-t5)/60, "min")

In [0]:
words = [word for row in list(df.tokenized) for word in row]
freqDist = FreqDist(words)

In [0]:
spell = SpellChecker() 
def spelling_mistake_corrector(word):
    """
    Function that corrects the spelling mistake.
    Corrects if the number of occurences of the correct form is greater than
    the number of occurences of the original form in order to prevent miscorrection
    of some words.
    """
    checkedWord = spell.correction(word)
    if freqDist[checkedWord] >= freqDist[word]:
        word = checkedWord
    return word
  
def correctorForAll(text):
    """
    Function that applies spelling_mistake_corrector to all words
    """
    text = [spelling_mistake_corrector(word) for word in text]
    return text

#Option 1
stemmer = PorterStemmer()
def stem_words(text):
    """
    Function to stem words, so all forms of a word is treated in the same way 
    """
    try:
        text = [stemmer.stem(word) for word in text]
        text = [word for word in text if len(word) > 2] #filtering 1 and 2 letter words out
    except IndexError:
        pass
    return text

#Option2
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    """
    Function to lemmatize words.
    """
    text = [lemmatizer.lemmatize(word) for word in text]
    text = [word for word in text if len(word) > 2] #filtering 1 and 2 letter words out
    return text

def apply_corrector_and_lemmatizer(text):
    """
    This function applies all the functions above 
    """ 
    return lemmatize_words(correctorForAll(text))

In [0]:
t7 = time.time()
df['tokenized'] = df['tokenized'].apply(stem_words)
t8 = time.time()
print("Time to stem words for ", len(df), " questions took", (t8-t7)/60, "min")

Time to correct and lemmatize for  201609  questions took 2.9812321186065676 min


In [0]:
t9 = time.time()
df['quest_len'] = df['tokenized'].apply(lambda x: len(x))
MIN_TOKEN_NUMBER = 9
df = df[df['quest_len'] > MIN_TOKEN_NUMBER]
t10 = time.time()
print("Time to drop the questions with few words took ", (t10-t9)/60, "min")

Time to drop the questions with few words took  0.003864320119222005 min


In [0]:
df = df.reset_index(drop=True)

In [0]:
def csv_formater(line):
    strr = ""
    for word in line:
        strr = strr + word + ","
    return strr

In [0]:
t11 = time.time()
df['words'] = df['tokenized'].apply(csv_formater)
t12 = time.time()
print("Time to make the file ready to store took ", (t12-t11)/60, "min")

Time to make the file ready to store took  0.01765124797821045 min


In [0]:
df

Unnamed: 0,id,question,details,quest,tokenized,quest_len,words
0,679412,previously viewed folder keeps opening or bein...,hi the my forms view keeps reverting to an...,previously viewed folder keeps opening or bein...,"[previous, view, folder, keep, open, rememb, e...",43,"previous,view,folder,keep,open,rememb,even,nav..."
1,1236066,notification e mails not sent when responses a...,i just sent out a jot form to class of an...,notification e mails not sent when responses a...,"[notif, mail, sent, respons, edit, sent, jot, ...",40,"notif,mail,sent,respons,edit,sent,jot,form,cla..."
2,1577426,account is out of space,hello we received a message saying that our...,account is out of space hello we received a...,"[account, space, receiv, messag, say, account,...",15,"account,space,receiv,messag,say,account,space,..."
3,1786109,unable to open the pictures from the email on ...,i have a couple of questions regarding a form...,unable to open the pictures from the email on ...,"[unabl, open, pictur, email, phone, coupl, que...",42,"unabl,open,pictur,email,phone,coupl,question,r..."
4,207876,hint text in large text boxes gets incorrectly...,hello it seems that if i put example text ...,hint text in large text boxes gets incorrectly...,"[hint, text, larg, text, box, get, incorrectli...",39,"hint,text,larg,text,box,get,incorrectli,save,u..."
5,300290,my account is suspended,hello please unlock me my login is ltime...,my account is suspended hello please unl...,"[account, suspend, unlock, login, ltime, said,...",25,"account,suspend,unlock,login,ltime,said,fish,w..."
6,82591,submissions,if i am on the free account and i reach s...,submissions if i am on the free account a...,"[submiss, free, account, reach, submiss, old, ...",15,"submiss,free,account,reach,submiss,old,submiss..."
7,1662804,i contacted your company before to please canc...,i believe i contacted your company before to ...,i contacted your company before to please canc...,"[contact, compani, cancel, account, refund, pa...",18,"contact,compani,cancel,account,refund,payment,..."
8,1282259,why after i edit the payment using payapl t...,it was working before the edit all i have ...,why after i edit the payment using payapl t...,"[edit, payment, use, payapl, save, form, work,...",36,"edit,payment,use,payapl,save,form,work,work,ed..."
9,1376107,clone version has a different form layout,hi i am trying to create a clone of a prio...,clone version has a different form layout hi...,"[clone, version, differ, form, layout, tri, cr...",29,"clone,version,differ,form,layout,tri,creat,clo..."


In [0]:
words = [word for row in list(df.tokenized) for word in row]
freqDist = FreqDist(words)

In [0]:
df

Unnamed: 0,id,question,details,quest,tokenized,quest_len,words
0,679412,previously viewed folder keeps opening or bein...,hi the my forms view keeps reverting to an...,previously viewed folder keeps opening or bein...,"[previous, view, folder, keep, open, rememb, e...",43,"previous,view,folder,keep,open,rememb,even,nav..."
1,1236066,notification e mails not sent when responses a...,i just sent out a jot form to class of an...,notification e mails not sent when responses a...,"[notif, mail, sent, respons, edit, sent, jot, ...",40,"notif,mail,sent,respons,edit,sent,jot,form,cla..."
2,1577426,account is out of space,hello we received a message saying that our...,account is out of space hello we received a...,"[account, space, receiv, messag, say, account,...",15,"account,space,receiv,messag,say,account,space,..."
3,1786109,unable to open the pictures from the email on ...,i have a couple of questions regarding a form...,unable to open the pictures from the email on ...,"[unabl, open, pictur, email, phone, coupl, que...",42,"unabl,open,pictur,email,phone,coupl,question,r..."
4,207876,hint text in large text boxes gets incorrectly...,hello it seems that if i put example text ...,hint text in large text boxes gets incorrectly...,"[hint, text, larg, text, box, get, incorrectli...",39,"hint,text,larg,text,box,get,incorrectli,save,u..."
5,300290,my account is suspended,hello please unlock me my login is ltime...,my account is suspended hello please unl...,"[account, suspend, unlock, login, ltime, said,...",25,"account,suspend,unlock,login,ltime,said,fish,w..."
6,82591,submissions,if i am on the free account and i reach s...,submissions if i am on the free account a...,"[submiss, free, account, reach, submiss, old, ...",15,"submiss,free,account,reach,submiss,old,submiss..."
7,1662804,i contacted your company before to please canc...,i believe i contacted your company before to ...,i contacted your company before to please canc...,"[contact, compani, cancel, account, refund, pa...",18,"contact,compani,cancel,account,refund,payment,..."
8,1282259,why after i edit the payment using payapl t...,it was working before the edit all i have ...,why after i edit the payment using payapl t...,"[edit, payment, use, payapl, save, form, work,...",36,"edit,payment,use,payapl,save,form,work,work,ed..."
9,1376107,clone version has a different form layout,hi i am trying to create a clone of a prio...,clone version has a different form layout hi...,"[clone, version, differ, form, layout, tri, cr...",29,"clone,version,differ,form,layout,tri,creat,clo..."


In [0]:
freqDist['contac']

In [0]:
## Saving the latest verison as csv for practical use
export_csv = df.to_csv ('gdrive/My Drive/summer2019/cleanData.csv', index = None, header=True)

In [0]:
d = pd.read_csv('gdrive/My Drive/summer2019/cleanData.csv')


In [0]:
d

Unnamed: 0,id,question,details,quest,tokenized,quest_len,words
0,679412,previously viewed folder keeps opening or bein...,hi the my forms view keeps reverting to an...,previously viewed folder keeps opening or bein...,"['previous', 'view', 'folder', 'keep', 'open',...",43,"previous,view,folder,keep,open,rememb,even,nav..."
1,1236066,notification e mails not sent when responses a...,i just sent out a jot form to class of an...,notification e mails not sent when responses a...,"['notif', 'mail', 'sent', 'respons', 'edit', '...",40,"notif,mail,sent,respons,edit,sent,jot,form,cla..."
2,1577426,account is out of space,hello we received a message saying that our...,account is out of space hello we received a...,"['account', 'space', 'receiv', 'messag', 'say'...",15,"account,space,receiv,messag,say,account,space,..."
3,1786109,unable to open the pictures from the email on ...,i have a couple of questions regarding a form...,unable to open the pictures from the email on ...,"['unabl', 'open', 'pictur', 'email', 'phone', ...",42,"unabl,open,pictur,email,phone,coupl,question,r..."
4,207876,hint text in large text boxes gets incorrectly...,hello it seems that if i put example text ...,hint text in large text boxes gets incorrectly...,"['hint', 'text', 'larg', 'text', 'box', 'get',...",39,"hint,text,larg,text,box,get,incorrectli,save,u..."
5,300290,my account is suspended,hello please unlock me my login is ltime...,my account is suspended hello please unl...,"['account', 'suspend', 'unlock', 'login', 'lti...",25,"account,suspend,unlock,login,ltime,said,fish,w..."
6,82591,submissions,if i am on the free account and i reach s...,submissions if i am on the free account a...,"['submiss', 'free', 'account', 'reach', 'submi...",15,"submiss,free,account,reach,submiss,old,submiss..."
7,1662804,i contacted your company before to please canc...,i believe i contacted your company before to ...,i contacted your company before to please canc...,"['contact', 'compani', 'cancel', 'account', 'r...",18,"contact,compani,cancel,account,refund,payment,..."
8,1282259,why after i edit the payment using payapl t...,it was working before the edit all i have ...,why after i edit the payment using payapl t...,"['edit', 'payment', 'use', 'payapl', 'save', '...",36,"edit,payment,use,payapl,save,form,work,work,ed..."
9,1376107,clone version has a different form layout,hi i am trying to create a clone of a prio...,clone version has a different form layout hi...,"['clone', 'version', 'differ', 'form', 'layout...",29,"clone,version,differ,form,layout,tri,creat,clo..."
