## Code for cleaning text

In [None]:
##Reference: https://www.kaggle.com/currie32/the-importance-of-cleaning-text
##Reference: https://www.kaggle.com/life2short/data-processing-replace-abbreviation-of-word

In [1]:
import numpy as np
import pandas as pd
import nltk
import re

from collections import defaultdict
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer,WordNetLemmatizer
from string import punctuation
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

## Splitting data into train and test sets and saving them in separate files

In [2]:
# Load the raw question pairs; every missing value becomes the literal string "empty".
df = pd.read_csv("./Dataset/train.csv").fillna('empty')

In [18]:
# Shared resources for the text-cleaning step.
stop_words = set(stopwords.words('english'))

# Maps the first letter of a POS tag to the part-of-speech code handed to the
# lemmatizer; any other first letter falls back to "n".
tag_map = defaultdict(lambda: "n", J="a", V="v", R="r")

lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

# Ordered (regex pattern, replacement) pairs that are applied one after another
# with re.sub. Order matters: each rule sees the output of the rules before it,
# and the last rule turns every remaining non-alphanumeric character into a space.
#
# Whole-word rules are anchored with \b so they cannot fire inside longer words
# (e.g. "hes" in "clothes", "cant" in "significant", "ios" in "curious",
# "tution" in "institution", "1st" in "21st").
# Rules whose pattern is delimited by spaces put the spaces back in the
# replacement so neighbouring words are not glued together.
replacements = [
    # Irregular negations: must run before the generic n't rule, which would
    # otherwise produce "ca not" / "wo not" / "sha not".
    (r"\b([Cc])an't\b", r"\1an not"), (r"\b([Ww])on't\b", r"\1ill not"),
    (r"\b([Ss])han't\b", r"\1hall not"),
    # Contractions written with an apostrophe.
    (r"\b([A-Za-z]+)'s\b", r"\1 is"), (r"\b([A-Za-z]+)'re\b", r"\1 are"),
    (r"\b([A-Za-z]+)'ve\b", r"\1 have"), (r"\b([A-Za-z]+)'ll\b", r"\1 will"),
    (r"\b([A-Za-z]+)n't\b", r"\1 not"),
    # Contractions typed without the apostrophe (whole words only).
    (r"\bwhats\b", "what is"), (r"\bwhos\b", "who is"), (r"\bwheres\b", "where is"),
    (r"\bwhens\b", "when is"), (r"\bhows\b", "how is"), (" im ", " i am "),
    (r"\bhes\b", "he is"), (r"\bshes\b", "she is"), (r"\bthats\b", "that is"),
    (r"\btheres\b", "there is"), (r"\bisnt\b", "is not"), (r"\bwasnt\b", "was not"),
    (r"\barent\b", "are not"), (r"\bwerent\b", "were not"), (r"\bcant\b", "can not"),
    (r"\bcannot\b", "can not"), (r"\bcouldnt\b", "could not"), (r"\bdont\b", "do not"),
    (r"\bdidnt\b", "did not"), (r"\bshouldnt\b", "should not"), (r"\bwouldnt\b", "would not"),
    (r"\bdoesnt\b", "does not"), (r"\bhavent\b", "have not"), (r"\bhasnt\b", "has not"),
    (r"\bhadnt\b", "had not"),
    (r"\s+", " "),  # replace multi space with one single space
    # Spelling / vocabulary normalisation.
    (" J K ", " JK "), ("banglore", "Banglore"), ("bangalore", "Banglore"), ("bengaluru", "Banglore"),
    ("Find", "find"), ("Method", "method"), ("Astrology", "astrology"),
    ("bestfriend", "best friend"), (" bf ", " boy friend "), (" gf ", " girl friend "),
    ("boyfriend", " boy friend "), ("girlfriend", "girl friend"),
    ("programing", "programming"), ("calender", "calendar"), ("intially", "initially"), ("quikly", "quickly"),
    ("imrovement", "improvement"), ("demonitization", "demonetization"), (" dms ", " direct messages "),
    ("upvote", "up vote"), (" downvotes ", " down votes "),
    (r"\bios\b", "operating system"), (" iPhone ", " phone "), (" iphone ", " phone "), (" i phone ", " phone "),
    (" cs ", " computer science "), (" cse ", " computer science "), (" CS ", " computer science "),
    (" CSE ", " computer science "),
    ("KMs", " kilometers "), ("kms", " kilometers "), ("actived", "active"),
    (" UK ", " England "), (" uk ", " England "), (" u s ", " America "), (" USA ", " America "),
    (" US ", " America "), (r"\bthe US\b", "America"), (" usa ", " America "),
    ("e-mail", "email"), (" 9 11 ", " 911 "), (" b g ", " bg "), (r"\b60k\b", " 60000"),
    ('₹', ' rupee '), (' txt ', " text "), (" OS ", " operating system "), ("Wi-Fi", "wifi"),
    ("cgpa", "gpa"), ("watsapp", "whatsapp"), (r"\btution", "tuition"),
    # Roman numerals, ordinals and small numbers spelled out.
    (" II ", " two "), (" III ", " three "), (" V ", " five "),
    (r"\b1st\b", " one "), (r"\b2nd\b", " two "), (r"\b3rd\b", " three "), (r"\b4th\b", " four "),
    (" 10th ", " ten "), (" 12th ", " twelve "),
    (" 00 ", " 0 "), (" 000 ", " 0 "), (" 0000 ", " 0 "), (" 0 ", " zero "),
    (" 1 ", " one "), (" 01 ", " one "), (" 2 ", " two "), (" 3 ", " three "), (" 4 ", " four "),
    (" 10 ", " ten "), (" 20 ", " twenty "), (" 50 ", " fifty "), (" 100 ", " hundred "), (" 1000 ", " thousand "),
    # NOTE(review): the earlier patterns r"\0rs " and r"\0s" used the regex
    # escape for the NUL character and therefore never matched real text.
    # A literal zero is assumed to be the intent (confirm): split amounts such
    # as "500rs" into "500 rs" and drop the plural "s" from e.g. "1990s".
    (r"0rs ", "0 rs "), (r"\'s", " "), (r"\'ve", " have "), (r"\'d", " would "), (r"\'ll", " will "),
    (r"0s\b", "0"), (r"\s{2,}", " "), (r"[^A-Za-z0-9]", " ")
]

In [7]:
def clean_text(text, to_lowercase=True, remove_stop_words=False, lemmatize=True, stem_words=False):
    """Normalise one piece of text and return it as a single space-joined string.

    Steps, in order: apply every (pattern, replacement) pair from the
    module-level ``replacements`` list, drop characters found in
    ``string.punctuation``, optionally lowercase, split on whitespace,
    optionally lemmatize, optionally drop stop words, optionally stem.

    Args:
        text: The string to clean.
        to_lowercase: Lowercase the text before tokenising.
        remove_stop_words: Drop tokens present in the module-level ``stop_words``.
        lemmatize: Lemmatize each token, using the first letter of its POS tag
            to pick the part of speech via ``tag_map``.
        stem_words: Reduce each token to its stem with ``stemmer``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Substitutions run sequentially, so each one sees the previous result.
    for pattern, substitute in replacements:
        text = re.sub(pattern, substitute, text)

    # Delete every punctuation character in one pass.
    text = text.translate(str.maketrans('', '', punctuation))

    if to_lowercase:
        text = text.lower()

    tokens = text.split()

    if lemmatize:
        lemmas = []
        for word, tag in pos_tag(tokens):
            lemmas.append(lemmatizer.lemmatize(word, tag_map[tag[0]]))
        tokens = lemmas

    if remove_stop_words:
        tokens = [token for token in tokens if token not in stop_words]

    if stem_words:
        tokens = list(map(stemmer.stem, tokens))

    return " ".join(tokens)

## clean_text("He swam and ran stupidly before being caught",True,True,True,False)

In [214]:
## Slow cell: cleaning both question columns takes close to an hour
for column in ('question1', 'question2'):
    df[column] = df[column].apply(
        lambda question: clean_text(
            question,
            to_lowercase=True,
            remove_stop_words=True,
            lemmatize=True,
            stem_words=False,
        )
    )

In [None]:
# Spot-check: run clean_text on 20 consecutive question pairs starting at row `a`
# and print each pair followed by a blank line.
a = 270000
for i in range(a, a + 20):
    for column in ('question1', 'question2'):
        print(clean_text(df[column][i], True, True, True, False))
    print()

In [None]:
# Hold out 20% of the question pairs as a test set. The split is stratified on
# the label and uses a fixed seed so it is reproducible.
X, y = df[['question1', 'question2']], df['is_duplicate']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=777, stratify=y
)

# assign() builds a new frame for export (labels are aligned on the index)
# instead of writing the label column into the frames returned by
# train_test_split. That avoids SettingWithCopyWarning and keeps
# X_train / X_test free of the target column.
X_train.assign(is_duplicate=y_train).to_csv('train.csv', index=False)
X_test.assign(is_duplicate=y_test).to_csv('test.csv', index=False)

In [3]:
# Reload the persisted splits.
# keep_default_na=False stops pandas from converting questions that were cleaned
# down to an empty string (or to a token such as "nan" / "null") into float NaN,
# so both question columns stay plain strings.
df_train = pd.read_csv('train.csv', encoding="ISO-8859-1", keep_default_na=False)
df_test = pd.read_csv('test.csv', encoding="ISO-8859-1", keep_default_na=False)
X_train, y_train = df_train[['question1', 'question2']], df_train['is_duplicate']
X_test, y_test = df_test[['question1', 'question2']], df_test['is_duplicate']

In [27]:
# (rows, columns) of the test feature frame.
X_test.shape

(80858, 2)

In [28]:
# Label distribution in the training split; dropna=False also counts missing labels.
y_train.value_counts(dropna=False)

0    204022
1    119410
Name: is_duplicate, dtype: int64

In [29]:
# Label distribution in the test split; dropna=False also counts missing labels.
y_test.value_counts(dropna=False)

0    51005
1    29853
Name: is_duplicate, dtype: int64