In [5]:
import pandas as pd
import numpy as np
import nltk
import re
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def load_data(path):
    train = pd.read_csv(path+'/train.csv')
    test = pd.read_csv(path+'/test.csv')
    y = train['is_duplicate']
    return train, test


def fill_missing_values(train, test):
    # Check for any null values
    print(train.isnull().sum())
    print(test.isnull().sum())
    
    # We find 2 null values in train and test both
    # Replace them with an 'empty' string
    train = train.fillna('empty')
    test = test.fillna('empty')
    return train, test

    
def clean_text(text, remove_stopwords=True, stemming=False):
    
    
     # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"what's", "", text)
    text = re.sub(r"What's", "", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"I'm", "I am", text)
    text = re.sub(r" m ", " am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"60k", " 60000 ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e-mail", "email", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"quikly", "quickly", text)
    text = re.sub(r" usa ", " America ", text)
    text = re.sub(r" USA ", " America ", text)
    text = re.sub(r" u s ", " America ", text)
    text = re.sub(r" uk ", " England ", text)
    text = re.sub(r" UK ", " England ", text)
    text = re.sub(r"india", "India", text)
    text = re.sub(r"switzerland", "Switzerland", text)
    text = re.sub(r"china", "China", text)
    text = re.sub(r"chinese", "Chinese", text) 
    text = re.sub(r"imrovement", "improvement", text)
    text = re.sub(r"intially", "initially", text)
    text = re.sub(r"quora", "Quora", text)
    text = re.sub(r" dms ", "direct messages ", text)  
    text = re.sub(r"demonitization", "demonetization", text) 
    text = re.sub(r"actived", "active", text)
    text = re.sub(r"kms", " kilometers ", text)
    text = re.sub(r"KMs", " kilometers ", text)
    text = re.sub(r" cs ", " computer science ", text) 
    text = re.sub(r" upvotes ", " up votes ", text)
    text = re.sub(r" iPhone ", " phone ", text)
    text = re.sub(r"\0rs ", " rs ", text) 
    text = re.sub(r"calender", "calendar", text)
    text = re.sub(r"ios", "operating system", text)
    text = re.sub(r"gps", "GPS", text)
    text = re.sub(r"gst", "GST", text)
    text = re.sub(r"programing", "programming", text)
    text = re.sub(r"bestfriend", "best friend", text)
    text = re.sub(r"dna", "DNA", text)
    text = re.sub(r"III", "3", text) 
    text = re.sub(r"the US", "America", text)
    text = re.sub(r"Astrology", "astrology", text)
    text = re.sub(r"Method", "method", text)
    text = re.sub(r"Find", "find", text) 
    text = re.sub(r"banglore", "Banglore", text)
    text = re.sub(r" J K ", " JK ", text)
    
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Remove stop words
    if remove_stopwords:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Shorten words to their stems
    if stemming:
        text = text.split()
        stemmer = PorterStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)


train, test = load_data('../data')
train, test = fill_missing_values(train, test)

print(train.isnull().sum())
print(test.isnull().sum())

id              0
qid1            0
qid2            0
question1       0
question2       2
is_duplicate    0
dtype: int64
test_id      0
question1    2
question2    4
dtype: int64
id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64
test_id      0
question1    0
question2    0
dtype: int64


In [7]:
stop_words = ['the','a','an','and','but','if','or','because','as','what','which','this','that','these','those','then',
              'just','so','than','such','both','through','about','for','is','of','while','during','to','What','Which',
              'Is','If','While','This']

In [8]:
def process_questions(question_list, questions, question_list_name, dataframe):
    '''transform questions and display progress'''
    for idx, question in enumerate(questions):
        question_list.append(clean_text(question, remove_stopwords=True, stemming=True))
        if len(question_list) % 100000 == 0:
            progress = len(question_list)/len(dataframe) * 100
            print("{} is {}% complete.".format(question_list_name, round(progress, 1)))            

In [9]:
train_question2 = []
process_questions(train_question2, train.question2, 'train_question2', train)

train_question2 is 0.0% complete.
train_question2 is 0.0% complete.
train_question2 is 0.0% complete.
train_question2 is 0.0% complete.


In [10]:
train_question1 = []
process_questions(train_question1, train.question1, 'train_question1', train)

train_question1 is 0.0% complete.
train_question1 is 0.0% complete.
train_question1 is 0.0% complete.
train_question1 is 0.0% complete.


In [11]:
test_question1 = []
process_questions(test_question1, test.question1, 'test_question1', test)

test_question2 = []
process_questions(test_question2, test.question2, 'test_question2', test)

test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question1 is 0.0% complete.
test_question2 is 0.0% complete.
test_question2 is 0.0% complete.
test_question2 is 0.0% complete.
test_question2 is 0.0% complete.
test_question2 is 0.0% complete.
test_question2 is 0.0% complete.
test_question2 is 0.0% complete.
test_quest

In [15]:
# append these clean questions back to the original dataframes

train_q1_clean = pd.Series(train_question1)
train_q2_clean = pd.Series(train_question2)
test_q1_clean = pd.Series(test_question1)
test_q2_clean = pd.Series(test_question2)

train = pd.concat([train, train_q1_clean], axis = 1)
train = pd.concat([train, train_q2_clean], axis = 1)

test = pd.concat([test, test_q1_clean], axis = 1)
test = pd.concat([test, test_q2_clean], axis = 1)


In [17]:
print train


            id    qid1    qid2  \
0            0       1       2   
1            1       3       4   
2            2       5       6   
3            3       7       8   
4            4       9      10   
5            5      11      12   
6            6      13      14   
7            7      15      16   
8            8      17      18   
9            9      19      20   
10          10      21      22   
11          11      23      24   
12          12      25      26   
13          13      27      28   
14          14      29      30   
15          15      31      32   
16          16      33      34   
17          17      35      36   
18          18      37      38   
19          19      39      40   
20          20      41      42   
21          21      43      44   
22          22      45      46   
23          23      47      48   
24          24      49      50   
25          25      51      52   
26          26      53      54   
27          27      55      56   
28          28