In [1]:
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

In [2]:
# Fetch the NLTK stop-word corpus; `stopwords.words("english")` is read
# from it when the text-cleaning function removes stop words.
import nltk
nltk.download("stopwords")

# Input CSV files, given relative to the notebook's working directory.
TRAIN_DATA_FILE = 'train.csv'
TEST_DATA_FILE = 'test.csv'

# Sequence-length / vocabulary-size limits (going by their names).
# NOTE(review): no cell visible in this notebook reads these two globals.
MAX_SEQUENCE_LENGTH = 100
MAX_NB_WORDS = 200000

In [3]:
def pad_sequences(sequences, MAX_SEQUENCE_LENGTH = None, value = 0):
    """Bring every sequence to one common length and stack them in an array.

    Short sequences are padded on the right with ``value``; sequences longer
    than the target length are cut down to their first ``MAX_SEQUENCE_LENGTH``
    items so the result is always rectangular. The input sequences are never
    modified (the result is built from copies).

    Parameters
    ----------
    sequences : iterable of sliceable sequences (e.g. list of lists)
        The sequences to pad.
    MAX_SEQUENCE_LENGTH : int, optional
        Target length. When ``None`` the length of the longest input sequence
        is used (0 if there are no sequences at all).
    value : object, optional
        Filler appended to short sequences. Defaults to 0.

    Returns
    -------
    numpy.ndarray
        ``np.asarray`` of the padded rows, one row per input sequence.
    """
    # Materialise once so that generators can be measured and then re-read.
    sequences = list(sequences)
    if MAX_SEQUENCE_LENGTH is None:
        # default=0 keeps an empty batch from raising ValueError in max().
        MAX_SEQUENCE_LENGTH = max(map(len, sequences), default=0)
    padded = [
        # A negative repeat count yields [], so over-long rows get no filler.
        list(sequence[:MAX_SEQUENCE_LENGTH])
        + [value] * (MAX_SEQUENCE_LENGTH - len(sequence))
        for sequence in sequences
    ]
    return np.asarray(padded)

# Ordered (pattern, replacement) pairs applied one after another by
# text_to_wordlist. The order matters: later rules see the output of
# earlier ones.
# NOTE(review): inside the first character class, `+-=` is a *range* from '+'
# to '=', so ':', ';' and '<' survive the initial filter as well. Left
# unchanged because altering it would change the resulting vocabulary.
# NOTE(review): in Python's `re`, r"\0s" means a NUL character followed by
# 's'. NUL is already replaced by the first rule, so that rule can never fire;
# the intent was presumably a literal "0s" -- confirm before changing.
_TEXT_CLEANUP_RULES = (
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    (r"\0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    (r"\s{2,}", " "),
)

# Lazily filled cache for the stop-word set and the stemmer, so that they are
# built once per kernel session instead of once per call.
_TEXT_TOOLS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if "stops" not in _TEXT_TOOLS_CACHE:
        _TEXT_TOOLS_CACHE["stops"] = set(stopwords.words("english"))
    return _TEXT_TOOLS_CACHE["stops"]


def _english_stemmer():
    """Return a shared English SnowballStemmer, created on first use only."""
    if "stemmer" not in _TEXT_TOOLS_CACHE:
        _TEXT_TOOLS_CACHE["stemmer"] = SnowballStemmer('english')
    return _TEXT_TOOLS_CACHE["stemmer"]


def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    """Clean one piece of text and return it as a single string.

    Steps, in order:
      1. lower-case the text and split it on whitespace;
      2. optionally drop English stop words (this happens *before* punctuation
         is stripped, so a token such as "the," is not recognised as one);
      3. apply the regex substitutions in ``_TEXT_CLEANUP_RULES``;
      4. optionally stem every whitespace-separated word.

    Parameters
    ----------
    text : str
        Raw input text.
    remove_stopwords : bool, optional
        Drop tokens found in NLTK's English stop-word list.
    stem_words : bool, optional
        Replace each word by its Snowball (English) stem.

    Returns
    -------
    str
        The cleaned text. Despite the function's name this is a string, not
        a list; when ``stem_words`` is False it may end with a single space.
    """
    # Convert words to lower case and split them
    words = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]

    text = " ".join(words)

    # Clean the text
    for pattern, replacement in _TEXT_CLEANUP_RULES:
        text = re.sub(pattern, replacement, text)

    # Optionally, shorten words to their stems
    if stem_words:
        stem = _english_stemmer().stem
        text = " ".join(stem(word) for word in text.split())

    return text

# ---- Training file: cleaned text from columns 3 and 4, int label from column 5 ----
texts_1 = []
texts_2 = []
labels = []
# Use the built-in open() with newline='' as the csv module documents, so that
# line splitting is left entirely to the csv parser (codecs.open is deprecated
# and its stream reader may break lines on additional Unicode separators).
with open(TRAIN_DATA_FILE, encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)  # consume the column-name row
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
print('Found %s texts in train.csv' % len(texts_1))

# ---- Test file: row id from column 0, cleaned text from columns 1 and 2 ----
test_texts_1 = []
test_texts_2 = []
test_ids = []
with open(TEST_DATA_FILE, encoding='utf-8', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)  # consume the column-name row
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])  # kept as the raw string from the file
print('Found %s texts in test.csv' % len(test_texts_1))

Found 404290 texts in train.csv
Found 2345796 texts in test.csv


In [23]:
import itertools
import numpy as np

# Split every cleaned text into its whitespace-separated tokens.
splitted_texts_1 = [text.split() for text in texts_1]
splitted_texts_2 = [text.split() for text in texts_2]
splitted_test_texts_1 = [text.split() for text in test_texts_1]
splitted_test_texts_2 = [text.split() for text in test_texts_2]

# Walk the four corpora lazily instead of concatenating them into one more
# multi-million-element list just to iterate over it once.
all_tokens = itertools.chain.from_iterable(
    itertools.chain(
        splitted_texts_1,
        splitted_texts_2,
        splitted_test_texts_1,
        splitted_test_texts_2,
    )
)
# Sort the vocabulary before numbering it: iterating a bare set of strings
# gives a different order in every kernel session, which would make the ids
# stored in the pickled matrices irreproducible.
# NOTE(review): ids start at 0 and pad_sequences also pads with 0, so the
# first vocabulary entry is indistinguishable from padding. Reserving id 0
# would change every id -- confirm with the consumers of the pickles first.
word_to_id = {token: idx for idx, token in enumerate(sorted(set(all_tokens)))}
print('Found %s unique tokens' % len(word_to_id))

# Convert token lists to token-id lists, e.g. [[1, 2], [2, 2]].
sequences_1 = [[word_to_id[token] for token in sequence] for sequence in splitted_texts_1]
sequences_2 = [[word_to_id[token] for token in sequence] for sequence in splitted_texts_2]
test_sequences_1 = [[word_to_id[token] for token in sequence] for sequence in splitted_test_texts_1]
test_sequences_2 = [[word_to_id[token] for token in sequence] for sequence in splitted_test_texts_2]

# NOTE(review): each matrix is padded to the longest sequence of its own set,
# so their widths differ (the recorded outputs show 60 columns for data_1 and
# 119 for test_data_2); the MAX_SEQUENCE_LENGTH constant is not used here.
data_1 = pad_sequences(sequences_1)
data_2 = pad_sequences(sequences_2)

# One-hot encode the labels: row i gets a 1 in column labels[i].
labels = np.array(labels)
one_hot_labels = np.zeros((len(labels), 2))
one_hot_labels[np.arange(len(labels)), labels] = 1

print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', one_hot_labels.shape)

test_data_1 = pad_sequences(test_sequences_1)
test_data_2 = pad_sequences(test_sequences_2)
test_ids = np.array(test_ids)

Found 94456 unique tokens
Shape of data tensor: (404290, 60)
Shape of label tensor: (404290, 2)


In [33]:
import pickle

# Persist each prepared array to its own pickle file. Every file is written
# inside a `with` block so the handle is flushed and closed deterministically
# instead of being left open until garbage collection.
_arrays_to_save = [
    (r"data_1.pickle", data_1),
    (r"data_2.pickle", data_2),
    (r"one_hot_labels.pickle", one_hot_labels),
    (r"test_data_1.pickle", test_data_1),
    (r"test_data_2.pickle", test_data_2),
    (r"test_ids.pickle", test_ids),
]
for _pickle_path, _array in _arrays_to_save:
    with open(_pickle_path, "wb") as _pickle_file:
        pickle.dump(_array, _pickle_file)

In [37]:
# Shell out to list the pickle written for data_1 and check its size on disk.
!ls -l data_1.pickle

-rw-rw-r-- 1 ubuntu ubuntu 194059363 Apr 26 19:29 data_1.pickle


True

In [26]:
# Show the shape of the array obtained by padding test_sequences_2.
test_data_2.shape

(2345796, 119)

In [32]:
# Longest token list among the first test texts.
max(len(tokens) for tokens in splitted_test_texts_1)

118