In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-pairs/quora_train.csv
/kaggle/input/quora-pairs/quora_test.csv


In [2]:
## import packages

import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score, plot_confusion_matrix

In [3]:
## set directories and parameters

# Path to a word2vec binary. Not referenced by the cells shown here;
# presumably meant for gensim's KeyedVectors -- TODO confirm or remove.
EMBEDDING_FILE = '../input/googles-trained-word2vec-model-in-python/GoogleNews-vectors-negative300.bin'
# CSV files read row by row with csv.reader in the text-processing cell.
TRAIN_DATA_FILE = '../input/quora-pairs/quora_train.csv'
TEST_DATA_FILE = '../input/quora-pairs/quora_test.csv'
# Passed as `maxlen` to pad_sequences: every token sequence is padded or
# truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 30
# Passed as `num_words` to the Keras Tokenizer (vocabulary size cap).
MAX_NB_WORDS = 200000
# Not referenced by the cells shown here; presumably the dimensionality of
# the vectors in EMBEDDING_FILE -- TODO confirm.
EMBEDDING_DIM = 300
# Fraction of the training rows held out as the validation split.
VALIDATION_SPLIT = 0.1


In [None]:
## process texts in datasets
print('Processing text dataset')

# Ordered (pattern, replacement) rules applied by text_to_wordlist.
# Order matters: e.g. "what's" and "can't" must be expanded before the generic
# "'s" / "n't" rules, and the whitespace collapse must run last.
_CLEANUP_RULES = (
    # Replace every character outside the allowed set with a space.
    # NOTE(review): "+-=" inside the class is a *range* (0x2B-0x3D), so
    # ':', ';' and '<' are also kept. Left unchanged to preserve behaviour.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "5k" -> "5000"
    (r"(\d+)(k)", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # "1990s" -> "1990". The previous pattern r"\0s" matched a NUL character
    # followed by "s" (in a regex, \0 is the octal escape for NUL), so it
    # could never fire on cleaned text.
    (r"0s", "0"),
    (r" 9 11 ", "911"),
    (r"e - mail", "email"),
    (r"j k", "jk"),
    # Collapse runs of whitespace introduced by the rules above.
    (r"\s{2,}", " "),
)


def text_to_wordlist(text: str, remove_stopwords: bool = False, stem_words: bool = False) -> str:
    """Clean a raw text string and return it as a single normalised string.

    Despite the name, the result is a space-separated *string*, not a list.

    Steps, in order:
      1. lower-case and split on whitespace;
      2. optionally drop English stop words (done before punctuation is
         stripped, so a token such as "the," is not recognised as a stop word);
      3. apply the regex substitutions in ``_CLEANUP_RULES``;
      4. optionally stem every word with the Snowball English stemmer.

    Args:
        text: the raw text to clean.
        remove_stopwords: if truthy, remove NLTK English stop words.
        stem_words: if truthy, replace each word by its Snowball stem.

    Returns:
        The cleaned text. It may carry a leading/trailing single space
        because whitespace is collapsed but not stripped.
    """
    # Lower-case and split on any whitespace.
    words = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    text = " ".join(words)

    # Clean the text
    for pattern, replacement in _CLEANUP_RULES:
        text = re.sub(pattern, replacement, text)

    # Optionally, shorten words to their stems
    if stem_words:
        stemmer = SnowballStemmer('english')
        text = " ".join(stemmer.stem(word) for word in text.split())

    return text

def _read_question_pairs(path):
    """Read one dataset CSV and return (texts, ids, labels) as parallel lists.

    For every data row, column 0 is kept as the row id, columns 3 and 4 are
    joined into one cleaned text, and column 5 is parsed as an integer label.
    (Presumably columns 3/4 are the two questions and column 5 the duplicate
    flag -- confirm against the CSV header.)
    """
    texts, ids, pair_labels = [], [], []
    # newline='' is what the csv module documents for file objects, so that
    # newlines embedded in quoted fields are parsed correctly.
    with open(path, encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        next(reader)  # skip the header row
        for values in reader:
            # NOTE(review): the previous code called
            # text_to_wordlist(values[3], values[4]), which passed the second
            # text column as the `remove_stopwords` flag, so that column never
            # reached the model. Both columns are now joined into one text;
            # confirm this matches the intended feature.
            texts.append(text_to_wordlist(values[3] + ' ' + values[4]))
            ids.append(values[0])
            pair_labels.append(int(values[5]))
    return texts, ids, pair_labels


texts_3, _train_ids, labels = _read_question_pairs(TRAIN_DATA_FILE)
print('Found %s texts in train.csv' % len(texts_3))

test_texts_3, test_ids, test_labels = _read_question_pairs(TEST_DATA_FILE)
print('Found %s texts in test.csv' % len(test_texts_3))

# The tokenizer is fitted on train and test texts together, so both are
# encoded with the same word index.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_3 + test_texts_3)

sequences_3 = tokenizer.texts_to_sequences(texts_3)
test_sequences_3 = tokenizer.texts_to_sequences(test_texts_3)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH tokens.
data_3 = pad_sequences(sequences_3, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_3.shape)
print('Shape of label tensor:', labels.shape)

test_data_3 = pad_sequences(test_sequences_3, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)
test_labels = np.array(test_labels)

Processing text dataset


In [None]:
## sample train/validation data

# Use a dedicated, seeded generator so the split (and every metric computed
# from it) is reproducible across kernel restarts without touching NumPy's
# global random state.
perm = np.random.RandomState(123).permutation(len(data_3))

# The first (1 - VALIDATION_SPLIT) share of the shuffled indices is the
# training set; the remainder is the validation set.
n_train = int(len(data_3) * (1 - VALIDATION_SPLIT))
idx_train = perm[:n_train]
idx_val = perm[n_train:]

data_3_train = data_3[idx_train]
labels_train = labels[idx_train]

data_3_val = data_3[idx_val]
labels_val = labels[idx_val]


In [None]:
import scipy as sp

# Baseline model: a default-configured logistic regression fitted directly on
# the padded token-id sequences of the training split.
lr = LogisticRegression()
lr.fit(X=data_3_train, y=labels_train)


In [None]:
print('predicting ...')
# Hard class predictions are only appropriate for accuracy.
y_pred = lr.predict(data_3_val)
# log-loss and ROC AUC must be computed from predicted probabilities, not
# from 0/1 class labels: with hard labels the log-loss degenerates to a
# clipped constant per error and the AUC is evaluated at a single threshold.
# Column 1 is the probability of the second class in lr.classes_
# (assumes binary 0/1 labels -- confirm against the label column).
y_prob = lr.predict_proba(data_3_val)[:, 1]

loss = log_loss(labels_val, y_prob)
print('log_loss= {}'.format(loss))
accuracy = accuracy_score(labels_val, y_pred)
print('accuracy= {}'.format(accuracy))
auc = roc_auc_score(labels_val, y_prob)
print('auc  = {}'.format(auc))