In [1]:
import numpy as np
import pandas as pd
from clean_question import Clean_Question
import os
import numpy as np
import pandas as pd
import string
import json
from sklearn.metrics import accuracy_score 
from keras.layers import Embedding
from keras.models import *
from keras.layers import *
from keras.optimizers import SGD
from keras.layers import Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.metrics import f1_score
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']

In [2]:
# Loading Embeddings

embeddings_index = {}
f = open(r'embeddings/glove.840B.300d/glove.840B.300d.txt', encoding='utf8')
for line in f:
    values = line.split()
    word = ''.join(values[:-300])
    coefs = np.array([np.asarray(values[-300:], dtype='float32')])
    embeddings_index[word] = coefs
f.close()

In [3]:
# Load train data, split train data, create a new balanced training data set.

df = pd.read_csv('train.csv')
df_1 = df.loc[df['target'] == 1]
df_0 = df.loc[df['target'] == 0]
frames = [df_1, df_0]
result = pd.concat(frames)
df_balanced = result.sample(frac=1).reset_index(drop=True)
data = df_balanced['question_text'].values
labels = df_balanced['target'].values

In [4]:
# Cleaning data

clean_question = Clean_Question()
df_balanced['question_text'] = clean_question.clean_df (df_balanced["question_text"])
df_balanced.head()

Unnamed: 0,qid,question_text,target
0,4f5bd06fc5c0b6e52f99,why would you look shit up,0
1,7227f2a5d270bce15840,how do i create a whatsapp link,0
2,79224d84e10235f30d9c,what can or cannot be considered an intellectu...,0
3,57c9e9042a06f5659c42,what s the difference between meaning of life...,0
4,653ef5a73f46c6f982a0,how is the usa exporting it s misogyny to the ...,1


In [5]:
# Creating a tokenizer

MAX_NB_WORDS = 75000
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
word_index = tokenizer.word_index



In [6]:
# Split into test, train, validation.

MAX_SEQUENCE_LENGTH = 80
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
VALIDATION_SPLIT = 0.2
TEST_SPLIT = 0.2
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
nb_test_samples = int(TEST_SPLIT * data.shape[0])

x_val = data[:nb_validation_samples]
y_val = labels[:nb_validation_samples]
x_test = data[nb_validation_samples:nb_validation_samples+nb_test_samples]
y_test = labels[nb_validation_samples:nb_validation_samples+nb_test_samples]
x_train = data[nb_validation_samples+nb_test_samples:]
y_train = labels[nb_validation_samples+nb_test_samples:]

In [7]:
# Creating an embedding matrix

EMBEDDING_DIM = 300
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [8]:
# Creating a NN and fitting with train data.

def create_conv_model():
    model_conv = Sequential()
    model_conv.add(Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,                            
                            trainable=False))
    model_conv.add(Dropout(0.2))
    model_conv.add(Bidirectional(LSTM(100))) 
    model_conv.add(Dense(16, activation="relu"))
    model_conv.add(Dropout(0.1))
    model_conv.add(Dense(1, activation='sigmoid'))
    model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_conv

class_weight = {0: 1.,
                1: float(len(df_1)) / len(df_0)}

model_conv = create_conv_model()
model_conv.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=2, batch_size=512, class_weight = class_weight)

Train on 783674 samples, validate on 261224 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fa8a4025da0>

In [9]:
# Testing our model.

threshold = 0.9
preds = model_conv.predict(x_test)
y_pred = []
for i in preds:
    if i > threshold:
        y_pred.append(1)
    else:
        y_pred.append(0)
        
macro_f1 = f1_score(y_test, y_pred, average='macro')     
micro_f1 = f1_score(y_test, y_pred, average='micro')  
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average=None)
accuracy = accuracy_score(y_test, y_pred)
print("Macro = %f, Micro = %f, Weighted = %f, F1 = %f, Accuracy = %f" % (macro_f1, micro_f1, weighted_f1, f1[0], accuracy))

Macro = 0.805915, Micro = 0.956141, Weighted = 0.955572, F1 = 0.976668, Accuracy = 0.956141


In [10]:
# Predicting classes for unclassified data.

df_test = pd.read_csv('test.csv')
df_test['question_text'] = clean_question.clean_df (df_test["question_text"])
y_pred_kaggle = []
test_questions = df_test['question_text'].values
sequences_pred_kaggle = tokenizer.texts_to_sequences(test_questions)
data_pred_kaggle = pad_sequences(sequences_pred_kaggle, maxlen=MAX_SEQUENCE_LENGTH)
kaggle_probabilities = model_conv.predict(data_pred_kaggle)

for i in kaggle_probabilities:
    if i > threshold:
        y_pred_kaggle.append(1)
    else:
        y_pred_kaggle.append(0)
        
df_test = df_test.drop(['question_text'], axis=1)
df_test['prediction'] = y_pred_kaggle
df_test.head()
export_csv = df_test.to_csv (r'submission.csv', index = None, header=True)