In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/embeddings.zip
/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/test.csv


In [3]:
train_data = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/train.csv')
test_data = pd.read_csv('/kaggle/input/quora-insincere-questions-classification/test.csv')

In [4]:
train_data = train_data.sample(frac=1).reset_index(drop=True)
train_data

Unnamed: 0,qid,question_text,target
0,52ab8f18e83f981f9465,"Can I make it to symbiosis, Noida for BBA Llb ...",0
1,22c218e03d22de56d761,What are valve specifications for Hyundai acce...,0
2,e4a3f7ff57cfe331db10,What don't you like about working for Apple?,0
3,9ab5f17e9b4c7656db24,Can one join Google after doing CLAT from one ...,0
4,d865b47b83e9c2476e6c,Why are there so many counterregulatory hormon...,0
...,...,...,...
1306117,905024cd845162042a99,Why are some people anti-Semitic?,0
1306118,aa795bd172731ffc60c2,How do I overcome the trauma of watching Boku ...,0
1306119,52529547e7b290e735f5,In the 15th century when there were no means o...,0
1306120,451fce5080e02b928d18,How can I approach my parents about my sexuality?,0


In [5]:
test_data.isnull().sum()

qid              0
question_text    0
dtype: int64

In [6]:
DATA_SET_SIZE = 100000

In [7]:
# train_data = train_data
train_data = train_data[:DATA_SET_SIZE]

In [8]:
train_sentences = train_data['question_text'][:].values
train_targets = train_data['target'].values
test_qid = test_data['qid'].values
test_sentences = test_data['question_text'].values

In [9]:
len(train_targets)

100000

In [10]:
def remove_stopwords(sentence):
    # List of stopwords
    stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
    
    # Sentence converted to lowercase-only
    sentence = sentence.lower()

    words = sentence.split()
    no_words = [w for w in words if w not in stopwords]
    sentence = " ".join(no_words)

    return sentence

train_sentences_no_stopwards = train_sentences.copy()
for i in range(0, len(train_sentences_no_stopwards)):
    train_sentences_no_stopwards[i] = remove_stopwords(train_sentences_no_stopwards[i])

In [11]:
max_len = 0
for i in train_sentences_no_stopwards:
    max_len = max(len(i.split()), max_len)
    
max_len          

40

In [12]:
train_sentences.shape

(100000,)

In [13]:
train_sentences_no_stopwards.shape

(100000,)

In [14]:
# constants
EMBEDDING_DIM = 100
MAXLEN = 80
TRUNCATING = 'post'
PADDING = 'post'
OOV_TOKEN = "<OOV>"
MAX_EXAMPLES = 160000
TRAINING_SPLIT = 0.9

In [15]:
def split_data(input_data, split_ratio):
    input_data_split_size = int(len(input_data) * split_ratio)
    train_data = input_data[0:input_data_split_size]
    val_data = input_data[input_data_split_size:]
    
    return train_data, val_data

# train_questions, val_questions = split_data(train_sentences, TRAINING_SPLIT)
train_targets, val_targets = split_data(train_targets, TRAINING_SPLIT)

train_sentences_no_stopwards, val_sentences_no_stopwards = split_data(train_sentences_no_stopwards, TRAINING_SPLIT)

In [16]:
train_targets.shape

(90000,)

In [17]:
def fit_tokenizer(train_sentences, oov_token):
    
    ### START CODE HERE
    
    # Instantiate the Tokenizer class, passing in the correct values for num_words and oov_token
#     tokenizer = Tokenizer(num_words=40000, oov_token=oov_token)
    tokenizer = Tokenizer(oov_token=oov_token)    
    # Fit the tokenizer to the training sentences
    tokenizer.fit_on_texts(train_sentences)
    
    ### END CODE HERE
    
    return tokenizer

# tokenizer_questions = fit_tokenizer(train_questions, OOV_TOKEN)
# word_index_questions = tokenizer_questions.word_index
# VOCAB_SIZE = len(word_index_questions)

In [18]:
# print(VOCAB_SIZE)

In [19]:
tokenizer_train_sentences_no_stopwards = fit_tokenizer(train_sentences_no_stopwards, OOV_TOKEN)
word_index_train_sentences_no_stopwards = tokenizer_train_sentences_no_stopwards.word_index
VOCAB_SIZE_OTHER = len(word_index_train_sentences_no_stopwards)

In [20]:
print(VOCAB_SIZE_OTHER)

51867


In [21]:
def seq_pad_and_trunc(sentences, tokenizer, padding, truncating, maxlen):
    
    ### START CODE HERE
       
    # Convert sentences to sequences
    sequences = tokenizer.texts_to_sequences(sentences)
    
    # Pad the sequences using the correct padding, truncating and maxlen
#     pad_trunc_sequences = pad_sequences(sequences, maxlen=maxlen, padding=padding, truncating=truncating)
    pad_trunc_sequences = pad_sequences(sequences, padding=padding, truncating=truncating)
    
    ### END CODE HERE
    
    return pad_trunc_sequences

# train_pad_trunc_seq = seq_pad_and_trunc(train_questions, tokenizer_train_sentences, PADDING, TRUNCATING, MAXLEN)
# val_pad_trunc_seq = seq_pad_and_trunc(val_questions, tokenizer_train_sentences, PADDING, TRUNCATING, MAXLEN)
test_pad_trunc_seq = seq_pad_and_trunc(test_sentences, tokenizer_train_sentences_no_stopwards, PADDING, TRUNCATING, MAXLEN)

train_sentences_no_stopwards_pad_trunc_seq = seq_pad_and_trunc(train_sentences_no_stopwards, tokenizer_train_sentences_no_stopwards, PADDING, TRUNCATING, MAXLEN)
val_sentences_no_stopwards_pad_trunc_seq = seq_pad_and_trunc(val_sentences_no_stopwards, tokenizer_train_sentences_no_stopwards, PADDING, TRUNCATING, MAXLEN)
print(f"Padded and truncated training sequences have shape: {train_sentences_no_stopwards_pad_trunc_seq.shape}\n")
print(f"Padded and truncated validation sequences have shape: {val_sentences_no_stopwards_pad_trunc_seq.shape}")
print(f"Padded and truncated test sequences have shape: {test_pad_trunc_seq.shape}")

Padded and truncated training sequences have shape: (90000, 185)

Padded and truncated validation sequences have shape: (10000, 38)
Padded and truncated test sequences have shape: (375806, 240)


In [22]:
# GRADED FUNCTION: create_model
def create_model(num_words, embedding_dim, maxlen):
    
    tf.random.set_seed(123)
    
    ### START CODE HERE
    
    model = tf.keras.Sequential([ 
#         tf.keras.layers.Embedding(num_words, embedding_dim, input_length=maxlen),
        tf.keras.layers.Embedding(num_words, embedding_dim),    
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(24, activation='relu'),
        tf.keras.layers.Dense(24, activation='relu'),        
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy']) 

    ### END CODE HERE

    return model

In [23]:
model = create_model(VOCAB_SIZE_OTHER+1, EMBEDDING_DIM, MAXLEN)

# history = model.fit(train_pad_trunc_seq, train_targets, epochs=30, validation_data=(val_pad_trunc_seq, val_targets))
history = model.fit(train_sentences_no_stopwards_pad_trunc_seq, train_targets, epochs=10, validation_data=(val_sentences_no_stopwards_pad_trunc_seq, val_targets))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
model_predict = model.predict(test_pad_trunc_seq)



In [25]:
predictions = np.where(model_predict > 0.5, 1, 0)

In [26]:
predictions = predictions.reshape(-1)

In [27]:
predictions_dict = {
    'qid' : test_qid,
    'prediction' : predictions
}
predictions_df = pd.DataFrame(predictions_dict)

In [28]:
predictions_df.to_csv('submission.csv', index=False)