In [0]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

In [2]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Import numpy, Pandas, Regex & Visualisation libraries
import numpy as np
import pandas as pd
import re
from matplotlib import pyplot as plt

# Import NLTK & Download Required Module
import nltk
nltk.download('punkt')

# Train Test Split
from sklearn.model_selection import train_test_split

# F1 Score
from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
tweets = pd.read_csv('/content/train_E6oV3lV.csv')

In [0]:
X = tweets.iloc[:, 2].values
y = tweets.iloc[:,1].values

In [0]:
def clean_corpus(text):
    corpus = []
    for i in range(len(text)):
        tweet = re.sub(r"^https://t.co/[a-zA-Z0-9]*\s"," ", str(text[i]))
        tweet = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*\s"," ", tweet)
        tweet = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*$"," ", tweet)
        tweet = tweet.lower()
        tweet = re.sub(r"can't","can not", tweet)
        tweet = re.sub(r"hv","have", tweet)
        tweet = re.sub(r"ur","your", tweet)
        tweet = re.sub(r"ain't","is not", tweet)
        tweet = re.sub(r"don't","do not", tweet)
        tweet = re.sub(r"couldn't","could not", tweet)
        tweet = re.sub(r"shouldn't","should not", tweet )
        tweet = re.sub(r"won't","will not", tweet)
        tweet = re.sub(r"there's", "there is", tweet)
        tweet = re.sub(r"it's","it is", tweet)
        tweet = re.sub(r"that's","that is", tweet)
        tweet = re.sub(r"where's","where is", tweet)
        tweet = re.sub(r"who's","who is", tweet)
        tweet = re.sub(r"\W"," ", tweet)
        tweet = re.sub(r"\d"," ", tweet)
        tweet = re.sub(r"[ðâï¼½³ªãºæååçæåä¹µó¾_ëìêè]"," ", tweet)
        tweet =re.sub(r"\s[a-z]\s"," ", tweet)
        tweet = re.sub(r"\s+[a-z]\s+"," ", tweet)
        tweet = re.sub(r"^[a-z]\s"," ", tweet)
        tweet = re.sub(r"^[a-z]\s+"," ", tweet)
        tweet = re.sub(r"\s+"," ", tweet)
        tweet = re.sub(r"^\s","", tweet)
        tweet = re.sub(r"\s$","", tweet)
        corpus.append(tweet)
        
    #return the corpus
    return corpus

In [0]:
#check how many individual words present in the corpus
word_dict = {}
train_data = tweets['tweet']
corpus  = clean_corpus(train_data)

for doc in corpus:
    words = nltk.word_tokenize(doc)
    for word in words:
        if word not in word_dict:
            word_dict[word] = 1
        else:
            word_dict[word] += 1
            
len(word_dict)

#tokenising the texts
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus_tokens = tokenizer.texts_to_sequences(corpus)

In [8]:
#finding the average words present per comment
print(corpus[0])
print(corpus_tokens[0:2])

num_of_words_in_doc =[]
for doc in corpus_tokens:
    num_of_words_in_doc.append(len(doc))
print("Average number of words: ", np.average(num_of_words_in_doc))


# Padding the sequences
corpus_pad = keras.preprocessing.sequence.pad_sequences(corpus_tokens,maxlen=25,padding='post')

user when father is dysfunctional and is so selfish he drags his kids into his dysfunction run
[[1, 35, 68, 5, 14946, 6, 5, 21, 2610, 70, 6257, 93, 244, 249, 93, 7643, 442], [1, 1, 161, 8, 5345, 2343, 32, 12, 426, 623, 60, 23, 12, 1481, 7644, 9812, 7, 7645, 14947, 9813]]
Average number of words:  12.311182028659033


In [9]:

# Creating Validation Set
X_train,X_test,y_train,y_test = train_test_split(corpus_pad,y,test_size=0.2,random_state=101)

X_train.shape, X_test.shape

((25569, 25), (6393, 25))

In [10]:
# Building & Compiling the model

vocab_size = len(tokenizer.word_index) + 1
max_length = 25
model = keras.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_size,output_dim=50,input_length=max_length))
model.add(keras.layers.LSTM(units=50,dropout=0.2,recurrent_dropout=0.2))
model.add(keras.layers.Dense(units=1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 50)            1879200   
_________________________________________________________________
lstm (LSTM)                  (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 1,899,451
Trainable params: 1,899,451
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
# Train the model
model.fit(X_train,y_train,batch_size=10,epochs=2, verbose=2)

Epoch 1/2
2557/2557 - 103s - loss: 0.1566 - acc: 0.9501
Epoch 2/2
2557/2557 - 101s - loss: 0.0697 - acc: 0.9776


<tensorflow.python.keras.callbacks.History at 0x7ff4fab6d390>

In [0]:

#Loading the test data
test_tweets = pd.read_csv("test_tweets_anuFYb8.csv")
test_tweets.shape

#cleaning the text
test_data = test_tweets['tweet']
clean_test_data  = clean_corpus(test_data)

#text to sequence and padding
clean_test_data_token = tokenizer.texts_to_sequences(clean_test_data)
clean_test_data_pad = keras.preprocessing.sequence.pad_sequences(clean_test_data_token,maxlen=25,padding='post')

In [14]:
# preparing the submission file    
final_prediction = model.predict_classes(clean_test_data_pad)

test_tweets['label'] = final_prediction
test_predictions = test_tweets[['id','label']]
test_predictions.to_csv('LSTM3.csv',index=False)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [15]:
test_predictions

Unnamed: 0,id,label
0,31963,0
1,31964,0
2,31965,0
3,31966,0
4,31967,0
...,...,...
17192,49155,1
17193,49156,0
17194,49157,0
17195,49158,0
