In [11]:
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import  Counter
stop=set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D,Dropout
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

#!pip install pyspellchecker
from spellchecker import SpellChecker
import pandas as pd
import re
import string
from nltk.corpus import stopwords
#nltk.download("stopwords")


In [12]:
stop_words = stopwords.words("english")

#############################
### Get Data ##

train= pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
# concat all
df=pd.concat([train,test])

In [13]:
spell = SpellChecker()

In [14]:
def correct_spellings(x, spell=spell):
    """correct the missplled words of a given tweet"""
    x = x.split()
    misspelled = spell.unknown(x)
    result = map(lambda word : spell.correction(word) if word in  misspelled else word, x)
    return " ".join(result)

def tweets_cleaning(x, remove_stop_words=True):
    """Apply function to a clean a tweet"""
    x = x.lower().strip()
    # romove urls
    url = re.compile(r'https?://\S+|www\.\S+')
    x = url.sub(r'',x)
    # remove html tags
    html = re.compile(r'<.*?>')
    x = html.sub(r'',x)
    # remove punctuation
    x= re.sub('[^a-zA-Z]', ' ', x)
    if remove_stop_words:
        x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    return x


In [36]:
## APPLY the cleaning function to the text column
df['cleaned_tweets'] = df['text'].apply(tweets_cleaning)
train = df[~df['target'].isna()]
X_train, X_val, y_train, y_val = train_test_split(train, train['target'], test_size=0.2, random_state=42)
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

# Limit on the number of features to K features.
TOP_K = 20000

# Limit on the length of text sequences.
# Sequences longer than this will be truncated.
# and less than it will be padded
MAX_SEQUENCE_LENGTH = 50


In [16]:
class CustomTokenizer:
    def __init__(self, train_texts):
        self.train_texts = train_texts
        self.tokenizer = Tokenizer(num_words=TOP_K)

    def train_tokenize(self):
        # Get max sequence length.
        max_length = len(max(self.train_texts, key=len))
        self.max_length = min(max_length, MAX_SEQUENCE_LENGTH)

        # Create vocabulary with training texts.
        self.tokenizer.fit_on_texts(self.train_texts)

    def vectorize_input(self, tweets):
        # Vectorize training and validation texts.

        tweets = self.tokenizer.texts_to_sequences(tweets)
        # Fix sequence length to max value. Sequences shorter than the length are
        # padded in the beginning and sequences longer are truncated
        # at the beginning.
        tweets = sequence.pad_sequences(tweets, maxlen=self.max_length, truncating='post', padding='post')
        return tweets


In [17]:
tokenizer = CustomTokenizer(train_texts=X_train['cleaned_tweets'])
# fit o the train
tokenizer.train_tokenize()
tokenized_train = tokenizer.vectorize_input(X_train['cleaned_tweets'])
tokenized_val = tokenizer.vectorize_input(X_val['cleaned_tweets'])
tokenized_test = tokenizer.vectorize_input(test['text'])
wordembeddings = gensim.models.KeyedVectors.load_word2vec_format('/home/akshay/Documents/archive/project/GoogleNews-vectors-negative300.bin',binary=True)
import tqdm

EMBEDDING_VECTOR_LENGTH = 300  # <=200


In [18]:
def construct_embedding_matrix(wordembeddings, word_index):
    unique_words = len(word_index)
    total_words = unique_words + 1
    skipped_words = 0
    embedding_matrix = np.zeros((total_words, EMBEDDING_VECTOR_LENGTH))
    for word, index in tokenizer.tokenizer.word_index.items():
        try:
            embedding_vector = wordembeddings[word]
        except:
            skipped_words = skipped_words + 1
            pass
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embedding_matrix


In [19]:
embedding_matrix = construct_embedding_matrix(wordembeddings, tokenizer.tokenizer.word_index)
model=Sequential()
total_words = len(tokenizer.tokenizer.word_index)+1
embedding=Embedding(len(tokenizer.tokenizer.word_index)+1, # number of unique tokens
                    EMBEDDING_VECTOR_LENGTH, #number of features
                    embeddings_initializer=Constant(embedding_matrix), # initialize
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=False)


In [20]:
model.add(embedding)
model.add(Dropout(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.5))
model.add(Dense(1, activation='sigmoid'))
# compile the model
optimzer = Adam(clipvalue=0.5) # clip value to avoid the gradient exploding

model.compile(optimizer=optimzer,
              loss='binary_crossentropy',
              metrics=['accuracy'])



In [21]:
# fit the model
history = model.fit(tokenized_train, y_train,
                    batch_size=32,
                    epochs=10,
                    validation_data=(tokenized_val,y_val),
                    verbose=2)

print(model)

Epoch 1/20
191/191 - 8s - loss: 0.5518 - accuracy: 0.7340 - val_loss: 0.4594 - val_accuracy: 0.7951
Epoch 2/20
191/191 - 6s - loss: 0.4850 - accuracy: 0.7806 - val_loss: 0.4549 - val_accuracy: 0.8037
Epoch 3/20
191/191 - 7s - loss: 0.4747 - accuracy: 0.7837 - val_loss: 0.4492 - val_accuracy: 0.7984
Epoch 4/20
191/191 - 7s - loss: 0.4621 - accuracy: 0.7984 - val_loss: 0.4552 - val_accuracy: 0.8122
Epoch 5/20
191/191 - 7s - loss: 0.4503 - accuracy: 0.7982 - val_loss: 0.4386 - val_accuracy: 0.8109
Epoch 6/20
191/191 - 7s - loss: 0.4406 - accuracy: 0.8026 - val_loss: 0.4448 - val_accuracy: 0.8194
Epoch 7/20
191/191 - 7s - loss: 0.4322 - accuracy: 0.8085 - val_loss: 0.4483 - val_accuracy: 0.8162
Epoch 8/20
191/191 - 7s - loss: 0.4315 - accuracy: 0.8074 - val_loss: 0.4316 - val_accuracy: 0.8102
Epoch 9/20
191/191 - 7s - loss: 0.4283 - accuracy: 0.8112 - val_loss: 0.4349 - val_accuracy: 0.8142
Epoch 10/20
191/191 - 7s - loss: 0.4173 - accuracy: 0.8182 - val_loss: 0.5065 - val_accuracy: 0.7886

In [27]:
X_val

Unnamed: 0,id,keyword,location,text,target,cleaned_tweets
2644,3796,destruction,,So you have a new weapon that can cause un-ima...,1.0,new weapon cause un imaginable destruction
2227,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just...,0.0,f amp ing things gishwhes got soaked deluge...
5448,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe...,1.0,dt georgegalloway rt galloway mayor col...
132,191,aftershock,,Aftershock back to school kick off was great. ...,0.0,aftershock back school kick great want thank ...
6845,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts deve...,0.0,response trauma children addicts develop defen...
...,...,...,...,...,...,...
1835,2640,crashed,Somewhere,@SmusX16475 Skype just crashed u host,0.0,smusx skype crashed u host
506,731,attacked,Arundel,Christian Attacked by Muslims at the Temple Mo...,1.0,christian attacked muslims temple mount waving...
3592,5131,fatal,"New South Wales, Australia",Man charged over fatal crash near Dubbo refuse...,1.0,man charged fatal crash near dubbo refused bai...
6740,9657,thunderstorm,,#usNWSgov Severe Weather Statement issued Augu...,1.0,usnwsgov severe weather statement issued augu...


In [32]:
y_pred= model.predict_classes(tokenized_val)



In [33]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val,y_pred)

array([[797,  77],
       [200, 449]])

In [35]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val,y_pred)

0.8181221273801708