In [62]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Sequential, layers
from keras.callbacks import TensorBoard
import keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from time import time

In [63]:
#gpu memory growth fix
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
              tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [64]:
train_df = pd.read_csv("E:/Python/data/twitter/train.csv")
test_df = pd.read_csv("E:/Python/data/twitter/test.csv")

In [65]:
test_df

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [66]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)

In [67]:
import re
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [68]:
X_train = []
sentences = list(train_df['text'])
for sen in sentences:
    X_train.append(preprocess_text(sen))

In [69]:
X_test = []
sentences = list(test_df['text'])
for sen in sentences:
    X_test.append(preprocess_text(sen))

In [70]:
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [71]:
def FindMaxLength(lst): 
    maxLength = max(len(x) for x in lst ) 
    return maxLength 

In [72]:
maxlen=FindMaxLength(sentences)
maxlen

151

In [73]:
from keras import preprocessing
x_train = preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen)

In [74]:
x_test.shape

(3263, 151)

In [75]:
x_test

array([[   0,    0,    0, ..., 1948,  122,   89],
       [   0,    0,    0, ...,  602, 2061,  213],
       [   0,    0,    0, ...,  344,   95,   39],
       ...,
       [   0,    0,    0, ...,  855,    2,    1],
       [   0,    0,    0, ..., 1917,    2,    1],
       [   0,    0,    0, ...,   74,  261, 4566]])

In [76]:
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [77]:
# define model
model = Sequential()
model.add(layers.Embedding(5000, 32))
model.add(layers.LSTM(32, dropout=0.5,
    #recurrent_dropout=0.2
                     ))
model.add(layers.Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[get_f1])

tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

es = EarlyStopping(monitor='val_get_f1', mode='max', min_delta=0.001, verbose=1, patience=10)
mc = ModelCheckpoint('best_model.h5', monitor='val_get_f1', mode='max', verbose=1, save_best_only=True)

model.fit(x_train, train_df['target'], epochs=100, batch_size=32, callbacks=[
    tensorboard, 
    es, mc
    ], 
    validation_split=0.2
)

Epoch 1/100
Epoch 00001: val_get_f1 improved from -inf to 0.61962, saving model to best_model.h5
Epoch 2/100
Epoch 00002: val_get_f1 improved from 0.61962 to 0.63786, saving model to best_model.h5
Epoch 3/100
Epoch 00003: val_get_f1 improved from 0.63786 to 0.65320, saving model to best_model.h5
Epoch 4/100
Epoch 00004: val_get_f1 did not improve from 0.65320
Epoch 5/100
Epoch 00005: val_get_f1 did not improve from 0.65320
Epoch 6/100
Epoch 00006: val_get_f1 did not improve from 0.65320
Epoch 7/100
Epoch 00007: val_get_f1 did not improve from 0.65320
Epoch 8/100
Epoch 00008: val_get_f1 did not improve from 0.65320
Epoch 9/100
Epoch 00009: val_get_f1 did not improve from 0.65320
Epoch 10/100
Epoch 00010: val_get_f1 did not improve from 0.65320
Epoch 11/100
Epoch 00011: val_get_f1 did not improve from 0.65320
Epoch 12/100
Epoch 00012: val_get_f1 improved from 0.65320 to 0.65901, saving model to best_model.h5
Epoch 13/100
Epoch 00013: val_get_f1 did not improve from 0.65901
Epoch 14/100
E

<tensorflow.python.keras.callbacks.History at 0x126ac4b57f0>

In [78]:
# make a prediction
yhat = model.predict(x_test)

In [79]:
yhat

array([[0.2295759 ],
       [0.7769267 ],
       [0.9962835 ],
       ...,
       [0.96105844],
       [0.98051625],
       [0.9901377 ]], dtype=float32)

In [80]:
yhat = np.round(yhat)

In [81]:
z = zip(test_df.iloc[:, 0], [int(x[0]) for x in yhat])
data3 = pd.DataFrame(z, columns=['id', 'target'])
data3.to_csv('ss3.csv', index=False)