In [1]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Sequential, layers
from keras.callbacks import TensorBoard
import keras.backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from time import time
from keras.models import load_model

In [2]:
#gpu memory growth fix
gpus = tf.config.experimental.list_physical_devices('GPU')

if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
              tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
# loading data
train_df = pd.read_csv("E:/Python/data/twitter/train.csv")
test_df = pd.read_csv("E:/Python/data/twitter/test.csv")

In [4]:
# text preprocessing
import re
from spellchecker import SpellChecker
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    
#     # Correcting spelling
#     sentence = correct_spellings(sentence)

    return sentence

In [5]:
X_train = []
sentences = list(train_df['text'])
for sen in sentences:
    X_train.append(preprocess_text(sen))

In [6]:
X_test = []
sentences = list(test_df['text'])
for sen in sentences:
    X_test.append(preprocess_text(sen))

In [7]:
# tokenizing
from keras.preprocessing.text import Tokenizer
num_of_words_to_leave = 10000
tokenizer = Tokenizer(num_words=num_of_words_to_leave)

tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [8]:
# get the maximum length of a processed tokenized tweet
def FindMaxLength(lst): 
    maxLength = max(len(x) for x in lst ) 
    return maxLength 
maxlen=FindMaxLength(sentences)
maxlen

151

In [9]:
# padding sequences
from keras import preprocessing
x_train = preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen)

In [10]:
# define custom f1-metric for keras
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [12]:
# model and predicting in 4-fold cv-mode
train_y_enc = train_df['target']

from sklearn.model_selection import train_test_split, KFold
import scipy
first_col = True
cross_fold = KFold(n_splits = 4, shuffle=True)
for train_index, test_index in cross_fold.split(x_train):
    validation_X, validation_y = x_train[test_index], train_y_enc[test_index]
    train_X, train_y = x_train[train_index], train_y_enc[train_index]
    
    model = Sequential()
    model.add(layers.Embedding(num_of_words_to_leave, 4))
    model.add(layers.LSTM(4, #return_sequences = True
        #recurrent_dropout=0.2
                         ))
    #model.add(layers.GlobalMaxPool1D())
    # model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.5))
    # model.add(layers.Dense(128, activation = "relu"))
    # model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))
    # compile the model
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[get_f1])

    tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

    es = EarlyStopping(monitor='val_get_f1', mode='max', min_delta=0.001, verbose=1, patience=10)
    mc = ModelCheckpoint('best_model.h5', monitor='val_get_f1', mode='max', verbose=1, save_best_only=True)
    
    # fit model and save the best
    history = model.fit(train_X, train_y, epochs=100, batch_size=32, callbacks=[
        tensorboard, 
        es, mc
        ], 
        validation_data=(validation_X, validation_y),
    )
    saved_model = load_model('best_model.h5', custom_objects={"get_f1": get_f1})
    
    probs = saved_model.predict(x_test)
    predict = [x[0] for x in probs]
    if first_col:
        pr_values = np.array(predict, ndmin=2)
        pr_values = np.transpose(pr_values)
        first_col = False
    else:
        pr_values = np.insert(pr_values, -1, predict, axis=1)
pr_values= np.mean(pr_values, axis=1)

Epoch 1/100
Epoch 00001: val_get_f1 improved from -inf to 0.10431, saving model to best_model.h5
Epoch 2/100
Epoch 00002: val_get_f1 improved from 0.10431 to 0.62333, saving model to best_model.h5
Epoch 3/100
Epoch 00003: val_get_f1 improved from 0.62333 to 0.64328, saving model to best_model.h5
Epoch 4/100
Epoch 00004: val_get_f1 improved from 0.64328 to 0.66357, saving model to best_model.h5
Epoch 5/100
Epoch 00005: val_get_f1 improved from 0.66357 to 0.67844, saving model to best_model.h5
Epoch 6/100
Epoch 00006: val_get_f1 did not improve from 0.67844
Epoch 7/100
Epoch 00007: val_get_f1 improved from 0.67844 to 0.69263, saving model to best_model.h5
Epoch 8/100
Epoch 00008: val_get_f1 did not improve from 0.69263
Epoch 9/100
Epoch 00009: val_get_f1 did not improve from 0.69263
Epoch 10/100
Epoch 00010: val_get_f1 did not improve from 0.69263
Epoch 11/100
Epoch 00011: val_get_f1 did not improve from 0.69263
Epoch 12/100
Epoch 00012: val_get_f1 did not improve from 0.69263
Epoch 13/1

Epoch 00010: val_get_f1 improved from 0.71558 to 0.71860, saving model to best_model.h5
Epoch 11/100
Epoch 00011: val_get_f1 did not improve from 0.71860
Epoch 12/100
Epoch 00012: val_get_f1 did not improve from 0.71860
Epoch 13/100
Epoch 00013: val_get_f1 did not improve from 0.71860
Epoch 14/100
Epoch 00014: val_get_f1 did not improve from 0.71860
Epoch 15/100
Epoch 00015: val_get_f1 did not improve from 0.71860
Epoch 16/100
Epoch 00016: val_get_f1 did not improve from 0.71860
Epoch 17/100
Epoch 00017: val_get_f1 did not improve from 0.71860
Epoch 18/100
Epoch 00018: val_get_f1 improved from 0.71860 to 0.72034, saving model to best_model.h5
Epoch 19/100
Epoch 00019: val_get_f1 did not improve from 0.72034
Epoch 20/100
Epoch 00020: val_get_f1 did not improve from 0.72034
Epoch 21/100
Epoch 00021: val_get_f1 did not improve from 0.72034
Epoch 22/100
Epoch 00022: val_get_f1 did not improve from 0.72034
Epoch 23/100
Epoch 00023: val_get_f1 did not improve from 0.72034
Epoch 24/100
Epoch 

Epoch 00009: val_get_f1 did not improve from 0.70412
Epoch 10/100
Epoch 00010: val_get_f1 improved from 0.70412 to 0.70925, saving model to best_model.h5
Epoch 11/100
Epoch 00011: val_get_f1 did not improve from 0.70925
Epoch 12/100
Epoch 00012: val_get_f1 did not improve from 0.70925
Epoch 13/100
Epoch 00013: val_get_f1 did not improve from 0.70925
Epoch 14/100
Epoch 00014: val_get_f1 did not improve from 0.70925
Epoch 15/100
Epoch 00015: val_get_f1 improved from 0.70925 to 0.71203, saving model to best_model.h5
Epoch 16/100
Epoch 00016: val_get_f1 did not improve from 0.71203
Epoch 17/100
Epoch 00017: val_get_f1 did not improve from 0.71203
Epoch 18/100
Epoch 00018: val_get_f1 did not improve from 0.71203
Epoch 19/100
Epoch 00019: val_get_f1 did not improve from 0.71203
Epoch 20/100
Epoch 00020: val_get_f1 did not improve from 0.71203
Epoch 21/100
Epoch 00021: val_get_f1 did not improve from 0.71203
Epoch 22/100
Epoch 00022: val_get_f1 did not improve from 0.71203
Epoch 23/100
Epoch 

Epoch 12/100
Epoch 00012: val_get_f1 did not improve from 0.72171
Epoch 13/100
Epoch 00013: val_get_f1 did not improve from 0.72171
Epoch 14/100
Epoch 00014: val_get_f1 did not improve from 0.72171
Epoch 15/100
Epoch 00015: val_get_f1 did not improve from 0.72171
Epoch 16/100
Epoch 00016: val_get_f1 did not improve from 0.72171
Epoch 17/100
Epoch 00017: val_get_f1 did not improve from 0.72171
Epoch 18/100
Epoch 00018: val_get_f1 did not improve from 0.72171
Epoch 00018: early stopping


In [13]:
# probabilities
pr_values

array([0.65323895, 0.61684036, 0.8721913 , ..., 0.8428601 , 0.91084456,
       0.7592432 ], dtype=float32)

In [14]:
# saving the result
z = zip(test_df.iloc[:, 0], [int(x) for x in np.round(pr_values)])
data3 = pd.DataFrame(z, columns=['id', 'target'])
data3.to_csv('ss10.csv', index=False)