In [154]:
import sys,os
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score, precision_score, recall_score,roc_auc_score,accuracy_score
from sklearn import metrics
from sklearn.feature_extraction.text import HashingVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize 
  


In [2]:
stop_words = set(stopwords.words('english')) 


In [3]:
stemmer = SnowballStemmer('english')
stem_map={}

stopW = stopwords.words('english')
emoji_pattern = re.compile("["
     u"\U0001F600-\U0001F64F"  
     u"\U0001F300-\U0001F5FF"  
     u"\U0001F680-\U0001F6FF"  
     u"\U0001F1E0-\U0001F1FF"  
     u"\U00002702-\U000027B0"
     u"\U000024C2-\U0001F251"
     "]+", flags=re.UNICODE)

def load_data(filename):
    n = ['id', 'text','HS','TR','AG']
    given_data = pd.read_csv(filename, sep='\t',error_bad_lines=False, names=n, usecols=['text','HS','TR','AG'], skiprows=1)
    raw_data = given_data['text'].values
    labels_TR = list(map(int,given_data['TR'].values))
    labels_AG = list(map(int,given_data['AG'].values))
    labels_HS = list(map(int,given_data['HS'].values))
    return raw_data,labels_TR,labels_AG,labels_HS

def preprocess(tweet):
    # ' '.join([word for word in tweet.spilt() ])
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', tweet)
    tweet = re.sub('@[^\s]+','USER', tweet)
    tweet = tweet.replace("ё", "е")
    tweet = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', tweet)
    tweet = re.sub(' +',' ', tweet)
    tweet = emoji_pattern.sub(r'', tweet)

    stemmed_text_token=[]
    twtk = TweetTokenizer(reduce_len=True)
#     tokens = tweet.split(' ')
    tokens = twtk.tokenize(tweet)
    text_token = []
    for token in tokens:
        if token not in stop_words:
            text_token.append(token)
    for token in text_token:
        if token=='':
            continue
        elif token=='USER' or token=='URL': 
            stemmed_text_token.append(token)
        
        else:
            a=stem_map.get(token,0)
            if a==0:
                a=stemmer.stem(token)
                stem_map[token]=a
            stemmed_text_token.append(a)
    return ' '.join(stemmed_text_token)


In [4]:
filename = './train_en.tsv'
raw_data,labels_TR,labels_AG,labels_HS = load_data(filename)

data = [preprocess(tweet) for tweet in raw_data]
# classifier(data,labels_HS)

In [158]:
x_train, x_test, y_train, y_test = train_test_split(data, labels_HS, test_size=0.1, random_state=324)

In [159]:
y_test = np_utils.to_categorical(y_test)

y_test[-10:]

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [160]:
from keras.utils import np_utils
y_train = np_utils.to_categorical(y_train)

In [122]:
import seaborn as sns
import matplotlib.pyplot as plt

import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.utils import plot_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer 
import os, re, csv, math, codecs
from sklearn.preprocessing import LabelBinarizer


In [123]:
MAX_NB_WORDS = 100000


In [12]:
print('loading word embeddings...')
embeddings_index = {}
def load_embeddings():
    f = codecs.open('../classification/glove.6B.100d.txt', encoding='utf-8')
    for line in tqdm(f):
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('found %s word vectors' % len(embeddings_index))


loading word embeddings...


In [13]:
load_embeddings()

400000it [00:23, 16798.62it/s]

found 400000 word vectors





In [161]:
max_seq_len = 300
print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(x_train + x_test)  #leaky
word_seq_train = tokenizer.texts_to_sequences(x_train)
word_seq_test = tokenizer.texts_to_sequences(x_test)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)

tokenizing input data...
dictionary size:  12288


In [162]:
#training params
batch_size = 128 
num_epochs = 20 

#model parameters
num_filters = 64 
embed_dim = 100 
weight_decay = 1e-4

In [163]:
#embedding matrix
print('preparing embedding matrix...')
words_not_found = []
length = len(word_index) + 1
nb_words = min(MAX_NB_WORDS, length)
embedding_matrix = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))


preparing embedding matrix...
number of null word embeddings: 4919


In [164]:
print("sample words not found: ", np.random.choice(words_not_found, 10))


sample words not found:  ['moreov' 'anncoult' 'trifl' 'wherertherefuge' 'goodmorn' 'bitchi' 'movi'
 'therippleeffect' 'masquerad' 'strach']


In [165]:
num_classes = 2
print("training CNN ...")
model = Sequential()
model.add(Embedding(nb_words, embed_dim,
          weights=[embedding_matrix], input_length=max_seq_len, trainable=False))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(3))
model.add(Conv1D(num_filters, 7, activation='relu', padding='same'))
model.add(MaxPooling1D(3))
model.add(Conv1D(num_filters, 3, activation='relu', padding='same'))
model.add(Conv1D(num_filters, 5, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(num_classes, activation='sigmoid'))  #multi-label (k-hot encoding)

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

training CNN ...
Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 300, 100)          1228900   
_________________________________________________________________
conv1d_49 (Conv1D)           (None, 300, 64)           44864     
_________________________________________________________________
max_pooling1d_27 (MaxPooling (None, 100, 64)           0         
_________________________________________________________________
conv1d_50 (Conv1D)           (None, 100, 64)           28736     
_________________________________________________________________
max_pooling1d_28 (MaxPooling (None, 33, 64)            0         
_________________________________________________________________
conv1d_51 (Conv1D)           (None, 33, 64)            12352     
_________________________________________________________________
conv1d_52 (Conv1D)           (None, 

In [166]:
#callbacks
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [167]:
#model training
hist = model.fit(word_seq_train, y_train, batch_size=batch_size, epochs=num_epochs, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

Train on 7290 samples, validate on 810 samples
Epoch 1/20
 - 5s - loss: 0.6478 - accuracy: 0.6095 - val_loss: 0.6023 - val_accuracy: 0.6679
Epoch 2/20
 - 5s - loss: 0.5762 - accuracy: 0.6918 - val_loss: 0.5625 - val_accuracy: 0.7123
Epoch 3/20
 - 5s - loss: 0.5226 - accuracy: 0.7429 - val_loss: 0.5542 - val_accuracy: 0.7142
Epoch 4/20
 - 5s - loss: 0.4575 - accuracy: 0.7873 - val_loss: 0.5533 - val_accuracy: 0.7222
Epoch 5/20
 - 5s - loss: 0.3675 - accuracy: 0.8420 - val_loss: 0.6832 - val_accuracy: 0.6778
Epoch 6/20
 - 5s - loss: 0.2671 - accuracy: 0.8953 - val_loss: 0.6834 - val_accuracy: 0.7247
Epoch 00006: early stopping


In [168]:
y_train

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [169]:
y_predict = model.predict(word_seq_test)

y_predict = np.argmax(y_predict, axis=1)
y_test = np.argmax(y_test, axis=1)

print("Precision\t", precision_score(y_test, y_predict, average=None))
print("Recall   \t", recall_score(y_test, y_predict, average=None))
print("F1-Score \t", f1_score(y_test, y_predict, average=None))


Precision	 [0.74115456 0.67493113]
Recall   	 [0.77131783 0.63802083]
F1-Score 	 [0.75593542 0.65595716]


In [170]:
print("ROC-AUC  \t", roc_auc_score(y_test, y_predict, average=None))

ROC-AUC  	 0.7046693313953489


In [171]:
y_predict[:100]

array([0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0])

In [172]:
y_test[:100]

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0])

In [173]:
print("Accuracy", accuracy_score(y_test, y_predict))

Accuracy 0.7144444444444444
