In [26]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import metrics as kmetrics
from keras import initializers, regularizers, constraints, optimizers, layers

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import sklearn.metrics as metrics
import xgboost as xgb

import matplotlib.pyplot as plt

In [2]:
def read_csv_file(file_name):
    """Load the CSV at ``file_name`` and hand back the resulting DataFrame."""
    return pd.read_csv(file_name)

def clean_text(text):
    """Normalize a raw comment into lower-case text without markup or noise.

    The input is converted with ``str()`` first, so non-string values
    (including ``None`` and NaN) are accepted and cleaned as their string form.

    Steps, in order: lower-case, drop ``[...]`` bracketed spans, drop
    ``http(s)://`` and ``www.`` links, drop ``<...>`` tags, drop ASCII
    punctuation, drop every token that contains a digit.

    Whitespace is left untouched, so removed spans may leave repeated spaces.

    Returns:
        The cleaned string.
    """
    # All patterns are raw strings: escapes such as "\[" or "\S" in a normal
    # string literal are invalid escape sequences and warn on modern Python.
    text = str(text).lower()

    # Delete text enclosed in square brackets
    text = re.sub(r'\[.*?\]', '', text)

    # Delete links (must run before punctuation removal, which would break URLs)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Delete HTML-like tags
    text = re.sub(r'<.*?>+', '', text)

    # Delete punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Delete words with numbers in them
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Extra tokens treated as stop words on top of NLTK's English list.
_EXTRA_STOP_WORDS = ('u', 'im', 'c')

# Stop-word set, built on first use and shared by every preprocessing_data call.
_stop_words_cache = None


def _get_stop_words():
    """Return the stop-word set, loading (and if needed downloading) it once."""
    global _stop_words_cache
    if _stop_words_cache is None:
        try:
            english = stopwords.words('english')
        except LookupError:
            # The corpus is not installed locally: fetch it, then retry.
            nltk.download('stopwords')
            english = stopwords.words('english')
        # A set gives constant-time membership tests in the per-word filter.
        _stop_words_cache = frozenset(english).union(_EXTRA_STOP_WORDS)
    return _stop_words_cache


def preprocessing_data(text):
    """Clean a comment and strip stop words from it.

    The text is first normalized with ``clean_text`` and then split on single
    spaces; every token found in the stop-word set (NLTK English stop words
    plus 'u', 'im' and 'c') is dropped and the rest are re-joined with spaces.
    Empty tokens produced by repeated spaces are kept, so spacing of the
    cleaned text is preserved. No stemming is applied.

    Args:
        text: Raw comment; any value accepted by ``clean_text``.

    Returns:
        The cleaned comment with stop words removed.
    """
    text = clean_text(text)
    stop_words = _get_stop_words()
    return ' '.join(word for word in text.split(' ') if word not in stop_words)

In [59]:
# Locations of the input artefacts; every CSV is loaded into its own DataFrame.

# Pre-trained GloVe word vectors (50 dimensions)
embedding_file = 'data/glove.6B.50d.txt'

# Training set -- columns: id, comment_text, toxic, severe_toxic, obscene,
# threat, insult, identity_hate
train = read_csv_file('data/train.csv')

# Test comments -- columns: id, comment_text
test = read_csv_file('data/test.csv')

# Test labels -- columns: id, toxic, severe_toxic, obscene, threat, insult,
# identity_hate
test_labels = read_csv_file('data/test_labels copy.csv')

In [60]:
# Clean the data.
# Missing comments are filled *before* cleaning: clean_text() converts its
# input with str(), so a NaN would otherwise end up as the literal token "nan"
# and could no longer be detected as missing afterwards.
train['comment_text_clean'] = train['comment_text'].fillna("unknown").apply(preprocessing_data)
test['comment_text_clean'] = test['comment_text'].fillna("unknown").apply(preprocessing_data)

In [61]:
# Replace any missing cleaned comment with a placeholder token.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which warns on recent pandas and does
# not modify the DataFrame at all once Copy-on-Write is enabled.
train['comment_text_clean'] = train['comment_text_clean'].fillna("unknown")
test['comment_text_clean'] = test['comment_text_clean'].fillna("unknown")

# Model inputs are the cleaned comments; targets are the six label columns.
x_train = train['comment_text_clean']
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train[list_classes].values

x_test = test['comment_text_clean']
# NOTE(review): the test loss recorded further down is far above the validation
# loss. Check whether this labels file contains placeholder rows (the public
# Jigsaw test_labels.csv reportedly marks unscored rows with -1) -- TODO confirm.
y_test = test_labels[list_classes].values

In [62]:
# Vocabulary cap: number of distinct words kept (i.e. rows of the embedding matrix)
max_features = 20000
# Fixed sequence length: max number of words kept per comment
maxlen = 100

# Fit the vocabulary on the training comments only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(x_train))

# Training comments -> integer sequences -> fixed-length arrays
list_tokenized_train = tokenizer.texts_to_sequences(x_train)
x_training = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test comments go through the same fitted tokenizer
list_tokenized_test = tokenizer.texts_to_sequences(x_test)
x_testing = pad_sequences(list_tokenized_test, maxlen=maxlen)


In [14]:
# Read the GloVe word vectors (space-delimited text) into a dict: word -> vector.

def get_coefs(word, *arr):
    """Return ``(word, vector)`` with the remaining fields parsed as float32."""
    return word, np.asarray(arr, dtype='float32')

# Explicit UTF-8 avoids depending on the platform default encoding, and the
# `with` block closes the handle once parsing is done. Blank lines are skipped.
with open(embedding_file, encoding='utf-8') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split()) for line in glove_file if line.strip()
    )

# Use these vectors to create our embedding matrix,
# with random initialization for words that aren't in GloVe.

# np.stack needs a real sequence, not a dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

# Size of each word vector, taken from the loaded file (50 for glove.6B.50d)
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (index 0 is reserved), so the matrix needs
# len(word_index) + 1 rows to hold every word when the vocabulary is smaller
# than max_features.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip words ranked beyond the rows available in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


  if await self.run_code(code, result, async_=asy):


In [17]:
# Bidirectional LSTM with two fully connected layers

inp = Input(shape=(maxlen,))

# Embedding layer initialised from the pre-trained matrix. The vocabulary size
# is read from the matrix itself so input_dim always matches the weights.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)

# LSTM is a type of RNN, 50 is the dimensionality of the output space
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)

# Global max pooling over the time dimension
x = GlobalMaxPool1D()(x)

# Fully connected layer
x = Dense(50, activation="relu")(x)

# Dropout to avoid overfitting
x = Dropout(0.1)(x)

# 6 output nodes, for 6 categories; sigmoid because the labels are independent
x = Dense(6, activation="sigmoid")(x)

# Compile the model.
# The metric is spelled out as BinaryAccuracy: with a 6-unit output the bare
# 'accuracy' string can resolve to categorical (argmax) accuracy, which is
# meaningless for multi-label targets. The name is kept as 'accuracy' so the
# keys in history.history are unchanged.
model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[kmetrics.BinaryAccuracy(name='accuracy')],
)

In [74]:
# Train for two epochs, holding out the last 10% of the training data for validation
history = model.fit(
    x=x_training,
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
)


Epoch 1/2
Epoch 2/2


In [75]:
# Score the trained model on the held-out test set with `evaluate`;
# `results` holds the loss followed by the compiled metric(s).
print("Evaluate on test data")
results = model.evaluate(
    x=x_testing,
    y=y_test,
    batch_size=128,
)
print("test loss, test acc:", results)

Evaluate on test data
test loss, test acc: [6.582034587860107, 0.9890509247779846]


In [79]:
# Disabled: prediction calls kept for reference.
#prediction_train = model.predict([x_training], batch_size=1024, verbose=1)
#prediction_test = model.predict([x_testing], batch_size=1024, verbose=1)
# Show the metrics recorded by model.fit (one value per epoch for each key).
history.history

# TODO: generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`; this cell currently only shows the training history.

{'loss': [0.03478873893618584, 0.03211464360356331],
 'accuracy': [0.9374847412109375, 0.9014643430709839],
 'val_loss': [0.049757570028305054, 0.055527154356241226],
 'val_accuracy': [0.9911643266677856, 0.9897856712341309]}

In [None]:
#re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
#def tokenize(s): return re_tok.sub(r' \1 ', s).split()

#n = train.shape[0]
#vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
#               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
#               smooth_idf=1, sublinear_tf=1 )
#trn_term_doc = vec.fit_transform(train['comment_text_clean'])
#test_term_doc = vec.transform(test['comment_text_clean'])



In [None]:

# Disabled experiment, kept as a string literal so that it does not run:
# a CountVectorizer -> TfidfTransformer -> XGBClassifier pipeline fitted on
# x_train / y_train and used to predict on both the test and training comments.
'''
pipe = Pipeline([
        ('bow', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('model', xgb.XGBClassifier(
            use_label_encoder=False, 
            eval_metric='auc',
            ))
    ])
    
print("------------ Fit the model ------------ \n")
pipe.fit(x_train, y_train)
    
y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)
'''

In [None]:
# Disabled reporting code, kept as a string literal so that it does not run:
# prints train/test accuracy and plots a confusion matrix.
# NOTE(review): it reads `pipe`, `y_pred_train` and `y_pred_class`, which in
# this file are only assigned inside the other disabled pipeline string.
'''
print("------------ Results ------------ \n")
print('Train: {}'.format(metrics.accuracy_score(y_train, y_pred_train)))
print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))

matrix = metrics.confusion_matrix(y_test, y_pred_class, labels = pipe.classes_)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=pipe.classes_)
color = 'white'
disp.plot()
plt.xlabel('Predicted Label', color=color)
plt.ylabel('True Label', color=color)
plt.gcf().axes[0].tick_params(colors=color)
plt.gcf().axes[1].tick_params(colors=color)
plt.show()
'''