In [1]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Flatten, Input, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import re
import string

### Read data

In [2]:
data = pd.read_csv('../Data/ver1.csv', index_col=0)
data.drop(["index"], inplace = True, axis=1)
data.columns = ['Text', 'oh_label']

In [3]:
# 1 it's represent negative and 0 for positive
pos = []
neg = []
for l in data.oh_label:
    if l == 0:
        pos.append(1)
        neg.append(0)
    elif l == 1:
        pos.append(0)
        neg.append(1)
data['Pos']= pos
data['Neg']= neg

In [4]:
def remove_punct(text):
    text_nopunct = ''
    text_nopunct = re.sub('['+string.punctuation+']', '', text)
    return text_nopunct

data['Text_Clean'] = data['Text'].apply(lambda x: remove_punct(x))
from nltk import word_tokenize, WordNetLemmatizer
tokens = [word_tokenize(sen) for sen in data.Text_Clean]
def lower_token(tokens): 
    return [w.lower() for w in tokens]    
    
lower_tokens = [lower_token(token) for token in tokens]
from nltk.corpus import stopwords
stoplist = stopwords.words('english') 
def remove_stop_words(tokens):
    return [word for word in tokens if word not in stoplist]
filtered_words = [remove_stop_words(sen) for sen in lower_tokens] 
result = [' '.join(sen) for sen in filtered_words]

data['Text_Final'] = result
data['tokens'] = filtered_words
data = data[['Text_Final', 'tokens', 'oh_label', 'Pos', 'Neg']]
# tokens

In [5]:
data.head()

Unnamed: 0,Text_Final,tokens,oh_label,Pos,Neg
0,im 12 understand perfectly learn english inste...,"[im, 12, understand, perfectly, learn, english...",1.0,0,1
1,mkr boy fourinhandyou sure know dish insult sa...,"[mkr, boy, fourinhandyou, sure, know, dish, in...",0.0,1,0
2,fuck site full stuck cunts unknown reason thin...,"[fuck, site, full, stuck, cunts, unknown, reas...",1.0,0,1
3,dont disagree point except im exactly trying a...,"[dont, disagree, point, except, im, exactly, t...",0.0,1,0
4,look like give fuck u sad fucks nothing better...,"[look, like, give, fuck, u, sad, fucks, nothin...",1.0,0,1


### Split data into test and train

In [27]:
data_train, data_test = train_test_split(data, test_size=0.30, random_state=42)
data_test, data_test_final = train_test_split(data_test, test_size=0.50, random_state=42)

In [28]:
x_neg_weight = data_train[data_train["oh_label"]==1].shape
x_pos_weight = data_train[data_train["oh_label"]==0].shape
x_neg_weight[0]/x_pos_weight[0]
weights = {0: (x_neg_weight[0]/(x_pos_weight[0]+x_neg_weight[0])),1: (x_pos_weight[0]/(x_pos_weight[0]+x_neg_weight[0]))}

In [12]:
def get_vocab(df,key):
    all_training_words = [word for tokens in df[key] for word in tokens]
    training_sentence_lengths = [len(tokens) for tokens in df[key]]
    vocab = sorted(list(set(all_training_words)))
    print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(vocab)))
    print("Max sentence length is %s" % max(training_sentence_lengths))
    return vocab
TRAINING_VOCAB = get_vocab(data_train,"tokens")
TEST_VOCAB = get_vocab(data_test,"tokens")

2290656 words total, with a vocabulary size of 131067
Max sentence length is 2481
483005 words total, with a vocabulary size of 50477
Max sentence length is 2494


### Load Google News Word2Vec model

In [None]:
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [22]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 50

### Tokenize and Pad sequences

In [22]:
import tensorflow as tf
def to_one_hot(df,vocb_len,TEXT_COLUMN_NAME):
    tokenizer = Tokenizer(num_words=vocb_len, lower=True, char_level=False)
    tokenizer.fit_on_texts(df[TEXT_COLUMN_NAME].tolist())
    training_sequences = tokenizer.texts_to_sequences(df[TEXT_COLUMN_NAME].tolist())

In [30]:
data_train

Unnamed: 0,Text_Final,tokens,oh_label,Pos,Neg
76513,please stop removing content wikipedia conside...,"[please, stop, removing, content, wikipedia, c...",0.0,1,0
60406,adult discussion please found today comments p...,"[adult, discussion, please, found, today, comm...",0.0,1,0
27322,wikinews demo running hi im writing let know w...,"[wikinews, demo, running, hi, im, writing, let...",0.0,1,0
53699,message sponsors,"[message, sponsors]",0.0,1,0
65412,cant figure conversation even first place with...,"[cant, figure, conversation, even, first, plac...",0.0,1,0
...,...,...,...,...,...
6265,howdy hows weather los angeles,"[howdy, hows, weather, los, angeles]",0.0,1,0
54886,almost links provided good sources tretiakov h...,"[almost, links, provided, good, sources, treti...",0.0,1,0
76820,rt emilyylam 14 teams screen means one good th...,"[rt, emilyylam, 14, teams, screen, means, one,...",0.0,1,0
860,german reich 1935–1945svg,"[german, reich, 1935–1945svg]",0.0,1,0


In [31]:
# Create a custom standardization function to strip HTML break tags '<br />'.
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,'[%s]' % re.escape(string.punctuation), '')


# Vocabulary size and number of words in a sequence.
vocab_size = 10000
sequence_length = 100

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
vectorize_layer.adapt(data_train["Text_Final"])

In [34]:
print(data_train["Text_Final"][0])
vectorize_layer.apply(data_train["Text_Final"])[0]

im 12 understand perfectly learn english instead making us dumb




<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([  10,   50,  352,  128,    4,  362,   84,   27, 1517,   10,   25,
        729,   80,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int64)>

In [None]:
# Lambda(lambda x: tf.one_hot(x[:,0], len(set(X))))(inp)

In [20]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 130531 unique tokens.


In [23]:
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)
train_cnn_data.shape

(70000, 50)

In [24]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

NameError: name 'word2vec' is not defined

In [28]:
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_cnn_data[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     3,   255,    50,  1219,
          41,     1,   348,    52,   697,  8499,    19,   561,   764,
         808,     5,   157,   734,  1050,  2729,   601,  1042,  2401,
         692,  3201,     5, 22842,  4007])

### Define CNN

In [36]:
label_names = ['Pos', 'Neg']

In [37]:
y_train = data_train[label_names].values

In [38]:
x_train = train_cnn_data
y_tr = y_train

In [39]:
from keras.layers import Conv1D,MaxPool1D,Flatten
import tensorflow as tf

In [40]:
def cnn(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    conv1d = Conv1D(20,7,strides=1, use_bias=True,padding="valid")(embedded_sequences)
    max_pool = MaxPool1D(pool_size=3)(conv1d)
    flattened = Flatten()(max_pool)
    x1 = Dense(128, activation='relu')(flattened)
    x1 = Dropout(0.2)(x1)
    x2 = Dense(64, activation='relu')(x1)
    preds = Dense(labels_index, activation='sigmoid')(x2)
    model = Model(sequence_input, preds)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),metrics=["accuracy",tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])
    model.summary()
    return model

In [41]:
model = cnn(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 50, 300)           39159600  
_________________________________________________________________
conv1d (Conv1D)              (None, 44, 20)            42020     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 14, 20)            0         
_________________________________________________________________
flatten (Flatten)            (None, 280)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               35968     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0     

### Train CNN

In [42]:
num_epochs = 100
batch_size = 150

In [59]:
# callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=20)
hist = model.fit(x_train, y_tr, epochs=10, validation_split=0.1, shuffle=True,class_weight=weights, batch_size=batch_size,use_multiprocessing=True,workers=8)
hist

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e3288143d0>

In [None]:
from sklearn.metrics import confusion_matrix,classification_report
predictions = model.predict(test_cnn_data)
prediction_labels=[]
labels = [0, 1]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])
print(classification_report(data_test.oh_label,prediction_labels))

In [46]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
hist2 = model.fit(x_train, y_tr, epochs=100, validation_split=0.1, shuffle=True,class_weight=weights, batch_size=batch_size,use_multiprocessing=True,workers=8,callbacks=[callback])
hist2

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1ffe2dbec40>

In [48]:
from sklearn.metrics import confusion_matrix,classification_report
predictions = model.predict(test_cnn_data)
prediction_labels=[]
labels = [0, 1]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])
print(classification_report(data_test.oh_label,prediction_labels))

              precision    recall  f1-score   support

         0.0       0.92      0.85      0.88     11039
         1.0       0.66      0.78      0.72      3961

    accuracy                           0.84     15000
   macro avg       0.79      0.82      0.80     15000
weighted avg       0.85      0.84      0.84     15000



In [49]:
embeddings, max_sequence_length, num_words, embedding_dim, labels_index = train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, len(list(label_names))
embedding_layer = Embedding(num_words,
                        embedding_dim,
                        weights=[embeddings],
                        input_length=max_sequence_length,
                        trainable=False)

sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

conv1d = Conv1D(20,7,strides=1, use_bias=True,padding="valid")(embedded_sequences)
max_pool = MaxPool1D(pool_size=3)(conv1d)
flattened = Flatten()(max_pool)
x1 = Dropout(0.5)(flattened)
x2 = Dense(64, activation='relu')(x1)
x2 = Dropout(0.2)(x2)
preds = Dense(labels_index, activation='sigmoid')(x2)
model = Model(sequence_input, preds)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),metrics=["accuracy",tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 300)           39159600  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 44, 20)            42020     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 14, 20)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 280)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 280)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                1798

In [50]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)
hist3 = model.fit(x_train, y_tr, epochs=1000, validation_split=0.1, shuffle=True,class_weight=weights, batch_size=batch_size,use_multiprocessing=True,workers=8,callbacks=[callback])
hist3

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0x1ffe2d3ebb0>