<a href="https://colab.research.google.com/github/athishr88/PCLDetection/blob/main/PCL_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
# Load the tab-separated file, skipping its first four lines and naming the
# six columns explicitly; the text column is then lower-cased.
df_main_lower = pd.read_csv(
    "dontpatronizeme_pcl.tsv",
    sep="\t",
    skiprows=4,
    names=["par_ID", "art_ID", "Keyword", "country_code", "text", "Label"],
)
df_main_lower["text"] = df_main_lower["text"].str.lower()
df_main_lower

Unnamed: 0,par_ID,art_ID,Keyword,country_code,text,Label
0,1,@@24942188,hopeless,ph,"we 're living in times of absolute insanity , ...",0
1,2,@@21968160,migrant,gh,"in libya today , there are countless number of...",0
2,3,@@16584954,immigrant,ie,white house press secretary sean spicer said t...,0
3,4,@@7811231,disabled,nz,council customers only signs would be displaye...,0
4,5,@@1494111,refugee,ca,""" just like we received migrants fleeing el sa...",0
...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,sri lankan norms and culture inhibit women fro...,1
10465,10466,@@70091353,vulnerable,ph,he added that the afp will continue to bank on...,0
10466,10467,@@20282330,in-need,ng,""" she has one huge platform , and information ...",3
10467,10468,@@16753236,hopeless,in,""" anja ringgren loven i ca n't find a word to ...",4


In [44]:
# Show the full (untruncated) text of the row at position 10467.
df_main_lower["text"].iloc[10467]

'" anja ringgren loven i ca n\'t find a word to describe how i feel for you .... may god almighty keep blessing you and always give you strength and sound health to continue your good work ..... you gave hope to the hopeless ! ! ! ! have so much respect for you .. stay blessed my good fellow ... " says one commenter on facebook . " god bless you and your mission . glad to see hope ( and all the children ) growing up loved , well fed , happy , having fun , and going to school , " says another .'

In [24]:
# Load datasets
# Positional split: the first TRAIN_SIZE rows are used for training and all
# remaining rows for testing (no shuffling is applied here).
TRAIN_SIZE = 5500

train_samples = df_main_lower.text[:TRAIN_SIZE]
test_samples = df_main_lower.text[TRAIN_SIZE:]

train_labels = df_main_lower.Label[:TRAIN_SIZE]
test_labels = df_main_lower.Label[TRAIN_SIZE:]

## Distinct label values seen in the training split, sorted so that the order
## is deterministic rather than dependent on set iteration order
classes = sorted(set(train_labels))
classes

[0, 1, 2, 3, 4]

In [25]:
# Count how many test examples carry each label, most frequent first.
# The tally gets its own name: rebinding `Counter` to an instance would shadow
# the class imported at the top of the notebook and break any later
# `Counter(...)` call.
label_counts = Counter(test_labels)
most_occur = label_counts.most_common(6)
most_occur

[(0, 4033), (1, 455), (3, 217), (4, 197), (2, 67)]

In [26]:
# Flatten the training texts into one list of tokens: every sample is passed
# through str(), the results are joined with single spaces, and the combined
# string is split on single spaces again.
all_tokens = " ".join(str(sample) for sample in train_samples).split(" ")

In [27]:
## Build the text vectorizer and fit its vocabulary on the training tokens
import tensorflow as tf
from keras.layers import TextVectorization

vectorizer = TextVectorization(
    max_tokens=10000,
    output_sequence_length=100,
)

## The tokens are handed to adapt() in batches of 128
text_ds = tf.data.Dataset.from_tensor_slices(all_tokens).batch(128)
vectorizer.adapt(text_ds)

In [28]:
## Print the vocabulary size, then show the first four vocabulary entries
print(len(vectorizer.get_vocabulary())) ## We set max_tokens=10000
vectorizer.get_vocabulary()[:4]

10000


['', '[UNK]', 'the', 'to']

In [29]:
## Map every vocabulary entry to its integer position in the vocabulary list
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [30]:
from keras.preprocessing.sequence import pad_sequences

# Turn the raw texts into integer sequences; each sample is wrapped in a
# one-element list so the vectorizer receives a column of strings.
x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_test = vectorizer(np.array([[s] for s in test_samples])).numpy()

# Targets must line up row-for-row with their inputs, so the training targets
# are taken from train_labels (the same slice as train_samples) rather than
# from the full Label column.
y_train = np.array(train_labels, dtype="int")
y_test = np.array(test_labels, dtype="int")

In [31]:
# Sanity check: list the distinct label values present in the test targets.
np.unique(y_test)

array([0, 1, 2, 3, 4])

In [32]:
## Read the embeddings in the pretrained model (we are using the 100D version of GloVe)
import os
path_to_glove_file = "glove.6B.100d.txt"

# Maps each word to its coefficient vector (a float32 NumPy array).
embeddings_index = {}
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which differs between systems and can fail on non-ASCII tokens.
with open(path_to_glove_file, encoding="utf8") as f:
    for line in f:
        # The first whitespace-separated field is the word; the rest of the
        # line holds the space-separated coefficients.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        # Keep only rows that parsed into exactly 100 coefficients.
        if len(coefs) == 100:
            embeddings_index[word] = coefs


print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [33]:
## Build "embedding_matrix": the row at a word's vocabulary index holds that word's GloVe vector
embedding_dim = 100 ## 100 dimensions
num_tokens = len(voc)
hits = misses = 0 ## words found in / missing from the pretrained model

# Start from all zeros; rows are filled in only for words with a known vector
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pretrained vector for this entry: its row stays all-zeros.
        misses += 1
        continue
    embedding_matrix[row] = vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 9751 words (249 misses)


In [34]:
## Embedding layer for the training model.
## It is initialised from embedding_matrix and marked non-trainable so the
## pretrained vectors are not updated during training.

from keras.initializers import Constant
from keras.layers import Embedding

embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)

In [35]:
# Per-class weights keyed by label id (0..4), passed to model.fit as
# `class_weight`; label 0 keeps weight 1 and the other labels are up-weighted.
class_weight = dict(enumerate((1, 8, 40, 20, 30)))

In [36]:
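# Dependency install, left disabled: uncomment and run once if `tensorflow_addons` is not installed in this environment.
# !pip install tensorflow_addons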

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
# Training different models
from keras.layers import SimpleRNN, GRU, LSTM, Dropout
from keras.metrics import Precision, Recall
from keras import layers, Input, Model
from sklearn import metrics
import tensorflow_addons as tfa

# Input: variable-length sequences of integer token ids, looked up in the
# non-trainable embedding layer.
int_sequences_input = Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Two stacked bidirectional LSTMs: the first returns the full sequence so the
# second can consume it; the second returns only its final output.
x = layers.Bidirectional(LSTM(200, return_sequences=True))(embedded_sequences)
x = layers.Bidirectional(LSTM(200))(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(50, activation="relu")(x)
# One softmax output unit per class.
preds = layers.Dense(len(classes), activation="softmax")(x)
model = Model(int_sequences_input, preds)

model.summary()

## Compile and train; the labels are integer class ids, hence the sparse loss
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["acc"],
)
model.fit(
    x_train,
    y_train,
    batch_size=128,
    class_weight=class_weight,
    epochs=20,
    validation_split=0.2,
)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 100)         1000000   
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 400)        481600    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 400)              961600    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 400)               0         
                                                                 
 dense_2 (Dense)             (None, 50)                2005

<keras.callbacks.History at 0x7fcf7960c610>

In [38]:
# Model outputs for every test sequence (one row of class scores per sample).
y = model.predict(x_test)

# Predicted class for each sample = index of its largest score.
pred_class = [np.argmax(scores) for scores in y]

In [39]:
from sklearn.metrics import f1_score

# F1 score of the predictions against the test targets, using the
# "weighted" averaging mode.
F1 = f1_score(y_true=y_test, y_pred=pred_class, average="weighted")

In [40]:
# Display the weighted F1 score stored in F1.
F1

0.6273016530945846