In [None]:
import nltk
from nltk.corpus import stopwords
import spacy
import string
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.utils import to_categorical
from gensim.models import Word2Vec
import numpy as np

# Fetch the NLTK stopword list and the Punkt tokenizer data
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# spaCy's small English pipeline provides the POS tags and named entities used later
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Upload datasets
# The upload prompt is only shown when one of the data files is missing, so a
# "Restart & Run All" (or a run outside Colab with the files already in place)
# neither blocks on user interaction nor fails on the Colab-only import.
import os

required_files = ('propaganda_train.tsv', 'propaganda_val.tsv')
missing_files = [name for name in required_files if not os.path.exists(name)]

if missing_files:
    from google.colab import files

    # Prompt to upload the unzipped file
    uploaded = files.upload()
else:
    # Nothing to upload; keep `uploaded` defined so the name always exists
    uploaded = {}


Saving propaganda_train.tsv to propaganda_train.tsv
Saving propaganda_val.tsv to propaganda_val.tsv


In [None]:
def _read_split(path):
    """Read one tab-separated split into (label, sentence) columns and discard its first row."""
    frame = pd.read_csv(path, sep='\t', header=None, names=['label', 'sentence'])
    # The first row is removed and the index renumbered from 0
    return frame.drop(index=0).reset_index(drop=True)


# Training split, then the validation file that serves as the test split
train_data = _read_split('propaganda_train.tsv')
test_data = _read_split('propaganda_val.tsv')

# Preview the first ten rows of each split
print("Training Data:")
print(train_data.head(10))

print("\nTesting Data:")
print(test_data.head(10))

Training Data:
             label                                           sentence
0   not_propaganda         No, <BOS> he <EOS> will not be confirmed. 
1   not_propaganda  This declassification effort <BOS> won’t make ...
2      flag_waving  The Obama administration misled the <BOS> Amer...
3   not_propaganda  “It looks like we’re capturing the demise of t...
4   not_propaganda           <BOS> Location: Westerville, Ohio <EOS> 
5  loaded_language  Hitler <BOS> annihilated <EOS> 400,000 Germans...
6   not_propaganda  A federal judge on Monday ordered U.S. immigra...
7   not_propaganda  <BOS> Kirstjen Nielsen (@SecNielsen) <EOS> Nov...
8            doubt  As noted above, at this point literally every ...
9   not_propaganda  Britain doesn't need more hate even just for a...

Testing Data:
                       label  \
0             not_propaganda   
1  causal_oversimplification   
2   appeal_to_fear_prejudice   
3             not_propaganda   
4                 repetition   
5      n

In [None]:
# Learn the label -> integer mapping from the training labels only,
# then apply that same mapping to both splits
label_encoder = LabelEncoder().fit(train_data['label'])
train_data['encoded_label'] = label_encoder.transform(train_data['label'])
test_data['encoded_label'] = label_encoder.transform(test_data['label'])

In [None]:
# Custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Turn raw sentences into space-separated ``text_POS`` / ``text_ENTITY`` tokens.

    For each sentence the ``<BOS>``/``<EOS>`` span markers are stripped, the text is
    lowercased and passed through the module-level spaCy pipeline ``nlp``. Nouns,
    verbs, adjectives and adverbs that are neither English stopwords nor single
    punctuation characters are kept as ``text_POS``; every named entity reported by
    spaCy is appended as ``text_ENTITY``.
    """

    # Span markers, matched in any letter case
    _SPAN_MARKER_RE = re.compile(r'<(?:BOS|EOS)>', re.IGNORECASE)

    # Coarse POS tags that are retained
    _KEPT_POS = frozenset({'NOUN', 'VERB', 'ADJ', 'ADV'})

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.punctuation = set(string.punctuation)

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Apply ``_preprocess`` to every element of the pandas Series ``X``."""
        return X.apply(self._preprocess)

    def _preprocess(self, text):
        """Return the cleaned, tagged representation of a single sentence.

        Non-string input (e.g. a NaN produced by an empty field) yields ``''``
        instead of raising on ``.lower()``.
        """
        if not isinstance(text, str):
            return ''

        # Remove <BOS> and <EOS> tokens. This must not depend on letter case: the
        # previous order (lowercase first, then a case-sensitive '<BOS>|<EOS>'
        # pattern) could never match, so the markers leaked into the spaCy input.
        # A space is substituted so words adjacent to a marker are not glued together.
        text = self._SPAN_MARKER_RE.sub(' ', text)

        # Collapse the whitespace left behind by the removed markers, then lowercase
        text = ' '.join(text.split()).lower()

        # Apply spaCy pipeline
        doc = nlp(text)

        # Remove stopwords and punctuation, and retain only relevant words
        # (nouns, verbs, adjectives, adverbs), each suffixed with its POS tag
        words = [
            f"{token.text}_{token.pos_}" for token in doc
            if token.pos_ in self._KEPT_POS
            and token.text.lower() not in self.stop_words
            and token.text not in self.punctuation
        ]

        # Include named entities in the text
        entities = [f"{ent.text}_ENTITY" for ent in doc.ents]

        # Combine words and entities into one space-separated string
        return ' '.join(words + entities)

# Initialize the text preprocessor
text_preprocessor = TextPreprocessor()


In [None]:
# Add a `cleaned_sentence` column to each split (training first, then testing)
for split_frame in (train_data, test_data):
    split_frame['cleaned_sentence'] = text_preprocessor.transform(split_frame['sentence'])

# Show the first rows and the column list of each split to confirm the new column
for heading, split_frame in (
    ("\nTraining Data with Cleaned Sentences:", train_data),
    ("\nTesting Data with Cleaned Sentences:", test_data),
):
    print(heading)
    print(split_frame.head(10))
    print(split_frame.columns)


Training Data with Cleaned Sentences:
             label                                           sentence  \
0   not_propaganda         No, <BOS> he <EOS> will not be confirmed.    
1   not_propaganda  This declassification effort <BOS> won’t make ...   
2      flag_waving  The Obama administration misled the <BOS> Amer...   
3   not_propaganda  “It looks like we’re capturing the demise of t...   
4   not_propaganda           <BOS> Location: Westerville, Ohio <EOS>    
5  loaded_language  Hitler <BOS> annihilated <EOS> 400,000 Germans...   
6   not_propaganda  A federal judge on Monday ordered U.S. immigra...   
7   not_propaganda  <BOS> Kirstjen Nielsen (@SecNielsen) <EOS> Nov...   
8            doubt  As noted above, at this point literally every ...   
9   not_propaganda  Britain doesn't need more hate even just for a...   

   encoded_label                                   cleaned_sentence  
0              7                                     confirmed_VERB  
1              7 

In [None]:
# Tokenize the cleaned sentences.
# `cleaned_sentence` is already lowercased and its tokens carry "_POS" / "_ENTITY"
# suffixes. Keras' default `filters` contain "_" and the default `lower=True`
# lowercases everything, which would split "confirmed_VERB" into "confirmed" and
# "verb": the tags would be lost and the vocabulary would no longer match the
# whitespace-delimited tokens of `cleaned_sentence`. Only tabs and newlines are
# mapped to the separator here, so every whitespace-delimited token is kept verbatim.
tokenizer = Tokenizer(filters='\t\n', lower=False)
tokenizer.fit_on_texts(train_data['cleaned_sentence'])
X_train_seq = tokenizer.texts_to_sequences(train_data['cleaned_sentence'])
X_test_seq = tokenizer.texts_to_sequences(test_data['cleaned_sentence'])

In [None]:
# Pad both splits to the length of the longest training sequence
max_sequence_length = max(map(len, X_train_seq))
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (X_train_seq, X_test_seq)
)

In [None]:
# Vocabulary size: one slot per word in the tokenizer's index, plus one extra
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Train a Word2Vec model on the cleaned sentences.
# The token lists are rebuilt from the fitted Keras tokenizer's own sequences instead
# of `str.split()`, so every Word2Vec vocabulary key is spelled exactly like the
# corresponding key in `tokenizer.word_index`. Whitespace-splitting the raw text can
# produce differently spelled tokens (case, filtered characters); the later
# `word in word2vec_model.wv` lookup then finds nothing and the embedding rows stay zero.
sentences = [
    [tokenizer.index_word[token_id] for token_id in sequence]
    for sequence in X_train_seq
]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
# Build the (vocab_size x embedding_dim) matrix: row i holds the Word2Vec vector of
# the word with tokenizer index i; rows without a Word2Vec vector stay all-zero
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
keyed_vectors = word2vec_model.wv
for token, row in tokenizer.word_index.items():
    if token not in keyed_vectors:
        continue
    embedding_matrix[row] = keyed_vectors[token]

LSTM Model

In [None]:
# Input layer: one padded sequence of integer word indices per sample
word_input = Input(shape=(max_sequence_length,), dtype='int32')

# Embedding layer, initialised from the Word2Vec matrix and fine-tuned during training.
# `mask_zero=True` marks index 0 (the value `pad_sequences` pads with) as padding so
# the LSTM layers skip those timesteps instead of treating them as real tokens.
word_embedding = Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           weights=[embedding_matrix],
                           input_length=max_sequence_length,
                           mask_zero=True,
                           trainable=True)(word_input)

# LSTM layers: the first returns the full sequence so the second can consume it;
# the second returns only its final output vector
lstm_out = LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(word_embedding)
lstm_out = LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2)(lstm_out)

# Dense layer followed by dropout
dense_out = Dense(64, activation='relu')(lstm_out)
dense_out = Dropout(0.5)(dense_out)

# Output layer: one softmax unit per class known to the label encoder
output = Dense(len(label_encoder.classes_), activation='softmax')(dense_out)

In [None]:
# Assemble the functional model from the input and output tensors, then compile it
# for multi-class classification on one-hot targets
model = Model(inputs=word_input, outputs=output)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Convert labels to categorical (one-hot).
# The width is fixed to the number of classes known to the label encoder. Without
# `num_classes`, `to_categorical` infers the width from the largest label present in
# each split, so a split that lacks the highest-numbered class would get a narrower
# matrix than the model's softmax output.
num_classes = len(label_encoder.classes_)
y_train_cat = to_categorical(train_data['encoded_label'], num_classes=num_classes)
y_test_cat = to_categorical(test_data['encoded_label'], num_classes=num_classes)

In [None]:
# Fit for 10 epochs in mini-batches of 32, holding out 20% of the training data for validation
model.fit(
    x=X_train_pad,
    y=y_train_cat,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x793230dbac20>

In [None]:
# Score the trained model on the test split; `evaluate` returns the loss followed by
# the compiled metric (accuracy)
evaluation = model.evaluate(X_test_pad, y_test_cat)
test_loss, test_accuracy = evaluation
print(f"Test accuracy: {test_accuracy}")

Test accuracy: 0.39827585220336914


In [None]:
# Class probabilities for every test sample, then the index of the most probable class
y_pred = model.predict(X_test_pad)
y_pred_classes = y_pred.argmax(axis=1)



In [None]:
# Print out the classification report.
# `labels` is passed explicitly so that each entry of `target_names` is paired with its
# own encoded class id even when a class is missing from both the true and the
# predicted labels; otherwise the report would have fewer rows than names.
# `zero_division=0` reports 0 (without a warning) for classes that were never predicted.
all_class_ids = list(range(len(label_encoder.classes_)))
print(classification_report(
    test_data['encoded_label'],
    y_pred_classes,
    labels=all_class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
))

                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.05      0.02      0.03        43
causal_oversimplification       0.17      0.19      0.18        31
                    doubt       0.33      0.08      0.13        38
exaggeration,minimisation       0.07      0.04      0.05        28
              flag_waving       0.23      0.46      0.31        39
          loaded_language       0.05      0.05      0.05        37
    name_calling,labeling       0.19      0.16      0.18        31
           not_propaganda       0.70      0.63      0.66       301
               repetition       0.07      0.19      0.10        32

                 accuracy                           0.40       580
                macro avg       0.21      0.20      0.19       580
             weighted avg       0.44      0.40      0.41       580

