In [None]:
import nltk
from nltk.corpus import stopwords
import spacy
import string
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from gensim.models import Word2Vec
import numpy as np

# Fetch the NLTK resources requested by this notebook (stop-word list and Punkt data)
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# Load spaCy's small English pipeline; it provides the POS tags and named entities
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Upload datasets
# NOTE(review): google.colab is presumably only importable inside a Colab runtime,
# which would explain why this import lives in its own cell — confirm before moving it.
from google.colab import files

# Prompt to upload the unzipped file
# The saved output shows propaganda_train.tsv and propaganda_val.tsv being uploaded here.
uploaded = files.upload()


Saving propaganda_train.tsv to propaganda_train.tsv
Saving propaganda_val.tsv to propaganda_val.tsv


In [None]:
def _read_split(path):
    """Read one tab-separated label/sentence file and discard its first row."""
    frame = pd.read_csv(path, sep='\t', header=None, names=['label', 'sentence'])
    # The first row is removed and the index renumbered from zero
    return frame.drop(0).reset_index(drop=True)


# Training and testing splits share the same layout, so they share the same reader
train_data = _read_split('propaganda_train.tsv')
test_data = _read_split('propaganda_val.tsv')

# Preview the first ten rows of each split
for heading, frame in (("Training Data:", train_data), ("\nTesting Data:", test_data)):
    print(heading)
    print(frame.head(10))

Training Data:
             label                                           sentence
0   not_propaganda         No, <BOS> he <EOS> will not be confirmed. 
1   not_propaganda  This declassification effort <BOS> won’t make ...
2      flag_waving  The Obama administration misled the <BOS> Amer...
3   not_propaganda  “It looks like we’re capturing the demise of t...
4   not_propaganda           <BOS> Location: Westerville, Ohio <EOS> 
5  loaded_language  Hitler <BOS> annihilated <EOS> 400,000 Germans...
6   not_propaganda  A federal judge on Monday ordered U.S. immigra...
7   not_propaganda  <BOS> Kirstjen Nielsen (@SecNielsen) <EOS> Nov...
8            doubt  As noted above, at this point literally every ...
9   not_propaganda  Britain doesn't need more hate even just for a...

Testing Data:
                       label  \
0             not_propaganda   
1  causal_oversimplification   
2   appeal_to_fear_prejudice   
3             not_propaganda   
4                 repetition   
5      n

In [None]:
# Map the string labels to integer ids; the mapping is learned from the training labels
# and then applied unchanged to both splits
label_encoder = LabelEncoder().fit(train_data['label'])
train_data['encoded_label'] = label_encoder.transform(train_data['label'])
test_data['encoded_label'] = label_encoder.transform(test_data['label'])

In [None]:
# Custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Turn raw sentences into space-separated ``word_POS`` / ``text_ENTITY`` tokens.

    Each sentence has its ``<BOS>``/``<EOS>`` markers removed, is lowercased and
    run through the module-level spaCy pipeline ``nlp``. Nouns, verbs, adjectives
    and adverbs that are neither stop words nor punctuation are kept and tagged
    with their part of speech; every named entity is appended with an ``_ENTITY``
    suffix.
    """

    # Compiled once; case-insensitive so the markers are matched however they are cased.
    _MARKER_PATTERN = re.compile(r'<BOS>|<EOS>', flags=re.IGNORECASE)

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.punctuation = set(string.punctuation)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Apply ``_preprocess`` to every element of ``X``.

        ``X`` must provide ``.apply`` (e.g. a pandas Series of strings); the
        result of that call is returned.
        """
        return X.apply(self._preprocess)

    def _preprocess(self, text):
        """Return the cleaned, tagged representation of one sentence as a string."""
        # Remove <BOS> and <EOS> tokens FIRST. Previously the text was lowercased
        # before a case-sensitive substitution, so the (by then lowercase) markers
        # were never removed and leaked into the spaCy input.
        text = self._MARKER_PATTERN.sub('', text)

        # Lowercasing
        text = text.lower()

        # Apply spaCy pipeline
        doc = nlp(text)

        # Remove stopwords and punctuation, and retain only relevant words (nouns, verbs, adjectives, adverbs)
        words = [
            f"{token.text}_{token.pos_}" for token in doc
            if token.pos_ in {'NOUN', 'VERB', 'ADJ', 'ADV'}
            and token.text.lower() not in self.stop_words
            and token.text not in self.punctuation
        ]

        # Include named entities in the text
        # NOTE(review): an entity whose text contains spaces will be broken apart by any
        # later whitespace split of the returned string — confirm whether that is intended.
        entities = [f"{ent.text}_ENTITY" for ent in doc.ents]

        # Combine words and entities
        processed_text = words + entities

        # Return preprocessed text
        return ' '.join(processed_text)

# Initialize the text preprocessor
text_preprocessor = TextPreprocessor()


In [None]:
# Add a 'cleaned_sentence' column to both splits using the shared preprocessor
for split_frame in (train_data, test_data):
    split_frame['cleaned_sentence'] = text_preprocessor.transform(split_frame['sentence'])

# Show the first ten rows and the column names of each split to verify the new column
for banner, split_frame in (
    ("\nTraining Data with Cleaned Sentences:", train_data),
    ("\nTesting Data with Cleaned Sentences:", test_data),
):
    print(banner)
    print(split_frame.head(10))
    print(split_frame.columns)


Training Data with Cleaned Sentences:
             label                                           sentence  \
0   not_propaganda         No, <BOS> he <EOS> will not be confirmed.    
1   not_propaganda  This declassification effort <BOS> won’t make ...   
2      flag_waving  The Obama administration misled the <BOS> Amer...   
3   not_propaganda  “It looks like we’re capturing the demise of t...   
4   not_propaganda           <BOS> Location: Westerville, Ohio <EOS>    
5  loaded_language  Hitler <BOS> annihilated <EOS> 400,000 Germans...   
6   not_propaganda  A federal judge on Monday ordered U.S. immigra...   
7   not_propaganda  <BOS> Kirstjen Nielsen (@SecNielsen) <EOS> Nov...   
8            doubt  As noted above, at this point literally every ...   
9   not_propaganda  Britain doesn't need more hate even just for a...   

   encoded_label                                   cleaned_sentence  
0              7                                     confirmed_VERB  
1              7 

In [None]:
# Train a Word2Vec model on the cleaned sentences (each sentence split on whitespace).
# workers=1 together with an explicit seed keeps the learned vectors repeatable between
# runs; with several worker threads the training order varies, so the embeddings (and
# every score computed from them) would change on each execution.
# NOTE(review): gensim also documents PYTHONHASHSEED as necessary for reproducibility
# across interpreter launches — confirm whether that is needed here.
sentences = [sentence.split() for sentence in train_data['cleaned_sentence']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=1, seed=42)

In [None]:
# Custom transformer to generate Word2Vec embeddings
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Represent each sentence by the mean of its known word vectors.

    Sentences are split on whitespace; tokens missing from the model's
    vocabulary are skipped. A sentence with no known token maps to a zero
    vector of length ``word2vec_model.vector_size``.
    """

    def __init__(self, word2vec_model):
        self.word2vec_model = word2vec_model

    def fit(self, X, y=None):
        """Nothing is learned here; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return an array with one averaged vector per sentence in ``X``."""
        keyed_vectors = self.word2vec_model.wv
        rows = []
        for sentence in X:
            known = [keyed_vectors[tok] for tok in sentence.split() if tok in keyed_vectors]
            if not known:
                # Fall back to a single zero vector so the mean is still defined
                known = [np.zeros(self.word2vec_model.vector_size)]
            rows.append(np.mean(known, axis=0))
        return np.array(rows)

In [None]:
# Wrap the trained Word2Vec model in the transformer so it can be used as a pipeline step
word2vec_transformer = Word2VecTransformer(word2vec_model=word2vec_model)

In [None]:
# Embed sentences, oversample with SMOTE, then classify.
# `multi_class` is intentionally not passed: the argument is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial model for
# multiclass targets, so the behaviour is unchanged.
pipeline = ImbPipeline(steps=[
    ('word2vec_transformer', word2vec_transformer),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000, solver='lbfgs'))
])

In [None]:
# Features are the cleaned sentences; targets are the integer-encoded labels
X_train, y_train = train_data['cleaned_sentence'], train_data['encoded_label']
X_test, y_test = test_data['cleaned_sentence'], test_data['encoded_label']

In [None]:
# Search space for RandomizedSearchCV; keys address parameters of the 'classifier' step
param_grid = dict(
    classifier__C=[0.01, 0.1, 1, 10, 100],  # Regularization parameter
    classifier__max_iter=[1000, 1500],  # Different maximum iterations
)

In [None]:
# Tune the pipeline's hyper-parameters with a randomized search,
# scored by macro-averaged F1 with cv=5
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='f1_macro',
    random_state=42,
)
random_search.fit(X_train, y_train)

In [None]:
# Report the hyper-parameter combination selected by the search
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'classifier__solver': 'lbfgs', 'classifier__max_iter': 1000, 'classifier__C': 100}


In [None]:
# Score the best pipeline found by the search on the training set, using cv=5 and macro F1
cv_scores = cross_val_score(
    estimator=random_search.best_estimator_,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='f1_macro',
)

In [None]:
# Show the individual cross-validation scores followed by their average
mean_cv_f1 = cv_scores.mean()
print("Cross-Validation Scores:", cv_scores)
print("Mean CV F1 Score:", mean_cv_f1)


Cross-Validation Scores: [0.10828554 0.12634774 0.12623642 0.13570348 0.15668283]
Mean CV F1 Score: 0.13065120050062634


In [None]:
# Use the best model to predict on the test set
# `best_model` is the estimator selected by the randomized search; its predictions
# for the cleaned test sentences are kept in `test_predictions` for the report below.
best_model = random_search.best_estimator_
test_predictions = best_model.predict(X_test)


In [None]:
# Per-class precision, recall and F1 of the tuned model on the test split,
# with rows named after the original string labels
print("Test Set Classification Report:")
test_report = classification_report(
    y_test,
    test_predictions,
    target_names=label_encoder.classes_,
)
print(test_report)

Test Set Classification Report:
                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.10      0.09      0.10        43
causal_oversimplification       0.08      0.16      0.11        31
                    doubt       0.16      0.26      0.20        38
exaggeration,minimisation       0.03      0.04      0.03        28
              flag_waving       0.19      0.44      0.26        39
          loaded_language       0.00      0.00      0.00        37
    name_calling,labeling       0.12      0.29      0.17        31
           not_propaganda       0.65      0.25      0.36       301
               repetition       0.04      0.06      0.05        32

                 accuracy                           0.21       580
                macro avg       0.15      0.18      0.14       580
             weighted avg       0.38      0.21      0.24       580



Results Without Cross-Validation

In [None]:
# Baseline: fit the untuned pipeline on the training split and predict the test split.
# `predictions` was not defined anywhere else in the notebook, so this cell failed with
# a NameError on a fresh kernel.
# NOTE(review): refitting the default pipeline is assumed to be what produced the saved
# output under this heading — confirm against the original experiment.
predictions = pipeline.fit(X_train, y_train).predict(X_test)

# Print out the classification report
print(classification_report(y_test, predictions, target_names=label_encoder.classes_))

                           precision    recall  f1-score   support

 appeal_to_fear_prejudice       0.50      0.02      0.04        43
causal_oversimplification       0.10      0.42      0.16        31
                    doubt       0.14      0.29      0.19        38
exaggeration,minimisation       0.67      0.07      0.13        28
              flag_waving       0.10      0.10      0.10        39
          loaded_language       0.00      0.00      0.00        37
    name_calling,labeling       0.06      0.16      0.09        31
           not_propaganda       0.58      0.45      0.51       301
               repetition       0.12      0.03      0.05        32

                 accuracy                           0.30       580
                macro avg       0.25      0.17      0.14       580
             weighted avg       0.40      0.30      0.31       580

