In [None]:
# Install necessary libraries
!pip install nltk spacy scikit-learn
!python -m spacy download en_core_web_sm

import nltk
from nltk.corpus import stopwords
import spacy
import string
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fetch the NLTK stopword corpus and the punkt tokenizer data
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# spaCy's small English pipeline supplies the POS tags and named entities
nlp = spacy.load('en_core_web_sm')


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 75.8 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Upload datasets
import os

from google.colab import files

# The two TSV files that the loading cell reads from the working directory.
DATASET_FILES = ('propaganda_train.tsv', 'propaganda_val.tsv')

# Only prompt for an upload when at least one file is missing, so re-running
# the notebook does not block on the upload dialog once the data is in place.
missing_files = [name for name in DATASET_FILES if not os.path.exists(name)]
uploaded = files.upload() if missing_files else {}

Saving propaganda_train.tsv to propaganda_train.tsv
Saving propaganda_val.tsv to propaganda_val.tsv


In [None]:
# Load the training and testing data.
# Both TSV files begin with a header line ("label", "tagged_in_context").
# header=0 consumes that line while `names` replaces it with our own column
# names; with header=None the header line was read as an ordinary data row
# (and later counted as a 'propaganda' example).

# Load the training data
train_data = pd.read_csv('propaganda_train.tsv', sep='\t', header=0, names=['label', 'sentence'])

# Load the testing data
test_data = pd.read_csv('propaganda_val.tsv', sep='\t', header=0, names=['label', 'sentence'])


In [None]:
# Preview the top ten training rows beneath a heading
print("Training Data:", train_data.head(10), sep="\n")

Training Data:
             label                                           sentence
0            label                                  tagged_in_context
1   not_propaganda         No, <BOS> he <EOS> will not be confirmed. 
2   not_propaganda  This declassification effort <BOS> won’t make ...
3      flag_waving  The Obama administration misled the <BOS> Amer...
4   not_propaganda  “It looks like we’re capturing the demise of t...
5   not_propaganda           <BOS> Location: Westerville, Ohio <EOS> 
6  loaded_language  Hitler <BOS> annihilated <EOS> 400,000 Germans...
7   not_propaganda  A federal judge on Monday ordered U.S. immigra...
8   not_propaganda  <BOS> Kirstjen Nielsen (@SecNielsen) <EOS> Nov...
9            doubt  As noted above, at this point literally every ...


In [None]:
# Preview the top ten testing rows beneath a heading (preceded by a blank line)
print("\nTesting Data:", test_data.head(10), sep="\n")


Testing Data:
                       label  \
0                      label   
1             not_propaganda   
2  causal_oversimplification   
3   appeal_to_fear_prejudice   
4             not_propaganda   
5                 repetition   
6      name_calling,labeling   
7            loaded_language   
8             not_propaganda   
9                flag_waving   

                                            sentence  
0                                  tagged_in_context  
1  On average, between 300 and 600 infections are...  
2  Mostly because <BOS> the country would not las...  
3  Lyndon Johnson <BOS> gets Earl Warren and Sen....  
4           <BOS> You <EOS> may opt out at anytime.   
5  It must be exacted from him directly in order ...  
6  Is it any wonder that priests and laity alike ...  
7  Health workers have been asked to work with co...  
8       The Best of <BOS> Jacob <EOS> G. Hornberger   
9  Trump began his remarks by setting out <BOS> I...  


In [None]:
# Collapse the fine-grained technique labels into two classes:
# 'not_propaganda' stays as it is, every other label becomes 'propaganda'
def to_binary_label(label):
    """Map a technique label onto 'not_propaganda' or 'propaganda'."""
    if label == 'not_propaganda':
        return 'not_propaganda'
    return 'propaganda'


for frame in (train_data, test_data):
    frame['binary_label'] = frame['label'].apply(to_binary_label)


In [None]:
# Custom transformer for text preprocessing
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Turn raw sentences into space-separated POS-tagged words plus named entities.

    Each sentence has its ``<BOS>``/``<EOS>`` span markers removed, is
    lowercased, and is run through the module-level spaCy pipeline ``nlp``.
    Tokens tagged NOUN, VERB, ADJ or ADV that are neither NLTK English
    stopwords nor single punctuation characters are emitted as ``text_POS``;
    every named entity found by spaCy is appended as ``text_ENTITY``.
    """

    def __init__(self):
        # NLTK English stopwords and ASCII punctuation, kept as sets for
        # fast membership tests inside _preprocess.
        self.stop_words = set(stopwords.words('english'))
        self.punctuation = set(string.punctuation)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Apply :meth:`_preprocess` to every element of ``X``.

        ``X`` must provide ``.apply`` (e.g. a pandas Series of strings); the
        result of that call is returned.
        """
        return X.apply(self._preprocess)

    def _preprocess(self, text):
        """Return the cleaned, feature-tagged form of a single sentence string."""
        # Remove <BOS> and <EOS> tokens. This must happen BEFORE lowercasing:
        # the pattern is case-sensitive, so once the text is lowercased the
        # markers read "<bos>"/"<eos>", never match, and leak into the features.
        text = re.sub('<BOS>|<EOS>', '', text)

        # Lowercasing
        # NOTE(review): spaCy receives lowercased text, which may affect its
        # POS/NER predictions for proper nouns - confirm this is intended.
        text = text.lower()

        # Apply spaCy pipeline
        doc = nlp(text)

        # Remove stopwords and punctuation, and retain only content words
        # (nouns, verbs, adjectives, adverbs), each suffixed with its POS tag
        words = [
            f"{token.text}_{token.pos_}" for token in doc
            if token.pos_ in {'NOUN', 'VERB', 'ADJ', 'ADV'}
            and token.text.lower() not in self.stop_words
            and token.text not in self.punctuation
        ]

        # Include named entities in the text
        entities = [f"{ent.text}_ENTITY" for ent in doc.ents]

        # Combine words and entities
        processed_text = words + entities

        # Return preprocessed text
        return ' '.join(processed_text)

In [None]:
# Build one preprocessor and reuse it for both data frames
text_preprocessor = TextPreprocessor()

# Store the preprocessed sentences in a new 'cleaned_sentence' column
for frame in (train_data, test_data):
    frame['cleaned_sentence'] = text_preprocessor.transform(frame['sentence'])


In [None]:
# Candidate values for each hyperparameter explored by RandomizedSearchCV
max_feature_options = [1000, 5000, 10000]
ngram_range_options = [(1, 1), (1, 2)]
regularization_strengths = [0.1, 1.0, 10.0]

# Keys follow the '<pipeline step>__<parameter>' naming used by the pipeline steps
param_grid = {
    'vectorizer__max_features': max_feature_options,
    'vectorizer__ngram_range': ngram_range_options,
    'classifier__C': regularization_strengths,
}

In [None]:
# Hold out 20% of the preprocessed training sentences as a validation set
X_train, X_val, y_train, y_val = train_test_split(
    train_data['cleaned_sentence'],
    train_data['binary_label'],
    test_size=0.2,
    random_state=42,
)

# Two-step pipeline: TF-IDF features feeding a logistic-regression classifier
# (it operates on the 'cleaned_sentence' column, i.e. already-preprocessed text)
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=300)),
])

# Randomized search over param_grid: 10 sampled settings, 5-fold CV, accuracy scoring
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Report the winning hyperparameter combination
print("Best Hyperparameters:", random_search.best_params_)

Best Hyperparameters: {'vectorizer__ngram_range': (1, 1), 'vectorizer__max_features': 5000, 'classifier__C': 1.0}


In [None]:
# Re-score the tuned pipeline with 5-fold cross-validation on the training split
best_pipeline = random_search.best_estimator_
cv_scores = cross_val_score(estimator=best_pipeline, X=X_train, y=y_train, scoring='accuracy', cv=5)

# Show the per-fold accuracies followed by their average
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Cross-Validation Scores: [0.64082687 0.65633075 0.71502591 0.71761658 0.67098446]
Mean CV Accuracy: 0.6801569131488399


In [None]:
# Have the tuned pipeline label the held-out validation sentences
tuned_model = random_search.best_estimator_
val_predictions = tuned_model.predict(X_val)

# Per-class precision, recall and F1 on the validation split
print("Validation Set Classification Report:")
val_report = classification_report(y_true=y_val, y_pred=val_predictions)
print(val_report)


Validation Set Classification Report:
                precision    recall  f1-score   support

not_propaganda       0.69      0.64      0.66       242
    propaganda       0.66      0.71      0.69       241

      accuracy                           0.67       483
     macro avg       0.68      0.68      0.67       483
  weighted avg       0.68      0.67      0.67       483



In [None]:
# Label the preprocessed test sentences with the tuned pipeline
test_sentences = test_data['cleaned_sentence']
test_predictions = random_search.best_estimator_.predict(test_sentences)


In [None]:
# Per-class precision, recall and F1 on the test set
print("Test Set Classification Report:")
test_report = classification_report(y_true=test_data['binary_label'], y_pred=test_predictions)
print(test_report)

Test Set Classification Report:
                precision    recall  f1-score   support

not_propaganda       0.72      0.66      0.69       301
    propaganda       0.66      0.72      0.69       280

      accuracy                           0.69       581
     macro avg       0.69      0.69      0.69       581
  weighted avg       0.69      0.69      0.69       581

