# Import libraries

In [1]:
import nltk
import stanza
import ast
from afinn import Afinn
afinn = Afinn()
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.corpus import verbnet as vn
from nltk.corpus import opinion_lexicon
from nltk.wsd import lesk
from nltk.corpus import wordnet
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import openpyxl

# Import Data

* datasets_only_same_ents_clean/test.txt
* datasets_only_same_ents_clean/train.txt
* datasets_only_same_ents_clean/valid.txt



The rows in each file are sorted by label, so every split is shuffled after loading.

In [2]:
column_names = ["Sentence", "Ent1", "Ent2", "Label"]

# Folder holding the tab-separated train/valid/test files (no header row).
# NOTE(review): machine-specific absolute path; change it here when running elsewhere.
DATA_DIR = 'C:/Users/Anastasiia Belkina/MANNHEIM/MASTER_THESIS_CODE/Real_News_Data_preparation/datasets_only_same_ents_clean'

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same row order
# (and therefore the same saved files) on every run.
SHUFFLE_SEED = 42


def _load_split(split_name):
    """Load one split and return its raw, entity-free and shuffled versions.

    Parameters
    ----------
    split_name : str
        File stem inside ``DATA_DIR`` ('train', 'valid' or 'test').

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        ``raw``      - the file as read, with the columns in ``column_names``;
        ``dropped``  - ``raw`` without the "Ent1" and "Ent2" columns;
        ``shuffled`` - ``dropped`` with all rows shuffled (seeded) and a fresh
        0..n-1 index.
    """
    raw = pd.read_csv(f'{DATA_DIR}/{split_name}.txt', sep='\t', header=None, names=column_names)
    dropped = raw.drop(columns=["Ent1", "Ent2"])
    # frac=1 keeps every row; random_state makes the permutation repeatable.
    shuffled = dropped.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
    return raw, dropped, shuffled


df_train, df_train_dropped, df_train_shuffled = _load_split('train')
df_valid, df_valid_dropped, df_valid_shuffled = _load_split('valid')
df_test, df_test_dropped, df_test_shuffled = _load_split('test')

# Preprocessing

**How to sequence these steps**

The preprocessing pipeline generally follows this order:
- Tokenization
- POS Tagging
- Named Entity Recognition (NER)
- Dependency Parsing (including verb-object detection)

## Step 1: Tokenization, POS Tagging, and Named Entity Recognition (NER) using Stanza

In [3]:
# Build the English Stanza pipeline once so the loaded models can be reused.
# The processors are listed in the order they are passed to Stanza.
STANZA_PROCESSORS = ('tokenize', 'mwt', 'pos', 'lemma', 'depparse', 'ner')
nlp = stanza.Pipeline('en', processors=','.join(STANZA_PROCESSORS))

# Tokenization, POS tagging, and NER
def process_text(text):
    """Run the Stanza pipeline on ``text`` and collect POS tags and entities.

    Parameters
    ----------
    text : str
        Raw sentence (or short passage) to analyse.

    Returns
    -------
    tuple of (list, list)
        ``tokens``   - one ``(word text, UPOS tag)`` pair per word, with the
        words of all sentences concatenated in document order;
        ``entities`` - one ``(entity text, entity type)`` pair per entity
        found in the document.
    """
    parsed = nlp(text)

    tokens = []
    for sentence in parsed.sentences:
        for word in sentence.words:
            tokens.append((word.text, word.upos))

    entities = [(span.text, span.type) for span in parsed.entities]
    return tokens, entities

# Add the 'tokens_pos' and 'entities' columns to every split.
for frame in (df_train_shuffled, df_valid_shuffled, df_test_shuffled):
    # process_text returns a (tokens, entities) pair per row; zip(*) turns the
    # column of pairs into one tuple of tokens and one tuple of entities.
    processed_rows = frame['Sentence'].apply(process_text)
    frame['tokens_pos'], frame['entities'] = zip(*processed_rows)

2024-09-11 13:53:50 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-09-11 13:53:50 INFO: Downloaded file to C:\Users\Anastasiia Belkina\stanza_resources\resources.json
2024-09-11 13:53:51 INFO: Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| pos       | combined_charlm           |
| lemma     | combined_nocharlm         |
| depparse  | combined_charlm           |
| ner       | ontonotes-ww-multi_charlm |

2024-09-11 13:53:51 INFO: Using device: cpu
2024-09-11 13:53:51 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-09-11 13:53:52 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-09-11 13:53:52 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: st

## Step 2: Dependency Parsing using Stanza

In [8]:
def dependency_parse(text):
    """Run the Stanza pipeline on ``text`` and collect its dependency relations.

    Parameters
    ----------
    text : str
        Raw sentence (or short passage) to analyse.

    Returns
    -------
    list of tuple
        One ``(word text, head, deprel)`` triple per word, with the words of
        all sentences concatenated in document order.

    Notes
    -----
    ``head`` is passed through exactly as Stanza reports it.
    NOTE(review): it looks like a position inside the word's own sentence
    (0 for the root), so for multi-sentence input it does not index into the
    flattened list returned here - confirm before resolving heads downstream.
    """
    parsed = nlp(text)

    triples = []
    for sentence in parsed.sentences:
        triples.extend((word.text, word.head, word.deprel) for word in sentence.words)
    return triples

# Add the 'dependencies' column (one list of triples per sentence) to every split.
for frame in (df_train_shuffled, df_valid_shuffled, df_test_shuffled):
    frame['dependencies'] = frame['Sentence'].apply(dependency_parse)

## Checking the preprocessed data

In [9]:
# Preview the first rows of the training split to check the new tokens_pos, entities and dependencies columns.
df_train_shuffled.head()

Unnamed: 0,Sentence,Label,tokens_pos,entities,dependencies
0,a. m. Initial eyewitness accounts of such inci...,0,"[(a., X), (m., NOUN), (Initial, ADJ), (eyewitn...","[(British, NORP), (Cox’s, PERSON)]","[(a., 10, dep), (m., 10, nsubj), (Initial, 5, ..."
1,"Shortly after the beginning of the attack, the...",1,"[(Shortly, ADV), (after, ADP), (the, DET), (be...","[(Talibans, NORP), (Zabihullah Mujahid, PERSON)]","[(Shortly, 4, advmod), (after, 4, case), (the,..."
2,Judge Pryor initially supported Judge Moore bu...,0,"[(Judge, NOUN), (Pryor, PROPN), (initially, AD...","[(Pryor, PERSON), (Moore, PERSON)]","[(Judge, 4, nsubj), (Pryor, 1, flat), (initial..."
3,Trump also expects to receive a major new fina...,3,"[(Trump, PROPN), (also, ADV), (expects, VERB),...","[(Trump, PERSON), (the United States, GPE), (t...","[(Trump, 3, nsubj), (also, 3, advmod), (expect..."
4,just decentralisation.Mr Purcell praised the C...,1,"[(just, ADV), (decentralisation, NOUN), (., PU...","[(Purcell, PERSON), (Coalition, ORG)]","[(just, 2, advmod), (decentralisation, 0, root..."


In [10]:
# Preview the first rows of the validation split to check the new tokens_pos, entities and dependencies columns.
df_valid_shuffled.head()

Unnamed: 0,Sentence,Label,tokens_pos,entities,dependencies
0,U. S. equities started mostly flat on Wednesda...,0,"[(U., PROPN), (S., PROPN), (equities, NOUN), (...","[(U. S., ORG), (Wednesday, DATE), (Republican,...","[(U., 3, compound), (S., 3, compound), (equiti..."
1,: Rubio supporters cheer Jeb!,0,"[(:, PUNCT), (Rubio, PROPN), (supporters, NOUN...","[(Rubio, PERSON), (Jeb, PERSON)]","[(:, 4, punct), (Rubio, 3, compound), (support..."
2,Qatars announcement of plans to boost LNG outp...,2,"[(Qatars, PROPN), (announcement, NOUN), (of, A...","[(Qatars, ORG), (LNG, ORG), (Gulf, LOC), (Saud...","[(Qatars, 2, compound), (announcement, 10, nsu..."
3,A man who said that he was the one later detai...,3,"[(A, DET), (man, NOUN), (who, PRON), (said, VE...","[(Frank Flack, PERSON), (Edwards, PERSON)]","[(A, 2, det), (man, 19, nsubj), (who, 4, nsubj..."
4,"player.The entire game was intensely played, c...",0,"[(player, NOUN), (., PUNCT), (The, DET), (enti...","[(Priory, ORG), (Buck Matthews, PERSON)]","[(player, 0, root), (., 1, punct), (The, 3, de..."


In [11]:
# Preview the first rows of the test split to check the new tokens_pos, entities and dependencies columns.
df_test_shuffled.head()

Unnamed: 0,Sentence,Label,tokens_pos,entities,dependencies
0,In November 2015 Donald Trump proclaimed to a ...,0,"[(In, ADP), (November, PROPN), (2015, NUM), (D...","[(November 2015, DATE), (Donald Trump, PERSON)...","[(In, 3, case), (November, 6, obl), (2015, 2, ..."
1,Berlusconi's allies accused the Milan magistra...,3,"[(Berlusconi, PROPN), ('s, PART), (allies, NOU...","[(Berlusconi's, NORP), (Milan, GPE)]","[(Berlusconi, 3, nmod:poss), ('s, 1, case), (a..."
2,"But when Laila called, she was told that Arlin...",0,"[(But, CCONJ), (when, ADV), (Laila, PROPN), (c...","[(Laila, PERSON), (Arlington, GPE)]","[(But, 8, cc), (when, 4, advmod), (Laila, 4, n..."
3,Results for the 801 likely Democratic primary ...,0,"[(Results, NOUN), (for, ADP), (the, DET), (801...","[(801, CARDINAL), (Democratic, NORP), (plus or...","[(Results, 9, nsubj), (for, 8, case), (the, 8,..."
4,The continuing acceleration of the number of S...,0,"[(The, DET), (continuing, VERB), (acceleration...","[(Syrian, NORP), (Obama, PERSON)]","[(The, 3, det), (continuing, 3, amod), (accele..."


## Save the data to files

In [13]:
# Folder that receives the preprocessed splits. Create it up front:
# DataFrame.to_csv raises an OSError when the target directory does not exist.
output_dir = 'datasets_preprocessed'
os.makedirs(output_dir, exist_ok=True)

# Each split is written as a tab-separated text file with a header row and
# without the index column.

# Saving df_train_shuffled to a text file with tab separators
df_train_shuffled.to_csv(f'{output_dir}/df_train_shuffled.txt', sep='\t', index=False, header=True)

# Saving df_valid_shuffled to a text file with tab separators
df_valid_shuffled.to_csv(f'{output_dir}/df_valid_shuffled.txt', sep='\t', index=False, header=True)

# Saving df_test_shuffled to a text file with tab separators
df_test_shuffled.to_csv(f'{output_dir}/df_test_shuffled.txt', sep='\t', index=False, header=True)