In [1]:
import pandas as pd
import numpy as np
import os
import regex as re
import spacy as sy
import string
from urllib.parse import urlparse
from nltk.tokenize import TweetTokenizer

nlp_en = sy.load('en_core_web_sm')
all_stopwords = nlp_en.Defaults.stop_words

In [2]:
filename = '/Users/arjunkhanchandani/Desktop/semeval2023task3bundle-v4/data/en/dev-labels-subtask-2.txt'
dev_sub2_df = pd.read_csv(filename, header=None, sep='\t', names=['id', 'genre'])
print(dev_sub2_df.head())

          id                                              genre
0  820791520  Political,Fairness_and_equality,Policy_prescri...
1  821040551  Political,Capacity_and_resources,Policy_prescr...
2  813552066  Public_opinion,Policy_prescription_and_evaluat...
3  817176202  Political,External_regulation_and_reputation,P...
4  820419869  Public_opinion,Political,External_regulation_a...


In [3]:
dir_name = '/Users/arjunkhanchandani/Desktop/semeval2023task3bundle-v4/data/en/dev-articles-subtask-2'

article_df = pd.DataFrame()

numbers = list()
headlines = list()
articles = list()

for file in os.walk(dir_name):
    for filename in file[2]:
        
        number = re.findall('[0-9]+', filename)
        numbers.append(number[0])
        x = dir_name + '/' + filename
        # print(number)
        article = ''
        with open(x) as f:
            lines = f.readlines()
            for i in range(len(lines)):
                if i==0:
                    headline = lines[0]
                    headlines.append(headline)
                elif lines[i]=="\n":
                    continue
                else:
                    # print()
                    article = article + '' + lines[i]
        articles.append(article)
            
article_df['id'] = numbers
article_df['headlines'] = headlines
article_df['articles'] = articles

article_df = article_df.astype({'id': 'int32'})

In [4]:
dev_sub2_df

Unnamed: 0,id,genre
0,820791520,"Political,Fairness_and_equality,Policy_prescri..."
1,821040551,"Political,Capacity_and_resources,Policy_prescr..."
2,813552066,"Public_opinion,Policy_prescription_and_evaluat..."
3,817176202,"Political,External_regulation_and_reputation,P..."
4,820419869,"Public_opinion,Political,External_regulation_a..."
...,...,...
78,829815104,"Political,Crime_and_punishment,Fairness_and_eq..."
79,817147979,"Policy_prescription_and_evaluation,Political,L..."
80,813623212,"Policy_prescription_and_evaluation,Political,L..."
81,832948083,"Crime_and_punishment,Political,Legality_Consti..."


In [5]:
train_df = dev_sub2_df.set_index('id').join(article_df.set_index('id'))
print(train_df.isnull().sum())

train_df

genre        0
headlines    0
articles     0
dtype: int64


Unnamed: 0_level_0,genre,headlines,articles
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
820791520,"Political,Fairness_and_equality,Policy_prescri...",George III Lost America.\n,Theresa May Could Lose the United Kingdom Over...
821040551,"Political,Capacity_and_resources,Policy_prescr...",Queen Elizabeth Would Be Evacuated in Event of...,If Britain leaves the European Union without a...
813552066,"Public_opinion,Policy_prescription_and_evaluat...","You insult us, ambassador: Woody Johnson flagr...",With three months until Britain leaves the Eur...
817176202,"Political,External_regulation_and_reputation,P...","The British People, as Well as the Politicians...",The British Parliament just handed Prime Minis...
820419869,"Public_opinion,Political,External_regulation_a...",No break from Brexit: RT takes a look at lates...,As British MPs are told that their February br...
...,...,...,...
829815104,"Political,Crime_and_punishment,Fairness_and_eq...",Brussels Shows Its Fear\n,Hungarian Prime Minister Viktor Orban has been...
817147979,"Policy_prescription_and_evaluation,Political,L...",BREXIT OR BRINO: U.K.\n,Deep State Strikes Back—And Misses.\nGood Omen...
813623212,"Policy_prescription_and_evaluation,Political,L...",'Wishful thinking': Tory MPs dismiss May's hop...,Jeremy Hunt insisted Theresa May can still sal...
832948083,"Crime_and_punishment,Political,Legality_Consti...",Mueller concludes Russia-Trump probe; lawmaker...,Mueller concludes Russia-Trump probe; lawmaker...


In [6]:
def preprocessing(x, y, df):
    
    pos_tags_final_text = list()
    er_final_text = list()
    preprocessed_text = list()

    for x in df.loc[:,y]:

        tokenizer = TweetTokenizer()
        #tokenizing
        doc = tokenizer.tokenize(x)
        
        # removing links
        tokens = [token for token in doc if not urlparse(token).scheme]
        x = ' '.join(tokens)
        doc = nlp_en(x)
        
        # removing punctuation and white space
        tokens = [token.orth_ for token in doc if not token.is_punct | token.is_space]    
        x = ' '.join(tokens)
        
        # lower case
        x = x.lower()
        doc = nlp_en(x)

        # lemmatization
        tokens = [word.lemma_ for word in doc]   
        x = ' '.join(tokens)
        doc = nlp_en(x)  
        
        # removing punctuation and white space
        tokens = [token.orth_ for token in doc if not token.is_punct | token.is_space]    
        x = ' '.join(tokens)
        doc = nlp_en(x)
        
        # removing individual letters
        tokens = [word.text for word in doc if len(word)>=2]
        x = ' '.join(tokens)  
        # print(x)
        doc = nlp_en(x)
        
        # removing stop words
        tokens = [word for word in doc if not word in all_stopwords]
        list_of_strings  = [i.text for i in tokens]
        x = ' '.join(list_of_strings)
        doc = nlp_en(x)
        
        # Part of speech tagging
        pos_tags = [(i, i.tag_) for i in doc]
        pos_tags_final_text.append(pos_tags)
        
        # entity recognition tagging
        er =  [(i, i.label_, i.label) for i in doc.ents] 
        er_final_text.append(er)
        
        preprocessed_text.append(x)
        
    return pos_tags_final_text, er_final_text, preprocessed_text

In [7]:
pos_tags_final_headlines, er_final_headlines, preprocessed_headlines = preprocessing(headline, 'headlines', train_df)

train_df['preprocessed_headlines'] = preprocessed_headlines
train_df['pos_tags_headlines'] = pos_tags_final_headlines
train_df['er_tags_headlines'] = er_final_headlines

In [8]:
pos_tags_final_articles, er_final_articles, preprocessed_articles = preprocessing(article, 'articles', train_df)

train_df['preprocessed_articles'] = preprocessed_articles
train_df['pos_tags_articles'] = pos_tags_final_articles
train_df['er_tags_articles'] = er_final_articles

In [9]:
train_df.head()

Unnamed: 0_level_0,genre,headlines,articles,preprocessed_headlines,pos_tags_headlines,er_tags_headlines,preprocessed_articles,pos_tags_articles,er_tags_articles
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
820791520,"Political,Fairness_and_equality,Policy_prescri...",George III Lost America.\n,Theresa May Could Lose the United Kingdom Over...,george iii lose america,"[(george, NNP), (iii, NNP), (lose, VB), (ameri...","[((george, iii), PERSON, 380), ((america), GPE...",theresa may could lose the united kingdom over...,"[(theresa, NN), (may, MD), (could, MD), (lose,...","[((the, united, kingdom), GPE, 384), ((the, eu..."
821040551,"Political,Capacity_and_resources,Policy_prescr...",Queen Elizabeth Would Be Evacuated in Event of...,If Britain leaves the European Union without a...,queen elizabeth would be evacuate in event of ...,"[(queen, NNP), (elizabeth, NNP), (would, MD), ...","[((elizabeth), PERSON, 380), ((brexit, riot, r...",if britain leave the european union without tr...,"[(if, IN), (britain, NNP), (leave, VBP), (the,...","[((britain), GPE, 384), ((the, european, union..."
813552066,"Public_opinion,Policy_prescription_and_evaluat...","You insult us, ambassador: Woody Johnson flagr...",With three months until Britain leaves the Eur...,you insult us ambassador woody johnson flagran...,"[(you, PRP), (insult, VBP), (us, NNP), (ambass...","[((woody, johnson), PERSON, 380), ((peter), PE...",with three month until britain leave the europ...,"[(with, IN), (three, CD), (month, NN), (until,...","[((three, month), DATE, 391), ((britain), GPE,..."
817176202,"Political,External_regulation_and_reputation,P...","The British People, as Well as the Politicians...",The British Parliament just handed Prime Minis...,the british people as well as the politician d...,"[(the, DT), (british, JJ), (people, NNS), (as,...","[((british), NORP, 381)]",the british parliament just hand prime ministe...,"[(the, DT), (british, JJ), (parliament, NNP), ...","[((british), NORP, 381), ((british), NORP, 381..."
820419869,"Public_opinion,Political,External_regulation_a...",No break from Brexit: RT takes a look at lates...,As British MPs are told that their February br...,no break from brexit rt take look at late deve...,"[(no, DT), (break, NN), (from, IN), (brexit, N...","[((brexit, rt), ORG, 383)]",as british mp be tell that their february brea...,"[(as, IN), (british, NNP), (mp, NNP), (be, VB)...","[((british), NORP, 381), ((february), DATE, 39..."


In [10]:
# converting tweets_df into a csv file
filename = '/Users/arjunkhanchandani/Desktop/semeval2023task3bundle-v4/dev_subtask_2.csv'
train_df.to_csv(filename, index=True)