## This notebook performs dataset preprocessing and standardization on the generated datasets

### Imports

In [None]:
import re

import nltk
import numpy as np
import pandas as pd
import spacy
from langdetect import DetectorFactory
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [96]:
# Generated datasets for the baseline and the fine-tuned model.
baseline, finetuned = (
    pd.read_csv(csv_path)
    for csv_path in ('gen/baseline2.csv', 'gen/finetuned2.csv')
)

# Topic tables; the `MacroTopic` and `TopicName` columns are used further down.
right_topics, left_topics = (
    pd.read_csv(csv_path)
    for csv_path in ('politicians_data/topics_right.csv', 'politicians_data/topics_left.csv')
)

# The `_b` counterparts of the two generated datasets.
baseline_b, finetuned_b = (
    pd.read_csv(csv_path)
    for csv_path in ('gen/baseline_b.csv', 'gen/finetuned_b.csv')
)

## Utils

In [97]:
def clean_topics(topics):
    """Remove a stray 'Contenuto:' / 'Argomento:' label token from each topic.

    Parameters
    ----------
    topics : iterable
        Topic values, typically a DataFrame column. Non-string entries
        (e.g. NaN produced by an empty CSV cell) are passed through untouched
        instead of raising.

    Returns
    -------
    list
        One entry per input value, in the same order. A topic containing a
        label token is returned without that token and with its whitespace
        collapsed to single spaces; a topic without a label is returned
        exactly as received.
    """
    # 'Contenuto:' takes precedence over 'Argomento:'; only one label is removed.
    labels = ('Contenuto:', 'Argomento:')

    new_topics = []
    for topic in topics:
        if not isinstance(topic, str):
            new_topics.append(topic)
            continue

        topic_tokens = topic.split()
        for label in labels:
            if label in topic_tokens:
                # Drop the token itself (first occurrence) rather than blanking
                # it, so a label in the middle does not leave a double space.
                topic_tokens.remove(label)
                topic = ' '.join(topic_tokens)
                break
        new_topics.append(topic)
    return new_topics

In [98]:
def clean_party(parties):
    """Strip a leading label such as 'Partito politico:' from each party value.

    Parameters
    ----------
    parties : iterable
        Party values, typically a DataFrame column. Non-string entries
        (e.g. NaN produced by an empty CSV cell) are passed through untouched
        instead of raising.

    Returns
    -------
    list
        One entry per input value, in the same order. When the value starts
        with one of the known label prefixes, the prefix tokens are removed
        and the remaining tokens are joined with single spaces; otherwise the
        value is returned exactly as received.
    """
    # Label prefixes expressed as whitespace-separated tokens, checked in order.
    prefixes = (
        ('Partito', 'politico:'),
        ('Partito', 'di', 'appartenenza:'),
        ('Ideologia', 'politicaPartito:'),
    )

    new_parties = []
    for party in parties:
        if not isinstance(party, str):
            new_parties.append(party)
            continue

        party_tokens = party.split()
        for prefix in prefixes:
            width = len(prefix)
            if tuple(party_tokens[:width]) == prefix:
                party = ' '.join(party_tokens[width:])
                break
        new_parties.append(party)
    return new_parties

In [99]:
def clean_sentiment(sentiments):
    """Normalise sentiment labels: drop 'Accezione:' and map 'Neutrale' to 'Neutro'.

    Parameters
    ----------
    sentiments : iterable
        Sentiment values, typically a DataFrame column. Non-string entries
        (e.g. NaN produced by an empty CSV cell) are passed through untouched
        instead of raising.

    Returns
    -------
    list
        One entry per input value, in the same order. Values that contain
        neither the 'Accezione:' token nor a 'Neutrale' token are returned
        exactly as received; the others are rebuilt from their tokens joined
        with single spaces.
    """
    new_sentiments = []
    for sentiment in sentiments:
        if not isinstance(sentiment, str):
            new_sentiments.append(sentiment)
            continue

        sentiment_tokens = sentiment.split()
        has_label = 'Accezione:' in sentiment_tokens
        if has_label or 'Neutrale' in sentiment_tokens:
            if has_label:
                # Remove the label wherever it is; blanking position 0
                # unconditionally would drop the wrong token when the label
                # is not the first one.
                sentiment_tokens.remove('Accezione:')
            sentiment = ' '.join(
                'Neutro' if token == 'Neutrale' else token
                for token in sentiment_tokens
            )
        new_sentiments.append(sentiment)
    return new_sentiments

In [100]:
def get_macro_topic(df, macro_topics, flag=False):
    """Look up the macro topic of every row of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'topic' column.
    macro_topics : pandas.DataFrame
        Lookup table with 'TopicName' and 'MacroTopic' columns. When a topic
        name appears more than once, the first row wins.
    flag : bool, optional
        Unused; accepted only so existing calls that pass it keep working.

    Returns
    -------
    list
        The macro topic for each row of `df`, in row order.

    Raises
    ------
    IndexError
        If a topic has no entry in `macro_topics` (same exception type as
        before, now with a message naming the offending topic).
    """
    # Build the lookup once instead of filtering the table for every row.
    lookup = (
        macro_topics.dropna(subset=['TopicName'])
        .drop_duplicates(subset='TopicName', keep='first')
        .set_index('TopicName')['MacroTopic']
        .to_dict()
    )

    macros = []
    for topic in df['topic']:
        key = topic
        # Fall back to the trimmed topic only when the exact text is unknown,
        # so anything that matched before still matches the same way.
        if key not in lookup and isinstance(key, str):
            key = key.strip()
        if key not in lookup:
            raise IndexError(f"No macro topic found for topic {topic!r}")
        macros.append(lookup[key])
    return macros

In [101]:
# Download the NLTK resources requested by this notebook.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# langdetect's detection is randomised by default; fixing the seed makes
# `detect` return the same language for the same text on every run, so the
# preprocessing below is reproducible.
DetectorFactory.seed = 0

# Stop-word sets used by `preprocess_text` for Italian and English text.
stop_words_italian = set(stopwords.words('italian'))
stop_words_english = set(stopwords.words('english'))
# NOTE(review): `lemmatizer` is not referenced by any cell visible here.
lemmatizer = WordNetLemmatizer()


# spaCy pipelines used by `preprocess_text` to tokenise Italian / English text.
nlp_it = spacy.load('it_core_news_sm')
nlp_en = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Clean a tweet and return its lower-cased tokens.

    The language is detected on the raw text, then URLs, @mentions, '#'
    characters and punctuation are removed. Italian and English text is
    tokenised with the matching spaCy pipeline and filtered against the
    matching stop-word set; any other language is tokenised with
    `word_tokenize` without stop-word filtering. Tokens of two characters or
    fewer are always dropped.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string input (e.g. NaN produced by an empty CSV
        cell) yields an empty token list instead of raising.

    Returns
    -------
    list of str
        The surviving tokens, in order.
    """
    if not isinstance(text, str):
        return []

    try:
        lang = detect(text)
    except Exception:
        # Best effort: when detection fails (presumably for text without
        # usable content - TODO confirm) use the language-agnostic branch.
        # `Exception` rather than a bare `except` so Ctrl-C still interrupts.
        lang = 'unknown'

    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    lowered = text.lower()

    # Language code -> (spaCy pipeline, stop-word set).
    pipelines = {
        'it': (nlp_it, stop_words_italian),
        'en': (nlp_en, stop_words_english),
    }
    if lang in pipelines:
        nlp, stop_words = pipelines[lang]
        return [
            str(token)
            for token in nlp(lowered)
            if token.text not in stop_words and len(token.text) > 2
        ]

    return [str(word) for word in word_tokenize(lowered) if len(word) > 2]



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/filippofocaccia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/filippofocaccia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/filippofocaccia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Let's first clean the topic column and standardise everything in the baseline and finetuned data

In [102]:
# Keep a handle on the raw topic column of every generated dataset.
base_old_topics, baseline_b_old_topics, finetuned_old_topics, finetuned_b_old_topics = (
    frame['topic'] for frame in (baseline, baseline_b, finetuned, finetuned_b)
)

# Strip the label tokens from each of them.
new_base_topics, new_finetuned_topics, new_base_topics_b, new_finetuned_topics_b = map(
    clean_topics,
    (base_old_topics, finetuned_old_topics, baseline_b_old_topics, finetuned_b_old_topics),
)

# Write the cleaned topics back over the original column.
for frame, cleaned_topics in (
    (baseline, new_base_topics),
    (finetuned, new_finetuned_topics),
    (baseline_b, new_base_topics_b),
    (finetuned_b, new_finetuned_topics_b),
):
    frame['topic'] = cleaned_topics

### We then create a dataframe for all the macrotopics and microtopics

In [103]:
# One lookup table of (MacroTopic, TopicName) pairs: right-wing rows first,
# then left-wing rows, re-indexed from 0.
topic_columns = ['MacroTopic', 'TopicName']
macro_topics = pd.concat(
    [right_topics[topic_columns], left_topics[topic_columns]],
    axis=0,
    ignore_index=True,
)

### We then add a new column to each dataset holding the macro topic of each micro topic

In [104]:
# (dataset, extra keyword arguments) for each macro-topic lookup, in call order.
macro_topic_jobs = (
    (baseline, {}),
    (finetuned, {}),
    (baseline_b, {'flag': True}),
    (finetuned_b, {}),
)
(
    baseline_macro_topics,
    finetuned_macro_topics,
    baseline_b_macro_topics,
    finetuned_b_macro_topics,
) = [get_macro_topic(frame, macro_topics, **options) for frame, options in macro_topic_jobs]

# Attach the looked-up macro topics as a new column.
for frame, macros in (
    (baseline, baseline_macro_topics),
    (finetuned, finetuned_macro_topics),
    (baseline_b, baseline_b_macro_topics),
    (finetuned_b, finetuned_b_macro_topics),
):
    frame['MacroTopic'] = macros


## Let's see if we are able to predict the topics

In [105]:
# Work on copies so the frames loaded above stay untouched.
baseline_copy, finetuned_copy, baseline_b_copy, finetuned_b_copy = (
    frame.copy() for frame in (baseline, finetuned, baseline_b, finetuned_b)
)

# Training data, also handled through a copy.
train_data = pd.read_csv('politicians_data/fine_tune_data.csv')
train_copy = train_data.copy()


### We now apply category mappings on the train, baseline and finetuned datasets

In [106]:
# Integer codes for the macro topics are defined by the training data:
# code -> label, and the reverse label -> code.
train_copy['Macro_Topic'] = train_copy['Macro_Topic'].astype('category')

train_category_mapping = dict(enumerate(train_copy['Macro_Topic'].cat.categories))
print(train_category_mapping)
train_reverse_mapping = {v: k for k, v in train_category_mapping.items()}

train_copy['topic_mapping'] = train_copy['Macro_Topic'].map(train_reverse_mapping).astype('Int64')

# Encode the generated datasets with the same codes. The nullable 'Int64'
# dtype (already used for the training column) keeps the codes integral even
# if a macro topic has no entry in the mapping, instead of silently turning
# the whole column into floats.
for generated in (finetuned_copy, finetuned_b_copy, baseline_copy, baseline_b_copy):
    generated['topic_mapping'] = generated['MacroTopic'].map(train_reverse_mapping).astype('Int64')

{0: 'Governance e posizionamento politico', 1: 'Governance e relazioni internazionali', 2: 'Politica economica e sociale', 3: 'Questioni economiche e di sviluppo', 4: 'Relazioni internazionali e valori progressisti', 5: 'Valori nazionali e questioni sociali'}


### We then rename some columns for practicality

In [107]:
# Give every dataset the same column names for tweets, party and sentiment.
train_copy.rename(columns={'Content': 'Tweets'}, inplace=True)

# The generated-tweet column is spelled differently in the `_b` datasets.
for frame, tweet_column in (
    (finetuned_copy, 'generated_tweet'),
    (baseline_copy, 'generated_tweet'),
    (baseline_b_copy, 'generated_tweets'),
    (finetuned_b_copy, 'generated_tweets'),
):
    frame.rename(
        columns={tweet_column: 'Tweets', 'party': 'Party', 'tone': 'Sentiment'},
        inplace=True,
    )

### We then clean the party and sentiment columns

In [108]:
# Raw party column of every generated dataset.
b_parties, f_parties, b_parties_b, f_parties_b = (
    frame['Party']
    for frame in (baseline_copy, finetuned_copy, baseline_b_copy, finetuned_b_copy)
)

# Strip the label prefixes.
new_base_parties, new_finetuned_parties, new_base_parties_b, new_finetuned_parties_b = map(
    clean_party,
    (b_parties, f_parties, b_parties_b, f_parties_b),
)

# Write the cleaned values back.
for frame, cleaned_parties in (
    (baseline_copy, new_base_parties),
    (finetuned_copy, new_finetuned_parties),
    (baseline_b_copy, new_base_parties_b),
    (finetuned_b_copy, new_finetuned_parties_b),
):
    frame['Party'] = cleaned_parties

In [109]:
# Raw sentiment column of every generated dataset.
b_sentiments, f_sentiments, b_sentiments_b, f_sentiments_b = (
    frame['Sentiment']
    for frame in (baseline_copy, finetuned_copy, baseline_b_copy, finetuned_b_copy)
)

# Normalise the labels.
new_base_sentiments, new_finetuned_sentiments, new_base_sentiments_b, new_finetuned_sentiments_b = map(
    clean_sentiment,
    (b_sentiments, f_sentiments, b_sentiments_b, f_sentiments_b),
)

# Write the cleaned values back.
for frame, cleaned_sentiments in (
    (baseline_copy, new_base_sentiments),
    (finetuned_copy, new_finetuned_sentiments),
    (baseline_b_copy, new_base_sentiments_b),
    (finetuned_b_copy, new_finetuned_sentiments_b),
):
    frame['Sentiment'] = cleaned_sentiments

### We also apply category mapping to both columns

In [110]:
# The training data defines the integer codes for both label columns.
train_copy['Sentiment'] = train_copy['Sentiment'].astype('category')
train_copy['Party'] = train_copy['Party'].astype('category')

sentiment_mapping = dict(enumerate(train_copy['Sentiment'].cat.categories))
print(sentiment_mapping)
reverse_sent_mapping = {label: code for code, label in sentiment_mapping.items()}

party_mapping = dict(enumerate(train_copy['Party'].cat.categories))
print(party_mapping)
reverse_party_mapping = {label: code for code, label in party_mapping.items()}

# Encode every dataset with those codes. The two loops differ only in which
# column is added first, which keeps each frame's column order unchanged.
for frame in (finetuned_copy, baseline_copy):
    frame['sentiment_mapping'] = frame['Sentiment'].map(reverse_sent_mapping).astype('Int64')
    frame['party_mapping'] = frame['Party'].map(reverse_party_mapping).astype('Int64')

for frame in (baseline_b_copy, finetuned_b_copy, train_copy):
    frame['party_mapping'] = frame['Party'].map(reverse_party_mapping).astype('Int64')
    frame['sentiment_mapping'] = frame['Sentiment'].map(reverse_sent_mapping).astype('Int64')

# The training frame keeps only the encoded versions of the two columns.
train_copy.drop(columns=['Sentiment', 'Party'], inplace=True)


{0: 'Critico / Negativo', 1: 'Esortativo / Propaganda', 2: 'Generico', 3: 'Neutro / Informativo', 4: 'Supporto / Positivo'}
{0: 'Destra', 1: 'Sinistra'}


### We then preprocess training tweets

In [111]:
# Register tqdm's `progress_apply` so the pass over the tweets shows a bar.
tqdm.pandas(desc="Processing tweets")

# Despite its name, `generated_tweets` holds the training tweets here.
generated_tweets = train_copy['Tweets']
train_copy['processed_tweet'] = generated_tweets.progress_apply(preprocess_text)

Processing tweets: 100%|██████████| 17241/17241 [02:59<00:00, 96.29it/s] 


In [113]:
# Join each token list into one space-separated string. Values that are
# already strings are left alone, so re-running this cell does not insert a
# space between every character of the joined text.
train_copy['processed_tweet'] = train_copy['processed_tweet'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

In [114]:
# Pull out the encoded macro topic of every dataset as the prediction target.
baseline_target, finetuned_target, baseline_b_target, finetuned_b_target, train_target = (
    frame['topic_mapping']
    for frame in (baseline_copy, finetuned_copy, baseline_b_copy, finetuned_b_copy, train_copy)
)

In [None]:
# Remove the columns that are no longer needed from the training frame.
train_copy.drop(
    columns=['Dominant_Topic', 'Topic_Words', 'Macro_Topic', 'topic_mapping', 'Tweets'],
    inplace=True,
)

# Same for the generated datasets; only the name of the original-tweet
# column differs between the plain and the `_b` frames.
shared_drop_columns = ['topic', 'Sentiment', 'Party', 'topic_mapping', 'MacroTopic']
for frame, original_column in (
    (baseline_copy, 'original_tweet'),
    (baseline_b_copy, 'original_tweets'),
    (finetuned_b_copy, 'original_tweets'),
    (finetuned_copy, 'original_tweet'),
):
    frame.drop(columns=[original_column, *shared_drop_columns], inplace=True)


### We then preprocess test tweets

In [118]:
# Tweet column of every generated dataset.
baseline_tweets, baseline_b_tweets, finetuned_tweets, finetuned_b_tweets = (
    frame['Tweets']
    for frame in (baseline_copy, baseline_b_copy, finetuned_copy, finetuned_b_copy)
)

# Tokenise each dataset in turn, with its own progress-bar label.
for progress_label, frame, tweets in (
    ("Processing baseline tweets", baseline_copy, baseline_tweets),
    ("Processing finetuned tweets", finetuned_copy, finetuned_tweets),
    ("Processing baseline_b tweets", baseline_b_copy, baseline_b_tweets),
    ("Processing finetuned_b tweets", finetuned_b_copy, finetuned_b_tweets),
):
    tqdm.pandas(desc=progress_label)
    frame['processed_tweet'] = tweets.progress_apply(preprocess_text)

Processing baseline tweets: 100%|██████████| 3449/3449 [00:45<00:00, 75.09it/s]
Processing finetuned tweets: 100%|██████████| 3449/3449 [00:39<00:00, 87.84it/s]
Processing baseline_b tweets: 100%|██████████| 3449/3449 [00:57<00:00, 59.74it/s]
Processing finetuned_b tweets: 100%|██████████| 3449/3449 [00:41<00:00, 82.25it/s]


In [119]:
# The raw text is no longer needed once `processed_tweet` exists.
for frame in (baseline_copy, baseline_b_copy, finetuned_b_copy, finetuned_copy):
    frame.drop(columns=['Tweets'], inplace=True)

In [120]:
# Join each token list into one space-separated string. Values that are
# already strings are left alone, so re-running this cell does not insert a
# space between every character of the joined text.
for frame in (baseline_copy, finetuned_copy, baseline_b_copy, finetuned_b_copy):
    frame['processed_tweet'] = frame['processed_tweet'].apply(
        lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
    )

### We finally split all the datasets into right and left wings to be prepared for the classification task

In [121]:
# Party codes come from `party_mapping` above: 0 is 'Destra', 1 is 'Sinistra'.
is_left_wing = train_copy['party_mapping'] == 1
is_right_wing = train_copy['party_mapping'] == 0

left_wing_train = train_copy[is_left_wing]
right_wing_train = train_copy[is_right_wing]

In [None]:
# Each frame now holds a single party, so the party code carries no
# information. Rebinding (instead of an in-place drop on a filtered slice)
# avoids pandas' SettingWithCopyWarning, and errors='ignore' makes the cell
# safe to re-run.
left_wing_train = left_wing_train.drop(columns=['party_mapping'], errors='ignore')
right_wing_train = right_wing_train.drop(columns=['party_mapping'], errors='ignore')

In [123]:
# Take the targets of exactly the rows kept in each feature frame. Both are
# later written with index=False, so features and targets must have the same
# rows in the same order; picking targets by hard-coded topic codes only
# guarantees that when party and macro topic always agree.
train_target_right = train_target.loc[right_wing_train.index]
train_target_left = train_target.loc[left_wing_train.index]

In [None]:
# Split the `_b` datasets by party code (0 = 'Destra', 1 = 'Sinistra') and
# drop the now-constant code column. Chaining `.drop` returns a new frame,
# which avoids the in-place drop on a filtered slice
# (SettingWithCopyWarning) and keeps the cell re-runnable.
baseline_b_copy_right = baseline_b_copy[baseline_b_copy['party_mapping'] == 0].drop(columns=['party_mapping'])
baseline_b_copy_left = baseline_b_copy[baseline_b_copy['party_mapping'] == 1].drop(columns=['party_mapping'])
finetuned_b_copy_right = finetuned_b_copy[finetuned_b_copy['party_mapping'] == 0].drop(columns=['party_mapping'])
finetuned_b_copy_left = finetuned_b_copy[finetuned_b_copy['party_mapping'] == 1].drop(columns=['party_mapping'])

In [125]:
# Take the targets of exactly the rows kept in each feature frame. Both are
# later written with index=False, so features and targets must have the same
# rows in the same order; picking targets by hard-coded topic codes only
# guarantees that when party and macro topic always agree.
baseline_b_target_right = baseline_b_target.loc[baseline_b_copy_right.index]
baseline_b_target_left = baseline_b_target.loc[baseline_b_copy_left.index]
finetuned_b_target_right = finetuned_b_target.loc[finetuned_b_copy_right.index]
finetuned_b_target_left = finetuned_b_target.loc[finetuned_b_copy_left.index]

In [None]:
# Split the datasets by party code (0 = 'Destra', 1 = 'Sinistra') and drop
# the now-constant code column. Chaining `.drop` returns a new frame, which
# avoids the in-place drop on a filtered slice (SettingWithCopyWarning) and
# keeps the cell re-runnable.
baseline_copy_right = baseline_copy[baseline_copy['party_mapping'] == 0].drop(columns=['party_mapping'])
baseline_copy_left = baseline_copy[baseline_copy['party_mapping'] == 1].drop(columns=['party_mapping'])
finetuned_copy_right = finetuned_copy[finetuned_copy['party_mapping'] == 0].drop(columns=['party_mapping'])
finetuned_copy_left = finetuned_copy[finetuned_copy['party_mapping'] == 1].drop(columns=['party_mapping'])

# Take the targets of exactly the rows kept in each feature frame. Both are
# later written with index=False, so features and targets must have the same
# rows in the same order; picking targets by hard-coded topic codes only
# guarantees that when party and macro topic always agree.
baseline_target_right = baseline_target.loc[baseline_copy_right.index]
baseline_target_left = baseline_target.loc[baseline_copy_left.index]
finetuned_target_right = finetuned_target.loc[finetuned_copy_right.index]
finetuned_target_left = finetuned_target.loc[finetuned_copy_left.index]

In [133]:
# Write the baseline features and targets, without the index column.
for frame, csv_path in (
    (baseline_copy_right, 'files/baseline/baseline_right.csv'),
    (baseline_copy_left, 'files/baseline/baseline_left.csv'),
    (baseline_target_right, 'files/baseline/baseline_target_right.csv'),
    (baseline_target_left, 'files/baseline/baseline_target_left.csv'),
):
    frame.to_csv(csv_path, index=False)

In [134]:
# Write the baseline_b features and targets, without the index column.
for frame, csv_path in (
    (baseline_b_copy_right, 'files/baseline_b/baseline_b_right.csv'),
    (baseline_b_copy_left, 'files/baseline_b/baseline_b_left.csv'),
    (baseline_b_target_right, 'files/baseline_b/baseline_b_target_right.csv'),
    (baseline_b_target_left, 'files/baseline_b/baseline_b_target_left.csv'),
):
    frame.to_csv(csv_path, index=False)

In [135]:
# Write the finetuned features and targets, without the index column.
for frame, csv_path in (
    (finetuned_copy_right, 'files/finetuned/finetuned_right.csv'),
    (finetuned_copy_left, 'files/finetuned/finetuned_left.csv'),
    (finetuned_target_right, 'files/finetuned/finetuned_target_right.csv'),
    (finetuned_target_left, 'files/finetuned/finetuned_target_left.csv'),
):
    frame.to_csv(csv_path, index=False)

In [136]:
# Write the finetuned_b features and targets, without the index column.
for frame, csv_path in (
    (finetuned_b_copy_right, 'files/finetuned_b/finetuned_b_right.csv'),
    (finetuned_b_copy_left, 'files/finetuned_b/finetuned_b_left.csv'),
    (finetuned_b_target_right, 'files/finetuned_b/finetuned_b_target_right.csv'),
    (finetuned_b_target_left, 'files/finetuned_b/finetuned_b_target_left.csv'),
):
    frame.to_csv(csv_path, index=False)

In [137]:
# Write the training features and targets, without the index column.
for frame, csv_path in (
    (right_wing_train, 'files/train/train_right.csv'),
    (left_wing_train, 'files/train/train_left.csv'),
    (train_target_right, 'files/train/train_target_right.csv'),
    (train_target_left, 'files/train/train_target_left.csv'),
):
    frame.to_csv(csv_path, index=False)