# Data Cleaning

In [None]:
import pandas as pd 
import numpy as np
import re

In [None]:
# Load the merged tweet export.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
df = pd.read_csv('/Users/.../csv_files/merged.csv')
# Keep original tweets only. `astype(bool)` applies Python truthiness
# element-wise, so the mask equals the former `[not i for i in ...]` list.
# `.copy()` detaches `db` from `df`, so the assignment below writes to an
# independent frame instead of a slice (no SettingWithCopyWarning).
db = df[~df['is_retweet'].astype(bool)].copy()
# Parse the creation timestamp into a datetime column.
db['created_at'] = pd.to_datetime(db['created_at'])
# Drop tweets with no engagement at all (replies + retweets + likes == 0).
engagement = db[['reply_count', 'retweet_count', 'like_count']].sum(axis=1)
db = db[engagement != 0]
def is_whitespace_or_url_only(tweet):
    """Return True when ``tweet`` still contains text after URLs are removed.

    NOTE: the name reads as the opposite of the return value. The function
    returns ``True`` for tweets that are *not* whitespace/URL-only, which is
    what makes it usable directly as a "keep this row" mask. The name is kept
    so existing callers continue to work.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Non-string values (e.g. NaN or None for a missing
        text) are treated as having no content.

    Returns
    -------
    bool
        ``True`` if non-whitespace text remains once URLs are stripped,
        ``False`` otherwise.
    """
    # Missing text would make re.sub raise TypeError; it has no content.
    if not isinstance(tweet, str):
        return False
    # Regex to match http/https URLs
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Remove all URLs from the tweet
    tweet_without_urls = re.sub(url_pattern, '', tweet)
    # Anything left besides whitespace counts as real content
    return tweet_without_urls.strip() != ''
# Keep tweets that still contain text once URLs are stripped (the helper
# returns True for those, despite its name). `.copy()` makes `db` an
# independent frame so the column assignment below does not write to a slice.
db = db[db['fulltext'].apply(is_whitespace_or_url_only)].copy()
# remove caps which might get identified as a topic
db['fulltext'] = db['fulltext'].str.lower()
# news outlet list (account names, matched case-sensitively against `username`)
news_outlets = ['il_piccolo', 'infoitinterno', 'TgrRaiFVG', 'Open_gol', 'TRIESTE_news', 'Ansa_Fvg', 'Corriere', 
                'MediasetTgcom24', 'La7tv', 'localteamtv', 'ComunediTrieste', 'Telequattro', 'messveneto', 
               'Gazzettino', 'Radio1Rai', 'tempoweb', 'Telefriuli1', 'SkyTG24', 'Agenzia_Ansa', 'ilgiornale',
               'fanpage', 'IlFriuli', 'DomaniGiornale', 'DiscoverTrieste', 'ImolaOggi', 'repubblica', 'ilfoglio_it',
               'RaCapodistria', 'informatrieste', 'Agenzia_Italia', 'UdineseTV', 'VisioneTv', 'neXtquotidiano', 
               'fattoquotidiano', 'ilmessaggeroit', 'HuffPostItalia', 'Roma_H_24', 'RaiNews', 'RadioRadioWeb', 'RadioGenova',
               'localteamtv', 'byoblu', 'RadioSavana', 'fanpage', 'MediasetTgcom24', 'LaStampa', 'TgLa7',
               'Libero_official']
# remove tweets published by news outlets (vectorised membership test)
db = db[~db['username'].isin(news_outlets)]
# drop text duplicates, keeping the first occurrence and its original index
db = db.drop_duplicates(subset='fulltext')

# Topic Modelling 

In [None]:
from bertopic import BERTopic
import openai
from bertopic.representation import OpenAI
from sentence_transformers import SentenceTransformer

In [None]:
# Read the API key from a local text file so it is not hardcoded in the notebook.
key_location = '/Users/.../openAI_key.txt'
with open(key_location, 'r') as file:
    api_key = file.read().strip()  # Remove any leading/trailing whitespace

# Create the OpenAI client with the API key
client = openai.OpenAI(api_key=api_key)
# Topic labels are generated through the chat completion endpoint.
representation_model = OpenAI(client, model="gpt-3.5-turbo", chat=True)
# select embedding model (multilingual sentence embeddings)
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
# select BERTopic hyperparameters
topic_model = BERTopic(embedding_model=embedding_model,
                       representation_model=representation_model,
                       verbose=True,
                       min_topic_size=70,
                       calculate_probabilities=True)
# NOTE(review): no random seed is set for the topic model here, so topic ids
# may differ between runs, while a later cell joins them to a hand-labelled
# sheet by topic id. Confirm the model is reproducible or persisted.
# estimate model. BERTopic documents `documents` as a list of strings; after
# filtering, `db['fulltext']` is a Series with a non-contiguous index, so pass
# plain values. Order is unchanged, so `topics_` still lines up with `db` rows.
topics, probs = topic_model.fit_transform(db['fulltext'].tolist())

# Import meta-topic classification from drive

In [None]:
from google.oauth2.service_account import Credentials
import gspread

In [None]:
# API scopes requested for the service account (Sheets + Drive)
scopes = ['https://www.googleapis.com/auth/spreadsheets',
          'https://www.googleapis.com/auth/drive']
# build service-account credentials from the local key file
credentials = Credentials.from_service_account_file('credentials.json', scopes=scopes)
# authorised gspread client
gc = gspread.authorize(credentials)
# open the spreadsheet, then the worksheet holding the meta-topic labels
spreadsheet = gc.open("twitter_topics_reduced")
worksheet = spreadsheet.worksheet('meta')
# every cell of the worksheet as a list of rows; the first row is the header
sheet_rows = worksheet.get_all_values()
# build the dataframe using the header row as column names
meta_topics = pd.DataFrame(sheet_rows[1:], columns=sheet_rows[0])

# Create composite dataset with added opinion and topics

In [None]:
# remove rows with an empty topic id; `.copy()` so the dtype conversion below
# writes to an independent frame rather than a filtered slice
meta_topics = meta_topics[meta_topics['Topic'] != ''].copy()
# the sheet delivers strings; topic ids must be integers to join on
meta_topics['Topic'] = meta_topics['Topic'].astype(int)
# merge meta-topics with predicted topics from data (one row per document, in
# fit order). The result is concatenated by position below, so the merge must
# not add rows: `validate` raises if a topic id appears twice in the sheet.
stage1 = pd.merge(pd.Series(topic_model.topics_, name='Topic'), meta_topics,
                  on='Topic', how='left', validate='many_to_one')
# append topic probabilities and meta-topic to main dataframe; `reset_index()`
# keeps the original row label as column 'index', used as the merge key below
stage2 = pd.concat([db.reset_index(), stage1, pd.DataFrame(topic_model.probabilities_)], axis=1)
# drop count column
stage2 = stage2.drop(columns='Count')
# load predicted opinion file
opinion = pd.read_csv('/Users/..../csv_files/opinion.csv')
# keep the two score columns and give them readable names
opinion = opinion[['0', '1']].rename(columns={'0': 'support', '1': 'neutral or against'})
# take original dataset (all non-retweets, before the other cleaning filters)
stage3 = df[df['is_retweet'] == False].reset_index()
# The opinion scores are attached by row position; a length mismatch would be
# silently padded with NaN and shift nothing into place, so fail loudly.
if len(opinion) != len(stage3):
    raise ValueError(
        f"opinion file has {len(opinion)} rows but there are {len(stage3)} non-retweet tweets"
    )
# concat with opinion
stage4 = pd.concat([stage3, opinion], axis=1)
# merge opinion with topic on the original row label
dd = pd.merge(stage4, stage2, on='index', how='outer')
# change columns to string format (probability columns are integers)
dd.columns = [str(i) for i in dd.columns]
# remove doubled columns: keep the left-hand copy and strip its '_x' suffix
columns_to_keep = [col for col in dd.columns if not col.endswith('_y')]
dd = dd[columns_to_keep]
dd.columns = dd.columns.str.replace('_x$', '', regex=True)

# Prevalence of topics depending on opinion

In [None]:
# add explicit column for opinion to dataframe (better name):
# 0 -> 'support' has the larger score, 1 -> 'neutral or against' has
dd['opinion'] = dd[['support', 'neutral or against']].apply(np.argmax, axis = 1)
# tweets per topic (rows) and opinion (columns). crosstab fills absent
# combinations with 0; the previous value_counts approach left NaN there,
# which the builtin `sum` then propagated into the whole column total.
topic_opinion = pd.crosstab(dd['Topic'], dd['opinion']).rename_axis(index=None)
# Exclude BERTopic's outlier topic (-1) by label. The previous `[1:]` dropped
# whichever topic happened to come first.
# NOTE(review): assumes the dropped first row was meant to be topic -1; confirm.
topic_opinion = topic_opinion.drop(index=-1, errors='ignore')
# normalized to sum 1 within each opinion
stage1 = topic_opinion.div(topic_opinion.sum())
# merge with meta-topics dataframe
to_mplt = pd.merge(stage1, meta_topics, left_index = True, right_on = 'Topic')
# aggregate for meta topic depending on opinion
to_plot = to_mplt.groupby('Meta')[[0, 1]].sum()
# drop unwanted topics
to_plot = to_plot.drop('noise')
# difference in prevalence between opinion 0 and opinion 1
to_plot['diff'] = to_plot[0] - to_plot[1]
# sort
to_plot = to_plot.sort_values('diff')

# For explorative k-means clustering

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
# share of each opinion label per user (rows: username, columns: label).
# `reindex` guarantees both columns exist even if one label never occurs.
opinion_share = (
    dd.groupby('username')['opinion']
      .value_counts(normalize=True)
      .unstack(fill_value=0)
      .reindex(columns=[0, 1], fill_value=0)
)
# obtain usernames of pro non-compliance users: label 0 more frequent than label 1
noncompliers = opinion_share[0] > opinion_share[1]
# pass as list
noncompliers = list(noncompliers[noncompliers].index)
# select data of non-compliers only; vectorised membership test, and `.copy()`
# so later column assignments on `to_kmeans` do not write to a slice of `dd`
to_kmeans = dd[dd['username'].isin(noncompliers)].copy()
# select Friuli-related keywords
fvg = ['Trieste', 'Friuli', 'Trieste, Friuli-Venezia Giulia', 'FVG']
# Normalize the substrings by removing non-alphanumeric characters and converting to lowercase
fvg_normalized = [re.sub(r'\W+', '', s).lower() for s in fvg]
# Function to check if any fvg substring is in the original string
def contains_fvg(s, substrings=None):
    """Return True when ``s`` mentions one of the Friuli-related keywords.

    ``s`` is normalised the same way as the keywords (non-word characters
    removed, lower-cased) before the substring test.

    Parameters
    ----------
    s : str
        Free-text location. Non-string values (e.g. NaN for a missing
        location) never match.
    substrings : iterable of str, optional
        Already-normalised keywords to look for. Defaults to the notebook-level
        ``fvg_normalized`` list.

    Returns
    -------
    bool
    """
    # A missing location would make re.sub raise TypeError; treat as no match.
    if not isinstance(s, str):
        return False
    if substrings is None:
        substrings = fvg_normalized
    normalized_s = re.sub(r'\W+', '', s).lower()
    return any(sub in normalized_s for sub in substrings)
# Work on an independent frame so the column assignments below never write to
# a slice of `dd` (avoids SettingWithCopyWarning).
to_kmeans = to_kmeans.copy()
# distinct (non-missing) location strings used by these users
distinct_locations = to_kmeans['location'].value_counts().index
# Apply the function to each distinct location
results = [contains_fvg(s) for s in distinct_locations]
# locations recognised as being in Friuli-Venezia Giulia
fvg_loc = distinct_locations[results]
# change name of location column for better interpretability
# (missing locations are not in `fvg_loc`, so they become 'out-friuli')
to_kmeans['location'] = to_kmeans['location'].isin(fvg_loc).map({True: 'in-friuli', False: 'out-friuli'})
# remove nans
to_kmeans['type_1'] = to_kmeans['type_1'].fillna('')
to_kmeans['Meta'] = to_kmeans['Meta'].fillna('')
# define helper
def stacker(colname, to_drop = '', data = None):
    """Per-user relative frequency of each value in ``colname``.

    Parameters
    ----------
    colname : str
        Column whose values are counted within each ``username``.
    to_drop : label or list of labels, default ''
        Value column(s) removed from the result. Every label must exist,
        otherwise ``DataFrame.drop`` raises ``KeyError``.
    data : pandas.DataFrame, optional
        Frame to summarise; must contain ``username`` and ``colname``.
        Defaults to the notebook-level ``to_kmeans``.

    Returns
    -------
    pandas.DataFrame
        One row per username, one column per remaining value; the shares are
        computed before the drop, and absent values are filled with 0.
    """
    frame = to_kmeans if data is None else data
    result = frame.groupby('username')[colname].value_counts(normalize=True).unstack(fill_value=0)
    return result.drop(to_drop, axis=1)
# process data to obtain one feature row per user
users = to_kmeans.groupby('username')
tokm_gr = pd.concat([
           stacker('location', to_drop = 'out-friuli'), # obtain location summary (share of 'in-friuli' tweets)
           users['retweet_count'].median(),  # obtain median retweet count (method form; passing np.median to agg is deprecated)
           stacker('best_result', to_drop = ['joy', 'fear']), # obtain frequency of anger and sadness
           stacker('Meta', to_drop = ['', 'noise', 'genova port joins the protests', 'port blocking', 
                                     'port workers and workers union', 'prostests and Rome', 
                                      'protests and fasism', 'protests and media journalists', 
                                      'protests cause covid outbreak', 'puzzer', 'square of trieste ',
                                      'troubled protests and possible issues with protests ',
                                      'port workers announce port block', 'foreign hidden influences on the port'
                                     ]), # remove non-relevant topics
           users.agg(n=('type_1', 'size')) # add tweet count (rows per user)
          ], axis = 1)

## For elbow method

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def find_optimal_clusters(data, max_k):
    """Plot the k-means inertia for k = 2..max_k (elbow method).

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Feature matrix passed unchanged to ``KMeans.fit``.
    max_k : int
        Largest number of clusters to try (inclusive).

    Returns
    -------
    None
        The function only draws and shows the figure.
    """
    # Imported here because the notebook never imports pyplot, so every
    # `plt.` call below raised NameError.
    import matplotlib.pyplot as plt

    cluster_counts = list(range(2, max_k + 1))
    # within-cluster sum of squares for each candidate k (fixed seed)
    inertia = [
        KMeans(n_clusters=k, random_state=42).fit(data).inertia_
        for k in cluster_counts
    ]

    plt.figure(figsize=(8, 5))
    plt.plot(cluster_counts, inertia, marker='o')
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')
    plt.show()

# Run the elbow diagnostic for k = 2..10 on standardised features, i.e. the
# same representation the final k-means model is fitted on. On the raw frame
# the inertia would be dominated by the columns with the largest scale.
find_optimal_clusters(StandardScaler().fit_transform(tokm_gr), 10)

## Fit k-means

In [None]:
# number of clusters chosen after the elbow method
k = 4
# standardise every feature column before clustering
scaler = StandardScaler()
df_scaled = scaler.fit_transform(tokm_gr)
# k-means model with a fixed seed
kmeans = KMeans(n_clusters=k, random_state=88)
kmeans.fit(df_scaled)
# centroids (in standardised units) as a labelled dataframe
centroid_labels = [f'centroid {i}' for i in range(k)]
to_mplt = pd.DataFrame(kmeans.cluster_centers_,
                       columns=tokm_gr.columns,
                       index=centroid_labels)

# Difference between average pro non-compliance and pro compliance user

In [None]:
# Per-user flag, True when opinion label 0 is more frequent than label 1.
# Recomputed from `dd` because `noncompliers` has been turned into a plain list
# of usernames by this point, so `noncompliers.rename(...)` raised AttributeError.
opinion_share = (
    dd.groupby('username')['opinion']
      .value_counts(normalize=True)
      .unstack(fill_value=0)
      .reindex(columns=[0, 1], fill_value=0)
)
is_noncomplier = (opinion_share[0] > opinion_share[1]).rename('non-complier')
# merge the flag onto the tweet feed (inner join on username)
tokm_gr = pd.merge(dd, is_noncomplier, left_on='username', right_index=True)
# add location information again. Every distinct location is classified here:
# `fvg_loc` was built from non-compliers' tweets only, so Friuli locations used
# solely by other users were labelled 'out-friuli'.
location_is_fvg = {
    loc: contains_fvg(loc)
    for loc in tokm_gr['location'].dropna().unique()
    if isinstance(loc, str)
}
tokm_gr['location'] = ['in-friuli' if location_is_fvg.get(loc, False) else 'out-friuli'
                       for loc in tokm_gr['location']]
# select non-relevant topics
to_drop = ['noise', 'genova port joins the protests', 'port blocking', 'port workers and workers union', 
           'prostests and Rome', 'protests and fasism', 'protests and media journalists', 
           'protests cause covid outbreak', 'puzzer', 'square of trieste ',
           'troubled protests and possible issues with protests ', 'port workers announce port block', 
           'foreign hidden influences on the port']
# one group per (user, flag) pair
user_groups = tokm_gr.groupby(['username', 'non-complier'])
to_diff = pd.concat([
    # median retweet count
    user_groups['retweet_count'].median(),
    # share of the user's tweets located in Friuli. The previous
    # `value_counts()[0] / len(x)` was positional and returned the share of
    # the user's most frequent label, whichever label that was.
    user_groups['location'].agg(lambda x: x.eq('in-friuli').mean()),
    # frequency of emotion displayed in tweets
    user_groups['best_result'].value_counts(normalize = True).unstack(level = 2, fill_value = 0).drop(['joy', 'fear'], axis = 1),
    # frequency of topics found in tweets
    user_groups['Meta'].value_counts(normalize = True).unstack(level = 2, fill_value = 0).drop(to_drop, axis = 1),
    # add n of published tweets (rows per group)
    user_groups.agg(n=('type_1', 'size'))
    ], axis = 1)