In [None]:
import ast
import re
import time

import pandas as pd
import numpy as np

from fuzzywuzzy import fuzz

In [None]:
# Raw tweet export; the file is decoded as latin1.
tweets_csv = 'tweets/tweets_22.csv'
df = pd.read_csv(tweets_csv, encoding='latin1')

### preprocess

In [None]:
# Build a normalised copy of every tweet in `process_text`: strip URLs, then
# @mentions and any non-word/non-space character, then standalone "RT"
# markers, and finally lowercase the result.
raw_text = df['text']
without_urls = raw_text.str.replace(r'https\S+', '', regex=True)
without_mentions = without_urls.str.replace(r'\@\w+|[^\w\s]', '', regex=True)
without_rt = without_mentions.str.replace(r'\bRT\b', '', regex=True)
df['process_text'] = without_rt.str.lower()

# Exclude tweets whose raw text contains any of these keywords
# (case-insensitive), then drop rows that have no processed text.
keywords = ['cinema', 'film', 'attila', 'unni', 'virzi', 'virzì', 'boxoffice', 'venezia79']
keywords = r'(?:' + '|'.join(keywords) + r')'

has_excluded_keyword = df['text'].str.contains(keywords, case=False, na=False)
df_filtered = df[~has_excluded_keyword].dropna(subset=['process_text'])

In [None]:
# Remove duplicate tweets based on fuzzy matching (similarity threshold > 90)

def remove_dup(df, threshold=90):
    """Drop rows whose `process_text` nearly duplicates an earlier kept row.

    Rows are scanned in order. A row is kept when `fuzz.ratio` between its
    text and every previously kept text is at most `threshold`; otherwise it
    is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `process_text` column of strings.
    threshold : int, default 90
        A row is a duplicate when its score against a kept text is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame (copy) with the kept rows, in their original order.
    """
    kept_texts = []
    kept_exact = set()
    keep_mask = []
    # Identical strings score 100, so an exact repeat is a duplicate for any
    # threshold below 100 and does not need to be scored against every kept text.
    exact_repeat_is_duplicate = threshold < 100

    for tweet in df['process_text']:
        if exact_repeat_is_duplicate and tweet in kept_exact:
            is_duplicate = True
        else:
            is_duplicate = any(fuzz.ratio(tweet, kept) > threshold for kept in kept_texts)

        if not is_duplicate:
            kept_texts.append(tweet)
            kept_exact.add(tweet)
        keep_mask.append(not is_duplicate)

    # Select by position rather than dropping by label: dropping by label
    # would also remove kept rows that share an index label with a duplicate.
    return df.loc[np.array(keep_mask, dtype=bool)].copy()

# De-duplicated view of the filtered tweets.
df_processed = df_filtered.pipe(remove_dup)

### location extraction

In [None]:
import spacy
# !python -m spacy download it_core_news_sm

In [None]:
# Italian spaCy pipeline; its entity annotations are read by the cells below.
ITALIAN_MODEL = "it_core_news_sm"
nlp_it = spacy.load(ITALIAN_MODEL)

In [None]:
# Extract Italian geographic entities (LOC, GPE) from text

def find_loc_gpe_it(text):
    """Return the text of every LOC or GPE entity that `nlp_it` finds in `text`.

    Entities come back in the order `doc.ents` yields them; entities with any
    other label are ignored.
    """
    geo_labels = ['LOC', 'GPE']
    return [entity.text for entity in nlp_it(text).ents if entity.label_ in geo_labels]

# One list of LOC/GPE entity strings per tweet.
df_processed['loc'] = df_processed['process_text'].map(find_loc_gpe_it)

In [None]:
# Load Italian geonames for matching

geonames = pd.read_excel('tweets/geonames.xlsx')
locations_it = [item.lower() if isinstance(item, str) else item for item in geonames[2].tolist()]

# Set copy of the gazetteer: membership is tested once per extracted entity
# instead of scanning the whole list for every tweet.
locations_it_lookup = set(locations_it)

# Keep tweets with at least one extracted entity that exactly equals a
# gazetteer name. `.copy()` makes `df_italy` an independent frame, so the
# column assignments in later cells do not raise SettingWithCopyWarning.
mentions_italian_place = df_processed['loc'].apply(
    lambda entities: any(entity in locations_it_lookup for entity in entities))
df_italy = df_processed[mentions_italian_place].copy()

In [None]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError
from geopy.extra.rate_limiter import RateLimiter
from requests.exceptions import RequestException

In [None]:
# Dictionary mapping keywords to custom NUTS2 regions

# NUTS2 codes shared by every Po-related name below.
_PO_NUTS2 = ('ITC1', 'ITC4', 'ITH3', 'ITH4', 'ITH5')

# Exact location strings that stand for several NUTS2 regions at once.
location_to_nuts2_custom = {
    **{name: list(_PO_NUTS2) for name in ('po', 'po valley', 'po river', 'the po valley')},
    'alpi': ['ITC1', 'ITC2', 'ITC4', 'ITH1', 'ITH2', 'ITH3', 'ITH4'],
}


def to_nuts2_custom(locations):
    """Return the NUTS2 codes of the locations that exactly equal a custom key.

    Locations without an exact entry in `location_to_nuts2_custom` add
    nothing. The result is a de-duplicated list in no particular order.
    """
    matched_codes = set()
    for name in locations:
        codes = location_to_nuts2_custom.get(name)
        if codes is not None:
            matched_codes.update(codes)
    return list(matched_codes)

# First pass: multi-region names (Po, Alps) resolved by exact match.
df_italy['nuts'] = df_italy['loc'].apply(to_nuts2_custom)

In [None]:
# Dictionary mapping keywords to NUTS2 regions

location_to_nuts2 = {
    'abruzzo': ['ITF1'],
    'aosta': ['ITC2'],
    'daosta': ['ITC2'],
    'basilicata': ['ITF5'],
    'bolzano bozen': ['ITH1'],
    'bolzanobozen': ['ITH1'],
    'calabria': ['ITF6'],
    'campania': ['ITF3'],
    'emilia': ['ITH5'],
    'emilia romagna': ['ITH5'],
    'emiliaromagna': ['ITH5'],
    'romagna': ['ITH5'],
    'friuli venezia giulia': ['ITH4'],
    'friulivenezia giulia': ['ITH4'],
    'friuliveneziagiulia': ['ITH4'],
    'lazio': ['ITI4'],
    'liguria': ['ITC3'],
    'lombardia': ['ITC4'],
    'lombardy': ['ITC4'],
    'marche': ['ITI3'],
    'molise': ['ITF2'],
    'piemonte': ['ITC1'],
    'puglia': ['ITF4'],
    'sardegna': ['ITG2'],
    'sardinia': ['ITG2'],
    'sicilia': ['ITG1'],
    'sicily': ['ITG1'],
    'toscana': ['ITI1'],
    'trentino alto adige': ['ITH2'],
    'trentinoalto adige': ['ITH2'],
    'trentinoaltoadige': ['ITH2'],
    'umbria': ['ITI2'],
    'veneto': ['ITH3'],
    
    'monte bianco': ['ITC2']
}

# Map extracted locations to NUTS2 codes

def to_nuts2(locations, min_prefix_len=4):
    """Map location strings to NUTS2 codes using `location_to_nuts2`.

    Each location is processed in two steps:

    1. Every region name contained in the location contributes its codes and
       is removed from the location string.
    2. Each remaining word that is a prefix of a region name contributes the
       codes of the first such region in dictionary order (so "trentino"
       resolves to "trentino alto adige").

    Parameters
    ----------
    locations : iterable of str
        Location strings to resolve.
    min_prefix_len : int, default 4
        Words shorter than this are skipped in step 2. Without a minimum,
        articles and other short words resolve to regions: "la" in
        "la spezia" is a prefix of "lazio". Pass 1 to accept any length.

    Returns
    -------
    list of str
        De-duplicated NUTS2 codes, in no particular order.
    """
    nuts_regions = set()
    for loc in locations:
        # Step 1: region names that appear inside the location string.
        for region, codes in location_to_nuts2.items():
            if region in loc:
                nuts_regions.update(codes)
                loc = loc.replace(region, "")

        # Step 2: leftover words treated as truncated region names.
        for part in loc.split():
            if len(part) < min_prefix_len:
                continue
            for region, codes in location_to_nuts2.items():
                if region.startswith(part):
                    nuts_regions.update(codes)
                    break
    return list(nuts_regions)

def _nuts2_if_missing(row):
    """Return the row's existing codes, or a NUTS2 lookup when it has none."""
    existing = row['nuts']
    if existing == []:
        return to_nuts2(row['loc'])
    return existing

df_italy['nuts'] = df_italy.apply(_nuts2_if_missing, axis=1)

In [None]:
# Dictionary mapping keywords to NUTS3 regions

location_to_nuts3_custom = {
    'torino': ['ITC11'],
    'vercelli': ['ITC12'],
    'biella': ['ITC13'],
    'verbano-cusio-ossola': ['ITC14'],
    'novara': ['ITC15'],
    'cuneo': ['ITC16'],
    'asti': ['ITC17'],
    'alessandria': ['ITC18'],
    'imperia': ['ITC31'],
    'savona': ['ITC32'],
    'genova': ['ITC33'],
    'la spezia': ['ITC34'],
    'varese': ['ITC41'],
    'como': ['ITC42'],
    'lecco': ['ITC43'],
    'sondrio': ['ITC44'],
    'bergamo': ['ITC46'],
    'brescia': ['ITC47'],
    'pavia': ['ITC48'],
    'lodi': ['ITC49'],
    'cremona': ['ITC4A'],
    'mantova': ['ITC4B'],
    'milano': ['ITC4C'],
    'milan': ['ITC4C'],
    'monza e della brianza': ['ITC4D'],
    'l’aquila': ['ITF11'],
    'teramo': ['ITF12'],
    'pescara': ['ITF13'],
    'chieti': ['ITF14'],
    'isernia': ['ITF21'],
    'campobasso': ['ITF22'],
    'caserta': ['ITF31'],
    'benevento': ['ITF32'],
    'napoli': ['ITF33'],
    'avellino': ['ITF34'],
    'salerno': ['ITF35'],
    'taranto': ['ITF43'],
    'brindisi': ['ITF44'],
    'lecce': ['ITF45'],
    'foggia': ['ITF46'],
    'bari': ['ITF47'],
    'barletta-andria-trani': ['ITF48'],
    'potenza': ['ITF51'],
    'matera': ['ITF52'],
    'cosenza': ['ITF61'],
    'crotone': ['ITF62'],
    'catanzaro': ['ITF63'],
    'vibo valentia': ['ITF64'],
    'reggio calabria': ['ITF65'],
    'trapani': ['ITG11'],
    'palermo': ['ITG12'],
    'messina': ['ITG13'],
    'agrigento': ['ITG14'],
    'caltanissetta': ['ITG15'],
    'enna': ['ITG16'],
    'catania': ['ITG17'],
    'ragusa': ['ITG18'],
    'siracusa': ['ITG19'],
    'sassari': ['ITG2D'],
    'nuoro': ['ITG2E'],
    'cagliari': ['ITG2F'],
    'oristano': ['ITG2G'],
    'sud sardegna': ['ITG2H'],
    'bolzano-bozen': ['ITH10'],
    'trento': ['ITH20'],
    'verona': ['ITH31'],
    'vicenza': ['ITH32'],
    'belluno': ['ITH33'],
    'treviso': ['ITH34'],
    'venezia': ['ITH35'],
    'venice': ['ITH35'],
    'padova': ['ITH36'],
    'rovigo': ['ITH37'],
    'pordenone': ['ITH41'],
    'udine': ['ITH42'],
    'gorizia': ['ITH43'],
    'trieste': ['ITH44'],
    'piacenza': ['ITH51'],
    'parma': ['ITH52'],
    'reggio nell’emilia': ['ITH53'],
    'modena': ['ITH54'],
    'bologna': ['ITH55'],
    'ferrara': ['ITH56'],
    'ravenna': ['ITH57'],
    'forlì-cesena': ['ITH58'],
    'rimini': ['ITH59'],
    'massa-carrara': ['ITI11'],
    'lucca': ['ITI12'],
    'pistoia': ['ITI13'],
    'firenze': ['ITI14'],
    'prato': ['ITI15'],
    'livorno': ['ITI16'],
    'pisa': ['ITI17'],
    'arezzo': ['ITI18'],
    'siena': ['ITI19'],
    'grosseto': ['ITI1A'],
    'perugia': ['ITI21'],
    'terni': ['ITI22'],
    'pesaro e urbino': ['ITI31'],
    'ancona': ['ITI32'],
    'macerata': ['ITI33'],
    'ascoli piceno': ['ITI34'],
    'fermo': ['ITI35'],
    'viterbo': ['ITI41'],
    'rieti': ['ITI42'],
    'roma': ['ITI43'],
    'rome': ['ITI43'],
    'latina': ['ITI44'],
    'frosinone': ['ITI45'],
    
    'garda': ['ITC47','ITH20', 'ITH32'],
    'maggiore': ['ITC14', 'ITC15', 'ITC41'],
    'lago di como': ['ITC42', 'ITC43'],
    'lago diseo': ['ITC46', 'ITC47'],
    
    'salento': ['ITF43', 'ITF44', 'ITF45'],
    'polesine': ['ITH37'],
    'tevere': ['ITI43'],
    'reno': ['ITH54', 'ITH55', 'ITH56'],
    # NOTE(review): 'ITH544' has six characters while every other NUTS3 code
    # in this table has five - probably a typo (perhaps 'ITH44'); confirm.
    'isonzo':['ITH43', 'ITH544'],
    
    'san martino':['N/A'],
    'paese':['N/A'],
    'montagna':['N/A'],
    'duomo':['N/A']
}


def _build_nuts3_lookup(table):
    """Return `table` extended with punctuation-free spellings of its keys.

    The `process_text` column has every non-word, non-space character
    removed, so a key such as 'forlì-cesena' or 'l’aquila' can never occur in
    a location extracted from it. For every key two extra spellings are
    registered: punctuation deleted ('forlìcesena') and punctuation replaced
    by a space ('forlì cesena'). Original keys always win over derived ones.
    """
    lookup = dict(table)
    for name, codes in table.items():
        glued = re.sub(r'[^\w\s]', '', name)
        spaced = ' '.join(re.sub(r'[^\w\s]', ' ', name).split())
        for variant in (glued, spaced):
            lookup.setdefault(variant, codes)
    return lookup


_NUTS3_LOOKUP = _build_nuts3_lookup(location_to_nuts3_custom)

# Map extracted locations to NUTS3 codes

def to_nuts3_custom(locations):
    """Map location strings to NUTS3 codes using `location_to_nuts3_custom`.

    For each location, every key that occurs inside it is a candidate and the
    longest candidate supplies the codes (ties go to the earlier key). Taking
    the longest key instead of the first in dictionary order stops a short
    key from shadowing a longer one that contains it: 'ravenna' resolves to
    Ravenna rather than 'enna', and 'lago di como' to the lake entry rather
    than 'como'.

    Matching is still substring-based, so a key embedded in an unrelated
    longer word can match.

    Parameters
    ----------
    locations : iterable of str
        Location strings to resolve.

    Returns
    -------
    list of str
        De-duplicated codes (may include the 'N/A' marker), in no particular
        order.
    """
    nuts_regions = set()
    for loc in locations:
        candidates = [name for name in _NUTS3_LOOKUP if name in loc]
        if candidates:
            nuts_regions.update(_NUTS3_LOOKUP[max(candidates, key=len)])
    return list(nuts_regions)

def _nuts3_if_missing(row):
    """Return the row's existing codes, or a NUTS3 lookup when it has none."""
    existing = row['nuts']
    if existing == []:
        return to_nuts3_custom(row['loc'])
    return existing

df_italy['nuts'] = df_italy.apply(_nuts3_if_missing, axis=1)

In [None]:
# Select the rows whose `nuts` list is still empty. `.copy()` makes the
# selection an independent frame, so writing new columns into it does not
# raise SettingWithCopyWarning.

df_blank_nuts = df_italy[df_italy['nuts'].apply(lambda x: len(x) == 0)].copy()

In [None]:
# Look up a place name with the Nominatim geocoding API and return the
# ISO 3166-2 subdivision code found in its address.

# Rate-limited geocoder shared by all calls; created on first use. Building a
# new RateLimiter for every call would discard the limiter's memory of the
# previous request, so `min_delay_seconds` would never apply between calls.
_nuts3_geocode = None


def get_nuts3(location):
    """Geocode `location` and return its Italian ISO 3166-2 code.

    Parameters
    ----------
    location : str
        Free-text place name.

    Returns
    -------
    tuple
        `(code, level)`, where `code` is the first of the address fields
        'ISO3166-2-lvl6', 'ISO3166-2-lvl4', 'ISO3166-2-lvl8' that is present
        and `level` is 'lvl6', 'lvl4' or 'lvl8' accordingly. `(None, None)`
        when the name is empty, nothing is found, the result is not in Italy
        (`country_code` other than 'it'), none of the fields is present, or
        the request fails.
    """
    global _nuts3_geocode

    if not location:
        return None, None

    if _nuts3_geocode is None:
        geolocator = Nominatim(user_agent="nuts3_locator")
        _nuts3_geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

    try:
        location_data = _nuts3_geocode(location, addressdetails=True, extratags=True)
    except (GeocoderTimedOut, GeocoderServiceError, RequestException) as e:
        print(f"Geocoding error: {e}")
        return None, None

    if not location_data or 'address' not in location_data.raw:
        return None, None

    address = location_data.raw['address']
    if address.get('country_code') != 'it':
        return None, None

    # Preference order is lvl6, then lvl4, then lvl8.
    for level in ('lvl6', 'lvl4', 'lvl8'):
        field = f'ISO3166-2-{level}'
        if field in address:
            return address[field], level

    return None, None

def process_locations(df):
    """Geocode every location in `df['loc']` and store the codes on `df`.

    For each row, every stripped location string is resolved with
    `get_nuts3`. The distinct codes are written to `df['nuts3']` and the level
    of each code's first occurrence to `df['level']`, both as ', '-joined
    strings in matching order. One summary line is printed per row.

    Each distinct location string is geocoded once per call and its result,
    including "not found", is reused for later rows; a one-second pause
    follows every real lookup.

    No retry loop is needed here: `get_nuts3` handles geocoder errors itself
    and reports them as `(None, None)`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `loc` column holding lists of strings. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with `nuts3` and `level` columns filled.
    """
    geocoded = {}  # location string -> (code, level)

    # Create the output columns up front so that an empty frame gets them too.
    for column in ('nuts3', 'level'):
        if column not in df.columns:
            df[column] = ''

    for idx, locations in df['loc'].items():
        level_by_code = {}  # insertion-ordered: code -> level of first occurrence

        for location in (loc.strip() for loc in locations):
            if location not in geocoded:
                geocoded[location] = get_nuts3(location)
                time.sleep(1)
            nuts3_code, level = geocoded[location]
            if nuts3_code is not None:
                level_by_code.setdefault(nuts3_code, level)

        codes_text = ', '.join(level_by_code)
        levels_text = ', '.join(level_by_code.values())

        df.loc[idx, 'nuts3'] = codes_text
        df.loc[idx, 'level'] = levels_text

        print(f"{locations}: {codes_text} (Levels: {levels_text})")

    return df

# Geocode the rows that the keyword tables could not resolve.
df_blank_nuts = df_blank_nuts.pipe(process_locations)

In [None]:
# Translate the ISO 3166-2:IT codes returned by Nominatim into NUTS codes:
# province codes (two letters) map to NUTS3, region codes (two digits) to NUTS2.

nuts3_mapping = {
    'IT-AG': 'ITG14',
    'IT-AL': 'ITC18',
    'IT-AN': 'ITI32',
    'IT-AR': 'ITI18',
    'IT-AP': 'ITI34',
    'IT-AV': 'ITF34',
    'IT-BA': 'ITF47',
    'IT-BT': 'ITF48',
    'IT-BL': 'ITH33',
    'IT-BN': 'ITF32',
    'IT-BG': 'ITC46',
    'IT-BI': 'ITC13',
    'IT-BO': 'ITH55',
    'IT-BS': 'ITC47',
    'IT-CA': 'ITG2F',
    'IT-CL': 'ITG15',
    'IT-CE': 'ITF31',
    'IT-CT': 'ITG17',
    'IT-CH': 'ITF14',
    'IT-CO': 'ITC42',
    'IT-CS': 'ITF61',
    'IT-CR': 'ITC4A',
    'IT-CN': 'ITC16',
    'IT-EN': 'ITG16',
    'IT-FM': 'ITI35',
    'IT-FE': 'ITH56',
    'IT-FI': 'ITI14',
    'IT-FG': 'ITF46',
    'IT-FC': 'ITH58',
    'IT-FR': 'ITI45',
    'IT-GE': 'ITC33',
    'IT-GO': 'ITH43',
    'IT-GR': 'ITI1A',
    'IT-IM': 'ITC31',
    'IT-IS': 'ITF21',
    'IT-AQ': 'ITF11',
    'IT-SP': 'ITC34',
    'IT-LT': 'ITI44',
    'IT-LE': 'ITF45',
    'IT-LC': 'ITC43',
    'IT-LI': 'ITI16',
    'IT-LO': 'ITC49',
    'IT-LU': 'ITI12',
    'IT-MC': 'ITI33',
    'IT-MN': 'ITC4B',
    'IT-MS': 'ITI11',
    'IT-ME': 'ITG13',
    'IT-MI': 'ITC4C',
    'IT-MO': 'ITH54',
    'IT-MB': 'ITC4D',
    'IT-NA': 'ITF33',
    'IT-NO': 'ITC15',
    'IT-PD': 'ITH36',
    'IT-PA': 'ITG12',
    'IT-PR': 'ITH52',
    'IT-PV': 'ITC48',
    'IT-PG': 'ITI21',
    'IT-PU': 'ITI31',
    'IT-PC': 'ITH51',
    'IT-PN': 'ITH41',
    'IT-PZ': 'ITF51',
    'IT-BZ': 'ITH10',
    'IT-TN': 'ITH20',
    'IT-RG': 'ITG18',
    'IT-RA': 'ITH57',
    'IT-RC': 'ITF65',
    'IT-RE': 'ITH53',
    'IT-RI': 'ITI42',
    'IT-RM': 'ITI43',
    'IT-RO': 'ITH37',
    'IT-SA': 'ITF35',
    'IT-SS': 'ITG2D',
    'IT-SV': 'ITC32',
    'IT-SI': 'ITI19',
    'IT-SO': 'ITC44',
    'IT-SU': 'ITG2H',
    'IT-TR': 'ITI22',
    'IT-TO': 'ITC11',
    'IT-TP': 'ITG11',
    'IT-TV': 'ITH34',
    'IT-UD': 'ITH42',
    'IT-VA': 'ITC41',
    'IT-VE': 'ITH35',
    'IT-VB': 'ITC14',
    'IT-VC': 'ITC12',
    'IT-VR': 'ITH31',
    'IT-VI': 'ITH32',
    'IT-VT': 'ITI41',

    # Provinces that were missing from the table.
    'IT-AT': 'ITC17',
    'IT-BR': 'ITF44',
    'IT-CB': 'ITF22',
    'IT-CZ': 'ITF63',
    'IT-KR': 'ITF62',
    'IT-MT': 'ITF52',
    'IT-NU': 'ITG2E',
    'IT-OR': 'ITG2G',
    'IT-PE': 'ITF13',
    'IT-PI': 'ITI17',
    'IT-PT': 'ITI13',
    'IT-PO': 'ITI15',
    'IT-RN': 'ITH59',
    'IT-SR': 'ITG19',
    'IT-TA': 'ITF43',
    'IT-TE': 'ITF12',
    'IT-TS': 'ITH44',
    'IT-VV': 'ITF64',

    # Regions (ISO 3166-2:IT two-digit codes -> NUTS2).
    # IT-34 is Veneto, IT-36 Friuli Venezia Giulia, IT-45 Emilia-Romagna and
    # IT-57 Marche; Lazio is IT-62.
    'IT-23': 'ITC2',
    'IT-45': 'ITH5',
    'IT-34': 'ITH3',
    'IT-25': 'ITC4',
    'IT-36': 'ITH4',
    'IT-52': 'ITI1',
    'IT-21': 'ITC1',
    'IT-57': 'ITI3',
    'IT-62': 'ITI4',
    'IT-42': 'ITC3',
    'IT-55': 'ITI2',
    'IT-65': 'ITF1',
    'IT-67': 'ITF2',
    'IT-72': 'ITF3',
    'IT-75': 'ITF4',
    'IT-77': 'ITF5',
    'IT-78': 'ITF6',
    'IT-82': 'ITG1',
    'IT-88': 'ITG2'
    # IT-32 (Trentino-Alto Adige) is left out on purpose: it spans two NUTS2
    # regions (ITH1 and ITH2), so a single code would be ambiguous.
}

In [None]:
# Map Nominatim extracted locations to NUTS3 codes

def map_nuts(nuts3_list, mapping):
    """Translate every code in `nuts3_list` through `mapping`, all or nothing.

    Returns the list of translated codes when every item is a key of
    `mapping`; as soon as one item is missing the result is `['']`.
    """
    translated = []
    for code in nuts3_list:
        if code not in mapping:
            return ['']
        translated.append(mapping.get(code))
    return translated
    
# Split each ', '-joined code string back into a list, then translate the
# list into NUTS codes.
df_blank_nuts['nuts3'] = df_blank_nuts['nuts3'].map(lambda joined: joined.split(', '))
df_blank_nuts['nuts'] = df_blank_nuts['nuts3'].map(lambda codes: map_nuts(codes, nuts3_mapping))

In [None]:
# Combine the Nominatim results with previous results

# Write the geocoded rows back into the main frame, then drop the `loc` column.
df_italy.loc[df_blank_nuts.index] = df_blank_nuts
df_italy = df_italy.drop('loc', axis = 1)

# Discard rows whose `nuts` value is exactly [''] (unmapped) or ['N/A'].
# `.copy()` makes the result an independent frame, so the columns added to it
# in later cells do not raise SettingWithCopyWarning.
df_italy_nuts = df_italy[df_italy['nuts'].apply(lambda x: x != [''] and x != ['N/A'])].copy()

### remove stopwords for embeddings

In [None]:
# Remove numbers and entities for embeddings, while drought-related locations are preserved

def remove_loc_it(text):
    """Return `text` with location entities and number-like tokens removed.

    LOC/GPE entities found by `nlp_it` are deleted unless one of their words
    is a preserved keyword (for example 'po', 'tevere', 'lago'). Tokens with
    `like_num` set are deleted as well. Only the matched characters are
    removed; surrounding whitespace is left in place.

    The keyword test compares whole words, so 'po' preserves "po" and
    "fiume po" but not "napoli", which merely contains those letters.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The text without the removed spans.
    """
    doc = nlp_it(text)

    preserve_keywords = {'po', 'tevere', 'lago', 'naviglio', 'canale', 'adda', 'ladda', 'oglio'}

    spans = []

    for ent in doc.ents:
        if ent.label_ in ('LOC', 'GPE'):
            words = ent.text.lower().split()
            if not any(word in preserve_keywords for word in words):
                spans.append((ent.start_char, ent.end_char))

    for token in doc:
        if token.like_num:
            spans.append((token.idx, token.idx + len(token.text)))

    # Merge overlapping spans (a number inside a location entity) before
    # cutting; removing them one after another would shift the offsets of the
    # enclosing span and delete characters that follow it.
    spans.sort()
    merged = []
    for start, end in spans:
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    pieces = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        cursor = end
    pieces.append(text[cursor:])

    return ''.join(pieces)

# Processed text without non-preserved locations and without numbers.
df_italy_nuts['noent_text'] = df_italy_nuts['process_text'].map(remove_loc_it)

In [None]:
from nltk.corpus import stopwords

In [None]:
# Remove stopwords and specific Italian stopwords

italian_stopwords = set(stopwords.words('italian'))

# Extra words removed together with the NLTK Italian stop-word list.
keywords = ['clima','climatica','climatici','cambiamento climatico','cambiamentoclimatico','caldo','temperatura',
            'precipitazione','precipitazioni','pioggia','piogge','maltempo','siccità','siccita',
            'cè','qui','via','ore','giorni','mesi','anni','così','già','fa','fatto',
            'drought','temperature','climate','climateemergency','climatecrisis','climatechange','the']

all_keywords = set(keywords + list(italian_stopwords))

# One alternation of all words, anchored on word boundaries.
pattern = r'\b(?:' + '|'.join(re.escape(word) for word in all_keywords) + r')\b'
stopword_re = re.compile(pattern, flags=re.IGNORECASE)

# Delete the words (case-insensitive), trim the ends, then collapse every
# run of whitespace into a single space.
without_stopwords = df_italy_nuts['noent_text'].apply(lambda x: stopword_re.sub('', x))
df_italy_nuts['text_clean'] = without_stopwords.str.strip().replace(r'\s+', ' ', regex=True)

# Keep only rows that still have some text after cleaning.
has_text = df_italy_nuts['text_clean'] != ''
df_embeddings = df_italy_nuts[has_text].dropna(subset=['text_clean'])

In [None]:
# Save the cleaned tweets for the embedding step.
clean_tweets_xlsx = 'tweets/tweets_11578_clean.xlsx'
df_embeddings.to_excel(clean_tweets_xlsx)