## This notebook aims to filter the original dataset using OpenAI API

### Imports

In [None]:
import pandas as pd
import os
import openai
import time
from tqdm.notebook import tqdm
import tiktoken
from dotenv import load_dotenv



In [2]:
politicians_data = {
    'FdI_Meloni': {'file': 'politicians_data/GiorgiaMeloni_tweets_combined.csv', 'politician': 'Meloni', 'party': 'FdI'},
    'FdI_LaRussa': {'file': 'politicians_data/IgnazioLaRussa_tweets_combined.csv', 'politician': 'LaRussa', 'party': 'FdI'},
    'FI_Berlusconi': {'file': 'politicians_data/SilvioBerlusconi_tweets_combined.csv', 'politician': 'Berlusconi', 'party': 'FI'},
    'FI_Tajani': {'file': 'politicians_data/AntonioTajani_tweets_combined.csv', 'politician': 'Tajani', 'party': 'FI'},
    'Lega_Salvini': {'file': 'politicians_data/MatteoSalvini_tweets_combined.csv', 'politician': 'Salvini', 'party': 'Lega'},
    'M5S_DiMaio': {'file': 'politicians_data/luigidimaio_tweets_combined.csv', 'politician': 'Di Maio', 'party': 'M5S'},
    'M5S_Conte': {'file': 'politicians_data/GiuseppeConte_tweets_combined.csv', 'politician': 'Conte', 'party': 'M5S'},
    'Az_Calenda': {'file': 'politicians_data/CarloCalenda_tweets_combined.csv', 'politician': 'Calenda', 'party': 'Azione'},
    'IV_Renzi': {'file': 'politicians_data/MatteoRenzi_tweets_combined.csv', 'politician': 'Renzi', 'party': 'IV'},
    'PEeur_Bonino': {'file': 'politicians_data/emmabonino_tweets_combined.csv', 'politician': 'Bonino', 'party': 'PEeur'},
    'PD_Shlein': {'file': 'politicians_data/EllySchlein_tweets_combined.csv', 'politician': 'Schlein', 'party': 'PD'},
    'PD_Letta': {'file': 'politicians_data/EnricoLetta_tweets_combined.csv', 'politician': 'Letta', 'party': 'PD'},
    'EV_Fratoianni': {'file': 'politicians_data/NicolaFratoianni_tweets_combined.csv', 'politician': 'Fratoianni', 'party': 'EV'},
    'NcI_Lupi': {'file': 'politicians_data/MaurizioLupi_tweets_combined.csv', 'politician': 'Lupi', 'party': 'NcI'}
}

politicians_list = []
for key, data in politicians_data.items():
    df = pd.read_csv(data['file'])
    df['politician'] = data['politician']
    df['party'] = data['party']
    politicians_list.append(df)

politicians = pd.concat(politicians_list, ignore_index=True)

users = pd.read_csv('train_data/user_tweets.csv')


In [3]:
users['ideology_multiclass']= users['ideology_multiclass'].astype('category')
dict(enumerate(users['ideology_multiclass'].cat.categories))

{0: 'left', 1: 'moderate_left', 2: 'moderate_right', 3: 'right'}

In [4]:
politicians = politicians.drop(columns=[politicians.columns[2], politicians.columns[5]])

In [5]:
politicians_parties = {
    'Meloni': 3,  # 'right'
    'Salvini': 3,  # 'right'
    'Berlusconi': 2,  # 'moderate_right'
    'Letta': 0,  # 'moderate_left'
    'Conte': 1,  # 'moderate_left'
    'Calenda': 1,  # 'moderate_left'
    'Renzi': 1,  # 'moderate_left'
    'Bonino': 0,  # 'left'
    'Di Maio': 1,  # 'moderate_left'
    'Fratoianni': 0,  # 'left'
    'Bonelli': 0,  # 'left'
    'Lupi': 2,  # 'moderate_right'
    'La Russa': 3,  # 'right'
    'Tajani': 2,  # 'moderate_right'
    'Schlein': 0   # 'left'
}
politicians['ideology_num'] = politicians['politician'].map(politicians_parties)
politicians['ideology_num'].value_counts()

ideology_num
1.0    11638
3.0     7882
0.0     4575
2.0     3320
Name: count, dtype: int64

In [6]:
politicians['ideology_num'] = politicians['ideology_num'].apply(lambda x: 3 if x in (2.0, 3.0) else 0)

### We now filter the tweets using the api keeping in mind the rate limits are the following: 30 000 tokens per minute and 500 requests per minute

### To avoid the request per day limit we use this function to create bathes merging datasets while computing tokens to ensure to remain under the rate limits

In [7]:
enc = tiktoken.get_encoding("o200k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

enc = tiktoken.encoding_for_model("gpt-4o-mini")
def merge_tweets(df):
    prompts = []
    current_batch = []
    current_text = ""
    n_tokens = 0
    
    for index, row in df.iterrows():
        tweet = row['Content']
        tweet_text = str(index) + ": " + tweet + "\n"
        tweet_tokens = len(enc.encode(tweet_text))
        
        if n_tokens + tweet_tokens > 29000:
            prompts.append({
                'text': current_text,
                'indices': current_batch
            })
            current_text = tweet_text
            current_batch = [index]
            n_tokens = tweet_tokens
        else:
            current_text += tweet_text
            current_batch.append(index)
            n_tokens += tweet_tokens
    
    if current_text:
        prompts.append({
            'text': current_text,
            'indices': current_batch
        })
        
    return prompts

### Now we can classify the tweets

In [8]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
def classify_tweet_batches(prompts):
    all_results = {}
    
    for prompt_data in prompts:
        prompt_text = prompt_data['text']
        indices = prompt_data['indices']  # Now we'll use this
        
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": """
                You are an expert Italian political content analyst. I will give you a batch of tweets from Italian politicians.
                For EACH tweet, determine if it contains significant political content or is just promotional content.
                
                Significant tweets include:
                - Policy positions or proposals
                - Political criticism of opponents
                - Commentary on current events
                - Substantive discussions of issues
                
                Non-significant (broadcast) tweets include:
                - TV/radio appearance announcements
                - Live stream announcements
                - Schedule announcements
                - Simple greetings without political substance
                
                Respond with ONLY the tweet index number followed by either SIGNIFICANT or BROADCAST, one per line.
                Example format:
                123: SIGNIFICANT
                124: BROADCAST
                125: SIGNIFICANT
                """},
                {"role": "user", "content": f"Tweets:\n{prompt_text}"}
            ],
            temperature=0.0,
            max_tokens=3000
        )
        
        result = response.choices[0].message.content.strip()
        
        result_lines = result.split('\n')
        
        for i, idx in enumerate(indices):
            result_found = False
            for line in result_lines:
                if line.startswith(f"{idx}:") or line.startswith(f"{idx} :"):
                    parts = line.split(':', 1)
                    classification = "SIGNIFICANT" if "SIGNIFICANT" in parts[1].upper() else "BROADCAST"
                    all_results[idx] = classification
                    result_found = True
                    break
            
            if not result_found:
                all_results[idx] = "UNKNOWN"
    
    return all_results

In [9]:
politicians_merged= merge_tweets(politicians)

In [None]:
tweet_tokens = 0
results = {}
progress_bar = tqdm(politicians_merged, desc="Processing tweets", unit="batch")
print("Starting classification...")
for tweets in progress_bar:
    tweet_tokens += len(enc.encode(tweets['text']))
    
    batch_results = classify_tweet_batches([tweets])
    results.update(batch_results)
    
    progress_bar.set_postfix(tokens_used=tweet_tokens, results_collected=len(results))
    
    if tweet_tokens > 200000:
        print("Rate limit reached. Waiting for 60 seconds...")
        time.sleep(60)
        tweet_tokens = 0
print("Classification completed.")
politicians['classification'] = results.values()

In [22]:
politicians['classification'] = politicians['classification'].replace({'SIGNIFICANT': 1, 'BROADCAST': 0})
politicians = politicians[politicians['classification'].isin([1, 'UNKNOWN'])]
politicians.to_csv('politicians_data/politicians_classified.csv')