In [2]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from tqdm import tqdm
from transformers.tokenization_utils_base import BatchEncoding

# Checkpoint shared by the tokenizer and the sequence-classification model
MODEL_NAME = "classla/bcms-bertic-parlasent-bcs-ter"

# Read the speeches to be classified
file_path = '../data/eu_references_rs.csv'
data = pd.read_csv(file_path)

# Load the tokenizer / model pair from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)



In [4]:
def chunk_text(text, max_token_length=512):
    """Tokenize ``text`` and return model inputs padded to ``max_token_length``.

    The text is first encoded without truncation by the module-level
    ``tokenizer``. If the encoding is longer than ``max_token_length`` it is cut
    into consecutive slices of at most ``max_token_length`` ids; each slice is
    right-padded with ``tokenizer.pad_token_id`` and receives a matching
    attention mask. Otherwise the text is encoded again with padding to
    ``max_token_length`` and the tokenizer's own output is returned.

    Args:
        text: The string to encode.
        max_token_length: Length of every returned sequence.

    Returns:
        A mapping with at least ``"input_ids"`` and ``"attention_mask"``, each a
        list with one entry per chunk (a single entry for short texts).
    """
    raw_tokens = tokenizer([text], truncation=False, padding=False)["input_ids"][0]
    token_length = len(raw_tokens)

    if token_length <= max_token_length:
        # Short enough for one sequence: let the tokenizer pad it itself
        return tokenizer([text], truncation=True, max_length=max_token_length, padding='max_length')

    # Step through the ids in strides of max_token_length. Using range() over the
    # real length (instead of quotient + 1 slices) avoids emitting an empty,
    # all-padding chunk when the length is an exact multiple of max_token_length.
    # NOTE(review): the slices are cut from one already-encoded sequence, so any
    # special tokens the tokenizer adds are not re-inserted per chunk - confirm
    # this is acceptable for the model.
    token_batches = [
        raw_tokens[start:start + max_token_length]
        for start in range(0, token_length, max_token_length)
    ]

    pad_id = tokenizer.pad_token_id
    return {
        "input_ids": [
            chunk + [pad_id] * (max_token_length - len(chunk))
            for chunk in token_batches
        ],
        "attention_mask": [
            [1] * len(chunk) + [0] * (max_token_length - len(chunk))
            for chunk in token_batches
        ],
    }


In [5]:
# Accumulators for the model inputs; chunk_text may return several rows per speech
input_ids_list, attention_mask_list = [], []

# Chunk every speech and append all of its rows to the accumulators
for speech in tqdm(data['speech']):
    tokens = chunk_text(speech)
    input_ids_list += tokens['input_ids']
    attention_mask_list += tokens['attention_mask']

# Column-oriented mapping to hand to Dataset.from_dict()
tokenized_data = dict(
    input_ids=input_ids_list,
    attention_mask=attention_mask_list,
)


  0%|          | 0/3485 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1340 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 3485/3485 [00:14<00:00, 239.44it/s]


In [6]:
# Directory handed to TrainingArguments; created up front if it does not exist
output_dir = os.path.expanduser('~/transformers_output')
os.makedirs(output_dir, exist_ok=True)

# Wrap the tokenized rows in a Dataset and expose both columns as torch tensors
pred_dataset = Dataset.from_dict(tokenized_data)
pred_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Only the evaluation batch size is configured here
training_args = TrainingArguments(output_dir=output_dir, per_device_eval_batch_size=8)

# Trainer wrapping the loaded model with the arguments above
trainer = Trainer(model=model, args=training_args)



In [7]:
# Turn raw prediction output into one class index per row
def get_prediction_labels(predictions):
    """Return the most probable class index for every prediction row.

    Args:
        predictions: Object whose ``predictions`` attribute holds the logits
            (anything ``torch.tensor`` accepts).

    Returns:
        A tensor of class indices, taken over the last dimension.
    """
    logits = torch.tensor(predictions.predictions)
    return torch.softmax(logits, dim=-1).argmax(dim=-1)



In [8]:
# Make predictions: the output has one row of logits per row of pred_dataset
predictions = trainer.predict(pred_dataset)

# pred_dataset holds one row per *chunk*, not per speech: chunk_text splits long
# speeches into several rows. Slicing the chunk labels to len(data) would attach
# labels to the wrong speeches, so recompute how many chunks each speech produced
# and fold the chunk-level outputs back onto speeches. (This re-tokenizes the
# speeches, which is cheap compared with the prediction step.)
chunk_counts = [len(chunk_text(speech)['input_ids']) for speech in data['speech']]
if sum(chunk_counts) != len(predictions.predictions):
    raise ValueError(
        f"Chunk count mismatch: speeches produce {sum(chunk_counts)} chunks but "
        f"{len(predictions.predictions)} predictions were returned"
    )

# Average the class probabilities over each speech's chunks, then pick the most
# probable class for the speech as a whole
chunk_probs = torch.softmax(torch.tensor(predictions.predictions), dim=-1)
speech_probs = torch.stack(
    [probs.mean(dim=0) for probs in torch.split(chunk_probs, chunk_counts)]
)
prediction_labels = speech_probs.argmax(dim=-1).tolist()

# Assign prediction labels back to the speeches (now exactly one label per speech)
data['sentiment_score'] = prediction_labels

# Map numeric labels to sentiment strings
sentiment_map = {0: 'NEGATIVE', 1: 'NEUTRAL', 2: 'POSITIVE'}
data['sentiment'] = data['sentiment_score'].map(sentiment_map)


  0%|          | 0/1525 [00:00<?, ?it/s]

In [31]:
# Minutes table that receives the columns of `data`
# NOTE: this cell reads and then overwrites the same CSV file
rs_minutes_tks = pd.read_csv('../data/rs_minutes_tks.csv')

# Merge data into rs_minutes_tks on speech. Keep a single row per distinct speech
# text on the right-hand side so repeated speeches in `data` cannot multiply rows.
rs_minutes = rs_minutes_tks.merge(
    data.drop_duplicates(subset='speech'),
    on='speech',
    suffixes=('', '_data'),
)
# Drop the right-hand copies of overlapping columns, i.e. names that END with
# "_data" (the pattern is anchored so a name merely containing "_data" is kept)
rs_minutes = rs_minutes.drop(columns=list(rs_minutes.filter(regex='_data$').columns))

rs_minutes.to_csv('../data/rs_minutes_tks.csv', index=False)

In [23]:
# Reload the minutes table from the CSV that the merge cell writes;
# the cells below read its 'Date', 'Speaker_party' and 'sentiment' columns
rs_minutes = pd.read_csv('../data/rs_minutes_tks.csv')

In [34]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Assuming rs_minutes is already defined and contains the necessary data

# Convert the Date column to datetime format
rs_minutes['Date'] = pd.to_datetime(rs_minutes['Date'])

# Update party names for LDP, SDS, Nova, and LSV to PRS
parties_to_replace = ['LDP', 'SDS', 'Nova', 'LSV']
rs_minutes['Speaker_party'] = rs_minutes['Speaker_party'].replace(parties_to_replace, 'PRS')

# Filter to include only the specified parties. The .copy() makes the result an
# independent frame, so adding columns below does not write to a slice.
# NOTE: rs_minutes is rebound to the filtered frame, so cells run after this one
# only see these parties.
parties_of_interest = ['SNS', 'DS', 'SRS', 'SPS', 'DSS', 'PRS']
rs_minutes = rs_minutes[rs_minutes['Speaker_party'].isin(parties_of_interest)].copy()

# Create a month-year column for grouping
rs_minutes['MonthYear'] = rs_minutes['Date'].dt.to_period('M')

# Filter positive and negative speeches
positive_speeches = rs_minutes[rs_minutes['sentiment'] == 'POSITIVE']
negative_speeches = rs_minutes[rs_minutes['sentiment'] == 'NEGATIVE']

# Group by MonthYear and Speaker_party and count the number of speeches
positive_counts = positive_speeches.groupby(['MonthYear', 'Speaker_party']).size().reset_index(name='Positive Speeches')
negative_counts = negative_speeches.groupby(['MonthYear', 'Speaker_party']).size().reset_index(name='Negative Speeches')

# Get unique months and parties. Months are sorted chronologically because the
# traces are drawn as lines in row order; unsorted months would make them zigzag.
months = rs_minutes['MonthYear'].dropna().sort_values().unique()
parties = rs_minutes['Speaker_party'].unique()

# Create a complete grid of MonthYear and Speaker_party
complete_grid = pd.MultiIndex.from_product([months, parties], names=['MonthYear', 'Speaker_party']).to_frame(index=False)

# Merge this complete grid with positive and negative counts, filling NaNs with 0
positive_counts = complete_grid.merge(positive_counts, on=['MonthYear', 'Speaker_party'], how='left').fillna(0)
negative_counts = complete_grid.merge(negative_counts, on=['MonthYear', 'Speaker_party'], how='left').fillna(0)

# The left merge introduces NaNs, which turns the counts into floats; cast back to int
positive_counts['Positive Speeches'] = positive_counts['Positive Speeches'].astype(int)
negative_counts['Negative Speeches'] = negative_counts['Negative Speeches'].astype(int)

# Convert the periods to timestamps for plotting
positive_counts['MonthYear'] = positive_counts['MonthYear'].dt.to_timestamp()
negative_counts['MonthYear'] = negative_counts['MonthYear'].dt.to_timestamp()

# Define party colors
party_colors = {
    'SNS': 'blue',
    'SPS': 'red',
    'SRS': 'purple',
    'DS': 'darkgoldenrod',
    'DSS': 'green',
    'PRS': 'orange'  # Update color for PRS
}

max_positive_y = positive_counts['Positive Speeches'].max()
max_negative_y = negative_counts['Negative Speeches'].max()
max_y_value = max(max_positive_y, max_negative_y)

# Both panels share one y-range whose upper bound is at least MIN_YAXIS_MAX
MIN_YAXIS_MAX = 29
yaxis_max = max(max_y_value, MIN_YAXIS_MAX)

# Create subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=('Positive Speeches', 'Negative Speeches'))

# One line per party in each panel. Legend entries come from the left panel only;
# the shared legendgroup makes a legend click toggle the party in both panels.
panels = [
    (positive_counts, 'Positive Speeches', 1, True),
    (negative_counts, 'Negative Speeches', 2, False),
]
for counts, value_column, panel_col, show_in_legend in panels:
    for party in parties_of_interest:
        party_data = counts[counts['Speaker_party'] == party]
        fig.add_trace(go.Scatter(x=party_data['MonthYear'], y=party_data[value_column],
                                 mode='lines', name=party, line=dict(color=party_colors[party]),
                                 legendgroup=party, showlegend=show_in_legend),
                      row=1, col=panel_col)

# Update layout
fig.update_layout(title='Positive and Negative Speeches per Party by Month-Year',
                  showlegend=True,
                  height=600, width=1200)

# Apply axis titles and the shared y-range to BOTH panels (layout-level
# xaxis_title / yaxis_title would only reach the first subplot)
fig.update_xaxes(title_text='Year', nticks=20)
fig.update_yaxes(title_text='Number of Speeches', range=[0, yaxis_max])

fig.show()


In [8]:
from statsmodels.nonparametric.smoothers_lowess import lowess

# Calculate net sentiment scores
rs_minutes['Net Sentiment'] = rs_minutes['sentiment'].map({'NEGATIVE': -1, 'NEUTRAL': 0, 'POSITIVE': 1})

# Build the month key here rather than relying on a column created by another
# cell; both conversions are no-ops when they were already applied
rs_minutes['Date'] = pd.to_datetime(rs_minutes['Date'])
rs_minutes['MonthYear'] = rs_minutes['Date'].dt.to_period('M')

# Group by MonthYear and calculate net sentiment
net_sentiment = rs_minutes.groupby(['MonthYear'])['Net Sentiment'].mean().reset_index()

# Convert MonthYear back to datetime for plotting
net_sentiment['MonthYear'] = net_sentiment['MonthYear'].dt.to_timestamp()

# Create a new figure for net sentiment
fig2 = go.Figure()

# Add trace for net sentiment
fig2.add_trace(go.Scatter(x=net_sentiment['MonthYear'], y=net_sentiment['Net Sentiment'], 
                          mode='lines', name='Net Sentiment',
                          line=dict(color='blue', dash='dash')))

# Apply LOWESS smoothing. The x values are passed as day ordinals (plain numbers)
# instead of datetimes, so the smoother does not depend on an implicit
# datetime-to-float cast; this matches the other LOWESS cells in this notebook.
lowess_smoothed = lowess(net_sentiment['Net Sentiment'], net_sentiment['MonthYear'].map(pd.Timestamp.toordinal), frac=0.1)
lowess_x = pd.to_datetime([pd.Timestamp.fromordinal(int(d)) for d in lowess_smoothed[:, 0]])
lowess_y = lowess_smoothed[:, 1]

# Add trace for smoothed trend
fig2.add_trace(go.Scatter(x=lowess_x, y=lowess_y, 
                          mode='lines', name='Smoothed Trend',
                          line=dict(color='red', width=3)))

# Update layout for fig2
fig2.update_layout(title='Net Sentiment over Time', 
                   xaxis_title='Year', 
                   yaxis_title='Net Sentiment Score',
                   showlegend=True,
                   legend=dict(
                       x=0.5,  
                       y=-0.2,  
                       xanchor='center', 
                       yanchor='middle', 
                       orientation='h'  
                   ),
                   height=600, width=1000)

fig2.update_xaxes(nticks=20)

fig2.show()

In [32]:
import pandas as pd
import plotly.graph_objects as go
from statsmodels.nonparametric.smoothers_lowess import lowess


# Parse dates and derive the monthly period used as the grouping key
rs_minutes['Date'] = pd.to_datetime(rs_minutes['Date'])
rs_minutes['MonthYear'] = rs_minutes['Date'].dt.to_period('M')

# Rows per month, with the period turned back into a timestamp for plotting
eu_speech_count = (
    rs_minutes
    .groupby(['MonthYear'])
    .size()
    .reset_index(name='EU Speech Count')
)
eu_speech_count['MonthYear'] = eu_speech_count['MonthYear'].dt.to_timestamp()

# Figure with the raw monthly counts
fig4 = go.Figure()
fig4.add_trace(
    go.Scatter(
        x=eu_speech_count['MonthYear'],
        y=eu_speech_count['EU Speech Count'],
        mode='lines+markers',
        name='Number of EU Speeches',
        line={'color': 'blue'},
        marker={'size': 6, 'color': 'blue'},
    )
)

# LOWESS trend: smooth over day ordinals, then map the ordinals back to dates
month_ordinals = eu_speech_count['MonthYear'].map(pd.Timestamp.toordinal)
lowess_smoothed = lowess(eu_speech_count['EU Speech Count'], month_ordinals, frac=0.1)
lowess_x = pd.to_datetime([pd.Timestamp.fromordinal(int(d)) for d in lowess_smoothed[:, 0]])
lowess_y = lowess_smoothed[:, 1]

fig4.add_trace(
    go.Scatter(
        x=lowess_x,
        y=lowess_y,
        mode='lines',
        name='Smoothed Trend',
        line={'color': 'red', 'width': 3},
    )
)

# Fixed x-axis window and one labelled tick per year
tick_years = range(2011, 2024)
xaxis_range = [pd.to_datetime('2010-11-30'), pd.to_datetime('2022-08-01')]
xaxis_ticks = [pd.to_datetime(f'{year}-01-01') for year in tick_years]

title_cfg = {
    'text': 'Monthly Number of EU References in the Serbian Parliament (2011-2023)',
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
xaxis_cfg = {
    'title': 'Year',
    'range': xaxis_range,
    'tickmode': 'array',
    'tickvals': xaxis_ticks,
    'ticktext': [str(year) for year in tick_years],
}
# Horizontal legend centred below the plot area
legend_cfg = {
    'x': 0.5,
    'y': -0.2,
    'xanchor': 'center',
    'yanchor': 'middle',
    'orientation': 'h',
}

fig4.update_layout(
    title=title_cfg,
    xaxis=xaxis_cfg,
    yaxis_title='Number of EU Speeches',
    showlegend=True,
    legend=legend_cfg,
    height=600,
    width=1000,
)

fig4.update_xaxes(nticks=20)

fig4.show()


In [37]:
file_path_all = '../data/pre-processed_rs.csv'
file_path_eu = '../data/rs_minutes_tks.csv'

# Speeches must be longer than this many characters to count towards the total
MIN_SPEECH_LENGTH = 100

rs_minutes_all = pd.read_csv(file_path_all)
# .str.len() yields NaN for a missing speech instead of raising the way len()
# does on NaN; NaN > MIN_SPEECH_LENGTH is False, so such rows are filtered out
rs_minutes_all['SpeechLength'] = rs_minutes_all['speech'].str.len()
# .copy() so the column assignments below write to an independent frame, not a slice
rs_minutes_all = rs_minutes_all[rs_minutes_all['SpeechLength'] > MIN_SPEECH_LENGTH].copy()
rs_minutes_eu = pd.read_csv(file_path_eu)

# Convert the Date columns to datetime format
rs_minutes_all['Date'] = pd.to_datetime(rs_minutes_all['Date'])
rs_minutes_eu['Date'] = pd.to_datetime(rs_minutes_eu['Date'])

# Create MonthYear columns for grouping
rs_minutes_all['MonthYear'] = rs_minutes_all['Date'].dt.to_period('M')
rs_minutes_eu['MonthYear'] = rs_minutes_eu['Date'].dt.to_period('M')

# Count the number of total speeches and EU-specific speeches per month
total_speech_count = rs_minutes_all.groupby(['MonthYear']).size().reset_index(name='Total Speech Count')
eu_speech_count = rs_minutes_eu.groupby(['MonthYear']).size().reset_index(name='EU Speech Count')

# Merge the counts based on MonthYear; months without EU speeches get a count of 0
speech_counts = pd.merge(total_speech_count, eu_speech_count, on='MonthYear', how='left').fillna(0)

# Convert MonthYear back to datetime for plotting
speech_counts['MonthYear'] = speech_counts['MonthYear'].dt.to_timestamp()

# Calculate the proportion of EU-specific speeches to total speeches
speech_counts['EU Speech Proportion'] = speech_counts['EU Speech Count'] / speech_counts['Total Speech Count']

# Create a new figure for the proportion of EU-specific speeches
fig3 = go.Figure()

# Add trace for the proportion of EU-specific speeches
fig3.add_trace(go.Scatter(x=speech_counts['MonthYear'], y=speech_counts['EU Speech Proportion'], 
                          mode='lines', name='Proportion of EU Speeches',
                          line=dict(color='blue', dash='dash')))

# Apply LOWESS smoothing over day ordinals, then map the ordinals back to dates
lowess_smoothed = lowess(speech_counts['EU Speech Proportion'], speech_counts['MonthYear'].map(pd.Timestamp.toordinal), frac=0.1)
lowess_x = pd.to_datetime([pd.Timestamp.fromordinal(int(d)) for d in lowess_smoothed[:, 0]])
lowess_y = lowess_smoothed[:, 1]

# Add trace for the smoothed trend
fig3.add_trace(go.Scatter(x=lowess_x, y=lowess_y, 
                          mode='lines', name='Smoothed Trend',
                          line=dict(color='red', width=3)))

# Define x-axis range and ticks
xaxis_range = [pd.to_datetime('2010-10-01'), pd.to_datetime('2022-08-30')]
xaxis_ticks = [pd.to_datetime(f'{year}-01-01') for year in range(2011, 2023)]

# Update layout for fig3
fig3.update_layout(
    title={
        'text': 'Monthly Proportions of EU References Relative to Total Speeches in the Serbian National Assembly (2011-2022)', 
        'x': 0.5, 
        'xanchor': 'center', 
        'yanchor': 'top'
    }, 
    xaxis=dict(
        range=xaxis_range,
        tickmode='array',
        tickvals=xaxis_ticks,
        ticktext=[str(year) for year in range(2011, 2023)]
    ),
    yaxis_title='Proportion of EU Speeches',
    showlegend=False,
    height=600, width=1000
)

fig3.update_xaxes(nticks=20)

fig3.show()


In [25]:

# Create a year column for grouping. Date is parsed here so the cell also works
# when rs_minutes was just loaded from CSV and Date is still a string column.
rs_minutes['Year'] = pd.to_datetime(rs_minutes['Date']).dt.year

# Group by Year and sentiment, and count the number of speeches
sentiment_counts = rs_minutes.groupby(['Year', 'sentiment']).size().reset_index(name='Count')

# Define sentiment colors
sentiment_colors = {
    'POSITIVE': 'green',
    'NEUTRAL': 'blue',
    'NEGATIVE': 'red'
}

# Stacking order of the bars (first entry is drawn at the bottom)
sentiment_order = ['POSITIVE', 'NEUTRAL', 'NEGATIVE']

# Pivot the data to have sentiments as columns. reindex() guarantees that all
# three sentiment columns exist even if one class never occurs in the data,
# so the plotting loop below cannot hit a KeyError.
sentiment_pivot = (
    sentiment_counts
    .pivot(index='Year', columns='sentiment', values='Count')
    .reindex(columns=sentiment_order)
    .fillna(0)
    .reset_index()
)

# Create a stacked bar chart
fig5 = go.Figure()

# Add bars for each sentiment
for sentiment in sentiment_order:
    fig5.add_trace(go.Bar(
        x=sentiment_pivot['Year'],
        y=sentiment_pivot[sentiment],
        name=sentiment.capitalize(),
        marker_color=sentiment_colors[sentiment]
    ))

# Update layout
fig5.update_layout(
    barmode='stack',
    xaxis_title='Year',
    yaxis_title='Number of Speeches',
    showlegend=True,
    legend=dict(
        x=0.5,
        y=-0.2,
        xanchor='center',
        yanchor='middle',
        orientation='h'
    ),
    height=600,
    width=1000
)

fig5.show()
