In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np


In [3]:

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")

def chunk_text(text, tokenizer, max_token_length=512):
    tokenized_text = tokenizer(text, truncation=False, padding=False, return_tensors='pt')
    raw_tokens = tokenized_text["input_ids"].squeeze().tolist()
    token_length = len(raw_tokens)
    
    if token_length > max_token_length:
        chunks = [raw_tokens[i:i + max_token_length] for i in range(0, len(raw_tokens), max_token_length)]
        input_ids = [chunk + [tokenizer.pad_token_id] * (max_token_length - len(chunk)) for chunk in chunks]
        attention_mask = [[1] * len(chunk) + [0] * (max_token_length - len(chunk)) for chunk in chunks]
    else:
        input_ids = tokenized_text["input_ids"].squeeze().tolist()
        attention_mask = tokenized_text["attention_mask"].squeeze().tolist()
        input_ids = [input_ids + [tokenizer.pad_token_id] * (max_token_length - len(input_ids))]
        attention_mask = [attention_mask + [0] * (max_token_length - len(attention_mask))]
    
    return {"input_ids": input_ids, "attention_mask": attention_mask}

class TextDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return {
            'input_ids': torch.tensor(row['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(row['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(row['labels'], dtype=torch.long)
        }

def train_model(train_loader, model, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def evaluate_model(eval_loader, model, device):
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    return accuracy_score(true_labels, predictions), classification_report(true_labels, predictions)

# Load your data
labeled_mins = pd.read_csv('../data/to_label.csv', delimiter=';')

# Tokenize and chunk the texts
expanded_data = []
for idx, row in labeled_mins.iterrows():
    tokenized_chunks = chunk_text(row['speech'], tokenizer)
    for input_ids, attention_mask in zip(tokenized_chunks['input_ids'], tokenized_chunks['attention_mask']):
        expanded_data.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': row['Label']  # Ensure 'Label' is the correct column name for labels
        })

# Create a DataFrame from the expanded data
expanded_df = pd.DataFrame(expanded_data)

# Ensure that 'labels' is integer
expanded_df['labels'] = expanded_df['labels'].astype(int)

# Split data into train and validation sets
train_df, val_df = train_test_split(expanded_df, test_size=0.2, stratify=expanded_df['labels'])

# Create DataLoaders
train_dataset = TextDataset(train_df)
val_dataset = TextDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Initialize model, optimizer, and scheduler
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-uncased", num_labels=len(expanded_df['labels'].unique()))
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * 3  # Number of training steps (3 epochs here)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Train the model
for epoch in range(3):  # Number of epochs
    print(f"Epoch {epoch+1}/{3}")
    train_loss = train_model(train_loader, model, optimizer, scheduler, device)
    print(f"Training loss: {train_loss}")

# Evaluate the model
accuracy, report = evaluate_model(val_loader, model, device)
print(f"Validation Accuracy: {accuracy}")
print("Classification Report:")
print(report)


Token indices sequence length is longer than the specified maximum sequence length for this model (886 > 512). Running this sequence through the model will result in indexing errors
Some weights of the model checkpoint at dbmdz/bert-base-turkish-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that

Epoch 1/3
Training loss: 0.7708279146986493
Epoch 2/3
Training loss: 0.4638282998259795
Epoch 3/3
Training loss: 0.24813800955475387
Validation Accuracy: 0.729957805907173
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.56      0.61        77
           1       0.74      0.86      0.80       138
           2       0.85      0.50      0.63        22

    accuracy                           0.73       237
   macro avg       0.76      0.64      0.68       237
weighted avg       0.73      0.73      0.72       237



In [4]:
model_path = "bert_trained_model.pt"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to bert_trained_model.pt


In [5]:
# Function to load the saved model
def load_model(model_path, device):
    model = BertForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-uncased", num_labels=len(expanded_df['labels'].unique()))
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()
    return model

# Load the saved model
model_path = "bert_trained_model.pt"
model = load_model(model_path, device)


Some weights of the model checkpoint at dbmdz/bert-base-turkish-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [None]:
from collections import defaultdict, Counter
from transformers import BertTokenizer, BertForSequenceClassification

# Function to predict on unseen data
def predict_on_unseen_data(model, tokenizer, sample_texts):
    model.eval()
    predictions = []
    indices = []
    
    with torch.no_grad():
        for idx, text in enumerate(sample_texts):
            tokenized_chunks = chunk_text(text, tokenizer)
            for input_ids, attention_mask in zip(tokenized_chunks['input_ids'], tokenized_chunks['attention_mask']):
                inputs = {
                    'input_ids': torch.tensor([input_ids], dtype=torch.long).to(device),
                    'attention_mask': torch.tensor([attention_mask], dtype=torch.long).to(device)
                }
                outputs = model(**inputs)
                logits = outputs.logits
                preds = torch.argmax(logits, dim=-1).cpu().numpy()[0]
                predictions.append(preds)
                indices.append(idx)
                
    return predictions, indices

# Majority voting function
def majority_vote(preds):
    count = Counter(preds)
    most_common = count.most_common(1)[0][0]
    return most_common

# Load the dataset
tr_minutes = pd.read_csv('../data/tr_minutes_tks.csv')

# Extract the sample texts
sample_texts = tr_minutes['speech'].tolist()

# Make predictions with indexing
predictions, indices = predict_on_unseen_data(model, tokenizer, sample_texts)

# Aggregate predictions using majority voting
score_dict = defaultdict(list)
for pred, idx in zip(predictions, indices):
    score_dict[idx].append(pred)


In [15]:
sentiment_map = {
    0: 'NEGATIVE',
    1: 'NEUTRAL',
    2: 'POSITIVE'
}
# Compute final predictions using majority voting
final_scores = []
final_labels = []

for idx in range(len(sample_texts)):
    if idx in score_dict:
        chunk_preds = score_dict[idx]
        final_score = majority_vote(chunk_preds)
        final_scores.append(final_score)
        final_labels.append(sentiment_map[final_score])
    else:
        final_scores.append(0)  # Default value if no prediction (should not happen)
        final_labels.append('NEUTRAL')  # Default value

# Ensure the length matches
assert len(final_scores) == len(tr_minutes)

# Add predictions to the DataFrame
tr_minutes['sentiment_score'] = final_scores
tr_minutes['sentiment'] = final_labels

Unnamed: 0,Speaker_name,Speaker_role,Speaker_MP,Speaker_party,Party_status,Party_orientation,Speaker_gender,Date,speech,econ_counts,gov_counts,id_counts,sec_counts,sentiment_score,sentiment
6,Kemal Kılıçdaroğlu,Regular,MP,CHP,Opposition,Centre-left,M,2011-07-11,"Değerli milletvekilleri, hepinizi saygıyla sel...",4,15,35,36,0,NEGATIVE
14,İzzet Çetin,Regular,MP,CHP,Opposition,Centre-left,M,2011-07-15,Hiçbir cümlem yalan değil. Değerli arkadaşları...,1,2,4,7,0,NEGATIVE
16,Faruk Çelik,Regular,MP,AKP,Coalition,Right,M,2011-07-15,İhanet ettiğimiz kesimler… Onlar bize bu oyu v...,6,0,0,2,0,NEGATIVE
31,Ferit Mevlüt Aslanoğlu,Regular,MP,CHP,Opposition,Centre-left,M,2011-10-27,"Ben, tüm Parlamentoya hitap ediyorum, yeni gel...",4,1,27,12,0,NEGATIVE
39,Ahmet Toptaş,Regular,MP,CHP,Opposition,Centre-left,M,2011-11-02,“Onların Schengen’i varsa bizim de Şamgenimiz ...,0,0,1,3,0,NEGATIVE


In [16]:
# Save the DataFrame with predictions
tr_minutes.to_csv('../data/tr_minutes_tks.csv', index=False)

In [7]:
import pandas as pd
tr_minutes = pd.read_csv('../data/tr_minutes_tks.csv')

In [16]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Convert the Date column to datetime format
tr_minutes['Date'] = pd.to_datetime(tr_minutes['Date'])

# Replace BDP, BDP;DBP, DBP with HDP in Speaker_party
tr_minutes['Speaker_party'] = tr_minutes['Speaker_party'].replace(
    {"BDP": "HDP", "BDP;DBP": "HDP", "DBP": "HDP"}
)

# Filter to include only the specified parties
parties_of_interest = ['AKP', 'HDP', 'IYI', 'MHP', 'CHP']
tr_minutes = tr_minutes[tr_minutes['Speaker_party'].isin(parties_of_interest)]

# Filter the data for IYI party starting from 2017
tr_minutes = tr_minutes[~((tr_minutes['Speaker_party'] == 'IYI') & (tr_minutes['Date'].dt.year < 2017))]

# Create a month-year column for grouping
tr_minutes['MonthYear'] = tr_minutes['Date'].dt.to_period('M')

# Filter positive and negative speeches
positive_speeches = tr_minutes[tr_minutes['sentiment'] == 'POSITIVE']
negative_speeches = tr_minutes[tr_minutes['sentiment'] == 'NEGATIVE']

# Group by MonthYear and Speaker_party and count the number of speeches
positive_counts = positive_speeches.groupby(['MonthYear', 'Speaker_party']).size().reset_index(name='Positive Speeches')
negative_counts = negative_speeches.groupby(['MonthYear', 'Speaker_party']).size().reset_index(name='Negative Speeches')

# Get unique months and parties
months = tr_minutes['MonthYear'].unique()
parties = tr_minutes['Speaker_party'].unique()

# Create a complete grid of MonthYear and Speaker_party
complete_grid = pd.MultiIndex.from_product([months, parties], names=['MonthYear', 'Speaker_party']).to_frame(index=False)

# Merge this complete grid with positive and negative counts, filling NaNs with 0
positive_counts = complete_grid.merge(positive_counts, on=['MonthYear', 'Speaker_party'], how='left').fillna(0)
negative_counts = complete_grid.merge(negative_counts, on=['MonthYear', 'Speaker_party'], how='left').fillna(0)

# Ensure the 'Positive Speeches' and 'Negative Speeches' columns are integers
positive_counts['Positive Speeches'] = positive_counts['Positive Speeches'].astype(int)
negative_counts['Negative Speeches'] = negative_counts['Negative Speeches'].astype(int)

positive_counts['MonthYear'] = positive_counts['MonthYear'].dt.to_timestamp()
negative_counts['MonthYear'] = negative_counts['MonthYear'].dt.to_timestamp()

# Define party colors
party_colors = {
    'AKP': 'blue',
    'HDP': 'green',
    'IYI': 'orange',
    'MHP': 'red',
    'CHP': 'darkred'
}

# Calculate the maximum y-axis value
yaxis_max = max(positive_counts['Positive Speeches'].max(), negative_counts['Negative Speeches'].max()) + 2

# Create subplots
fig = make_subplots(rows=1, cols=2, subplot_titles=('Positive Speeches', 'Negative Speeches'))

# Add positive speeches plot
for party in parties_of_interest:
    party_data = positive_counts[positive_counts['Speaker_party'] == party]
    fig.add_trace(go.Scatter(x=party_data['MonthYear'], y=party_data['Positive Speeches'], 
                             mode='lines', name=party, line=dict(color=party_colors[party]), legendgroup=party),
                  row=1, col=1)

# Add negative speeches plot
for party in parties_of_interest:
    party_data = negative_counts[negative_counts['Speaker_party'] == party]
    fig.add_trace(go.Scatter(x=party_data['MonthYear'], y=party_data['Negative Speeches'], 
                             mode='lines', name=party, line=dict(color=party_colors[party]), legendgroup=party, showlegend=False),
                  row=1, col=2)

# Update layout
fig.update_layout(
    title={
        'text': 'Positive and Negative Speeches per Party by Month-Year', 
        'x': 0.5, 
        'xanchor': 'center'
    },
    xaxis_title={
        'text': 'Year',
        'standoff': 10,  # Adjusts vertical distance from the plot
    },
    yaxis_title='Number of Speeches',
    showlegend=True,
    height=600, 
    width=1200,
    yaxis=dict(range=[0, yaxis_max]),  # Apply range to the first subplot's y-axis
    yaxis2=dict(range=[0, yaxis_max]),  # Apply range to the second subplot's y-axis
)

# Center the x-axis title for both subplots
fig.update_xaxes(
    nticks=20,
    title_text='Year',
    title_standoff=20,  # Adjust this for vertical space
    title_font=dict(size=14),
    row=1, col=1  # Apply to the first subplot
)

fig.update_xaxes(
    nticks=20,
    title_text='Year',
    title_standoff=20,  # Apply same standoff for consistency
    title_font=dict(size=14),
    row=1, col=2  # Apply to the second subplot
)

fig.show()



In [15]:

# Create a year column for grouping
tr_minutes['Year'] = tr_minutes['Date'].dt.year

# Group by Year and sentiment, and count the number of speeches
sentiment_counts = tr_minutes.groupby(['Year', 'sentiment']).size().reset_index(name='Count')

# Pivot the data to have sentiments as columns
sentiment_pivot = sentiment_counts.pivot(index='Year', columns='sentiment', values='Count').fillna(0).reset_index()

# Define sentiment colors
sentiment_colors = {
    'POSITIVE': 'green',
    'NEUTRAL': 'blue',
    'NEGATIVE': 'red'
}

# Create a stacked bar chart
fig5 = go.Figure()

# Add bars for each sentiment
for sentiment in ['POSITIVE', 'NEUTRAL', 'NEGATIVE']:
    fig5.add_trace(go.Bar(
        x=sentiment_pivot['Year'],
        y=sentiment_pivot[sentiment],
        name=sentiment.capitalize(),
        marker_color=sentiment_colors[sentiment]
    ))

fig5.update_layout(
    barmode='stack',
    title={
        'text': 'Türkiye', 
        'x': 0.5, 
        'xanchor': 'center', 
        'yanchor': 'top'
    }, 
    xaxis_title='Year',  # Simple title text without dict for natural centering
    yaxis_title='Number of Speeches',
    showlegend=True,
    legend=dict(
        x=0.5,
        y=-0.2,
        xanchor='center',
        yanchor='middle',
        orientation='h'
    ),
    height=600,
    width=1000
)

fig5.show()



In [17]:
import pandas as pd
import plotly.graph_objects as go
from statsmodels.nonparametric.smoothers_lowess import lowess

# Calculate net sentiment scores
tr_minutes['Net Sentiment'] = tr_minutes['sentiment'].map({'NEGATIVE': -1, 'NEUTRAL': 0, 'POSITIVE': 1})

# Group by MonthYear and calculate net sentiment
net_sentiment = tr_minutes.groupby(['MonthYear'])['Net Sentiment'].mean().reset_index()

# Convert MonthYear back to datetime for plotting
net_sentiment['MonthYear'] = net_sentiment['MonthYear'].dt.to_timestamp()

# Create a new figure for net sentiment
fig2 = go.Figure()

# Add trace for net sentiment
fig2.add_trace(go.Scatter(x=net_sentiment['MonthYear'], y=net_sentiment['Net Sentiment'], 
                          mode='lines', name='Net Sentiment',
                          line=dict(color='blue', dash='dash')))

# Apply LOWESS smoothing
lowess_smoothed = lowess(net_sentiment['Net Sentiment'], net_sentiment['MonthYear'], frac=0.1)
lowess_x = pd.to_datetime(lowess_smoothed[:, 0])
lowess_y = lowess_smoothed[:, 1]

# Add trace for smoothed trend
fig2.add_trace(go.Scatter(x=lowess_x, y=lowess_y, 
                          mode='lines', name='Smoothed Trend',
                          line=dict(color='red', width=3)))


# Update layout for fig2
fig2.update_layout(title='Net Sentiment over Time', 
                   xaxis_title='Year', 
                   yaxis_title='Net Sentiment Score',
                   showlegend=True,
                   legend=dict(
                       x=0.5,  
                       y=-0.2,  
                       xanchor='center', 
                       yanchor='middle', 
                       orientation='h'  
                   ),
                   height=600, width=1000)

fig2.update_xaxes(nticks=20)

fig2.show()

In [22]:
import pandas as pd
import plotly.graph_objects as go
from statsmodels.nonparametric.smoothers_lowess import lowess


# Convert the Date column to datetime format
tr_minutes['Date'] = pd.to_datetime(tr_minutes['Date'])

# Create a MonthYear column for grouping
tr_minutes['MonthYear'] = tr_minutes['Date'].dt.to_period('M')

# Count the number of EU-specific speeches per month
eu_speech_count = tr_minutes.groupby(['MonthYear']).size().reset_index(name='EU Speech Count')

# Convert MonthYear back to datetime for plotting
eu_speech_count['MonthYear'] = eu_speech_count['MonthYear'].dt.to_timestamp()

# Create a new figure for the number of EU-specific speeches
fig4 = go.Figure()

# Add trace for the number of EU-specific speeches
fig4.add_trace(go.Scatter(x=eu_speech_count['MonthYear'], y=eu_speech_count['EU Speech Count'], 
                          mode='lines+markers', name='Number of EU Speeches',
                          line=dict(color='blue'), 
                          marker=dict(size=6, color='blue')))

# Apply LOWESS smoothing
lowess_smoothed = lowess(eu_speech_count['EU Speech Count'], eu_speech_count['MonthYear'].map(pd.Timestamp.toordinal), frac=0.1)
lowess_x = pd.to_datetime([pd.Timestamp.fromordinal(int(d)) for d in lowess_smoothed[:, 0]])
lowess_y = lowess_smoothed[:, 1]

# Add trace for smoothed trend
fig4.add_trace(go.Scatter(x=lowess_x, y=lowess_y, 
                          mode='lines', name='Smoothed Trend',
                          line=dict(color='red', width=3)))

# Define x-axis range and ticks more robustly
xaxis_range = [pd.to_datetime('2011-03-31'), pd.to_datetime('2022-12-31')]
xaxis_ticks = [pd.to_datetime(f'{year}-01-01') for year in range(2011, 2024)]

# Update layout for fig4
fig4.update_layout(
    title={
        'text': 'Monthly Number of EU References in the Serbian Parliament (2011-2023)', 
        'x': 0.5, 
        'xanchor': 'center', 
        'yanchor': 'top'
    }, 
    xaxis=dict(
        title='Year',
        range=xaxis_range,
        tickmode='array',
        tickvals=xaxis_ticks,
        ticktext=[str(year) for year in range(2011, 2024)]
    ),
    yaxis_title='Number of EU Speeches',
    showlegend=True,
    legend=dict(
        x=0.5,  
        y=-0.2,  
        xanchor='center', 
        yanchor='middle', 
        orientation='h'  
    ),
    height=600, width=1000
)

fig4.update_xaxes(nticks=20)

fig4.show()


In [19]:
import pandas as pd
file_path_all = '../data/pre-processed_tr.csv'
file_path_eu = '../data/tr_minutes_tks.csv'
tr_minutes_all = pd.read_csv(file_path_all)
tr_minutes_all['SpeechLength'] = tr_minutes_all['speech'].apply(len)

# Filter speeches where the length is more than 10
tr_minutes_all = tr_minutes_all[tr_minutes_all['SpeechLength'] > 10]
tr_minutes_eu = pd.read_csv(file_path_eu)

# Convert the Date columns to datetime format
tr_minutes_all['Date'] = pd.to_datetime(tr_minutes_all['Date'])
tr_minutes_eu['Date'] = pd.to_datetime(tr_minutes_eu['Date'])

# Create MonthYear columns for grouping
tr_minutes_all['MonthYear'] = tr_minutes_all['Date'].dt.to_period('M')
tr_minutes_eu['MonthYear'] = tr_minutes_eu['Date'].dt.to_period('M')

# Count the number of total speeches and EU-specific speeches per month
total_speech_count = tr_minutes_all.groupby(['MonthYear']).size().reset_index(name='Total Speech Count')
eu_speech_count = tr_minutes_eu.groupby(['MonthYear']).size().reset_index(name='EU Speech Count')

# Merge the counts based on MonthYear
speech_counts = pd.merge(total_speech_count, eu_speech_count, on='MonthYear', how='left').fillna(0)

# Convert MonthYear back to datetime for plotting
speech_counts['MonthYear'] = speech_counts['MonthYear'].dt.to_timestamp()

# Calculate the proportion of EU-specific speeches to total speeches
speech_counts['EU Speech Proportion'] = speech_counts['EU Speech Count'] / speech_counts['Total Speech Count']

# Create a new figure for the proportion of EU-specific speeches
fig3 = go.Figure()

# Add trace for the proportion of EU-specific speeches
fig3.add_trace(go.Scatter(x=speech_counts['MonthYear'], y=speech_counts['EU Speech Proportion'], 
                          mode='lines', name='Proportion of EU Speeches',
                          line=dict(color='blue', dash='dash')))

# Apply LOWESS smoothing
lowess_smoothed = lowess(speech_counts['EU Speech Proportion'], speech_counts['MonthYear'].map(pd.Timestamp.toordinal), frac=0.1)
lowess_x = pd.to_datetime([pd.Timestamp.fromordinal(int(d)) for d in lowess_smoothed[:, 0]])
lowess_y = lowess_smoothed[:, 1]

# Add trace for the smoothed trend
fig3.add_trace(go.Scatter(x=lowess_x, y=lowess_y, 
                          mode='lines', name='Smoothed Trend',
                          line=dict(color='red', width=3)))

# Define x-axis range and ticks
xaxis_range = [pd.to_datetime('2010-10-01'), pd.to_datetime('2022-08-30')]
xaxis_ticks = [pd.to_datetime(f'{year}-01-01') for year in range(2011, 2023)]

# Update layout for fig3
fig3.update_layout(
    title={
        'text': 'Monthly Proportions of EU References Relative to Total Speeches in the Turkish National Assembly (2011-2022)', 
        'x': 0.5, 
        'xanchor': 'center', 
        'yanchor': 'top'
    }, 
    xaxis=dict(
        range=xaxis_range,
        tickmode='array',
        tickvals=xaxis_ticks,
        ticktext=[str(year) for year in range(2011, 2023)]
    ),
    yaxis_title='Proportion of EU Speeches',
    showlegend=False,
    height=600, width=1000
)

fig3.update_xaxes(nticks=20)

fig3.show()
