#### Pre-processing

In [1]:
import pandas as pd 
import stanza
import re
eu_references_rs = pd.read_csv('../data/eu_references_rs.csv')
eu_references_tr = pd.read_csv('../data/eu_references_tr.csv')

In [2]:
# Load stopwords for Serbian
stopwords_file = '../data/serbian.txt'
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords_list_rs  = set([line.strip() for line in f])
    
# Load stopwords for Turkish
stopwords_file = '../data/stopwords-tr.txt'
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords_list = [line.strip() for line in f]

# Load proper nouns for Turkish
proper_nouns_file = '../data/isimler.txt'
with open(proper_nouns_file, 'r', encoding='utf-8') as f:
    proper_nouns = [line.strip().lower() for line in f]
    # Extend stopwords with proper nouns
stopwords_list.extend(proper_nouns)

In [3]:
from TurkishStemmer import TurkishStemmer
from nltk.tokenize import word_tokenize
# Function to preprocess Turkish text
def preprocess_text(text, stopwords_list, stemmer):
    
    text = re.sub(r'[^\w\s]', '', text) 
    text = re.sub(r'\d+', '', text)
    
    tokens = word_tokenize(text)
    
    tokens = [stemmer.stem(token.lower()) for token in tokens if token.lower() not in stopwords_list]
    
    return tokens

In [10]:
# Preprocessing for Serbian
def preprocess_text(text, stopwords_list, nlp_pipeline):
    # Check if text is NaN
    if pd.isna(text):
        return []
    
    # Remove punctuation and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Process text with Stanza
    doc = nlp_pipeline(text)
    
    # Extract lemmas and remove stopwords
    tokens = [word.lemma.lower() for sentence in doc.sentences for word in sentence.words if word.lemma.lower() not in stopwords_list_rs]
    return tokens

def process_rows(df, stopwords_list, nlp_pipeline):
    df['processed_speech'] = df['speech'].apply(lambda x: preprocess_text(x, stopwords_list_rs, nlp_pipeline))
    df.update(df)
    return df

In [8]:
# Function to count tokens in each list
def count_tokens(tokens_list, tokens_to_count):
    return sum(token in tokens_to_count for token in tokens_list)

-----

In [None]:
stemmer = TurkishStemmer()

eu_references_tr['tokens'] = eu_references_tr['speech'].apply(lambda x: preprocess_text(x, stopwords_list, stemmer))


In [None]:
import stanza
nlp = stanza.Pipeline(lang='sr', processors='tokenize,lemma')
rs_minutes_tks = process_rows(eu_references_rs, stopwords_list_rs, nlp)

print(rs_minutes_tks)

#### ECONOMIC TOKENS FOR SERBIA

In [3]:
serbian_econ = [
    'siromaštvo', 'ekonomski', 'fond', 'budžet', 'finansijski', 'tržište', 'dinar', 'evro', 'MMF',
    'sektor', 'resursi', 'dug', 'penzija', 'kredit', 'investicija', 'preduzeće', 'incentiv', 'PDV',
    'plata', 'proizvod', 'proizvodnja', 'Investbanka', 'infrastruktura', 'investitor',
    'uvoz', 'rad', 'trgovina','procvat', 'valuta', 'milion', 'porez', 'banka''radna snaga',
    'industrijski', 'plaćanje', 'trošak', 'pomoć', 'rast', 'eurozona',
    'izvoz', 'nezaposlenost', 'fiskalni', 'plaćanje', 'energija']

In [9]:
# Apply the function to each row in the processed_speech column
rs_minutes_tks['econ_counts'] = rs_minutes_tks['processed_speech'].apply(lambda x: count_tokens(x, serbian_econ))

#### GOVERNANCE TOKENS FOR SERBIA

In [4]:
serbian_gov = [ 
    "sistem", "ustavni", "lokalna samouprava", "sudija", "predkandidatura", "kandidat", "postupak",
    "demokratija", "demokrata", "pravna tekovina", "zakon", "referendum", "akt", "pravni",
    "usklađenost", "sud", "kriterijum", "zakonodavstvo", "korumpiran", "veto", "kabinet", 
    "autoritet", "arbitraža", "diplomatija", "pravno", "preduslov", "sticanje",
    "nadzor", "administracija", "koalicija", "birokratija", "transparentnost","povelja",  
    "vladavina prava", "nedeokratski", "izbori", "glasanje", "civilni", "režim", 
    "regulacija", "pravda", "suverenitet", "ilegalan", "tužba", "proces"
]


In [11]:
rs_minutes_tks['gov_counts'] = rs_minutes_tks['processed_speech'].apply(lambda x: count_tokens(x, serbian_gov))

#### IDENTITY TOKENS FOR SERBIA


In [5]:
serbian_id = ["narodni", "držav", "zastava", "ideologija", "panevropski", "albanski", "Kosovo", "identitet", "Slav"
              "ljudi", "nacija", "istorija", "čovek", "istorijski", "građanin", "granica", "kultura", "etnički",
              "socijalistički", "fašista", "fašistički", "nacionalista", "nacionalistički", "izbeglica",
              "azil", "nacionalnost", "migranti", "rasizam", "viza", "sekularan", "ksenofobija", "religija", "verski"]


In [13]:
rs_minutes_tks['id_counts'] = rs_minutes_tks['processed_speech'].apply(lambda x: count_tokens(x, serbian_id))

#### SECURITY TOKENS FOR SERBIA

In [6]:
serbian_sec = [  
  "vanredan", "snaga", "manjina", "strana", "strategije", "doktrine", "misije", "bezbednosni", "Samit", "vojni", "paravojni",
  "terorizam", "teroristički", "terorista", "teroristi", "Eulex", "pakta", "pakt", "paktom", "konflikata"
    "rat", "mir", "izgradnja mira", "bezbednost", "pretnja", "pretiti", "kriv", "osumnjičeni", "sajber", "borba", "sporazum",
    "zločin", "odbrana", "ekstreman", "trupa", "siguran", "bomba", "nestabilan", "stražar", "prekid vatre", "izbeglice",
    "invazija", "teror", "antiterorizam", "UN", "NATO", "Ženeva", "borba", "nasilje", "okupacija", "genocid", "nadzor"]

In [15]:
rs_minutes_tks['sec_counts'] = rs_minutes_tks['processed_speech'].apply(lambda x: count_tokens(x, serbian_sec))

In [16]:
rs_minutes_tks.to_csv('../data/rs_minutes_tks.csv', index=False)

-----------------


#### ECONOMIC TOKENS FOR TURKEY

In [37]:
turkish_econ = [
    "turizm", "tarım", "enflasyon", "döviz", "finans", "gümrük", "iflas", "lira", "dolar",  "mevduat", 
    "euro", "açık" , "kapitalist", "asgari", "aylık", "emek", "emekli", "merkez", "banka", "borsa", "fabrika"
    "ithal", "ihtalat", "iş", "meslek", "ticaret", "pazar", "büyüme", "para", "fon", "milyon", 
     "işgücü", "bütçe", "endüstriyel", "maliyet", "yardım", "büyüme", "kâr", "avro", "ihraç", "kredi",
    "ihracat", "işsizlik", "borç", "mali", "ödeme", "vergi", "enerji", "yoksulluk"
 ]

In [57]:
eu_references_tr['econ_counts'] = eu_references_tr['tokens'].apply(lambda x: count_tokens(x, turkish_econ))
display(eu_references_tr.sort_values(by='econ_counts', ascending=False).head(10))

Unnamed: 0,Speaker_name,Speaker_role,Speaker_MP,Speaker_party,Party_status,Party_orientation,Speaker_gender,Date,speech,tokens,econ_counts,gov_counts,id_counts,sec_counts
104,Mehmet Şimşek,Regular,MP,AKP,Coalition,Right,M,2011-12-08,17 Ekim 2011 tarihinde Türkiye Büyük Millet Me...,"[ek, tarih, türki, büyük, millet, meç, sunulan...",273,10,6,10
4137,Berat Albayrak,Regular,notMP,AKP,Coalition,Right,M,2018-12-10,2019 Yılı Merkezi Yönetim Bütçe Kanunu Teklifi...,"[yıl, merkez, yöne, bütç, kanun, teklif, yıl, ...",255,9,7,11
736,Mehmet Şimşek,Regular,MP,AKP,Coalition,Right,M,2012-12-10,17 Ekim 2012 tarihinde Türkiye Büyük Millet Me...,"[ek, tarih, türki, büyük, millet, meç, sunulan...",254,13,7,11
2005,Mehmet Şimşek,Regular,MP,AKP,Coalition,Right,M,2014-12-10,Bugün 2013 yılı Merkezi Yönetim Kesin Hesap Ka...,"[bugü, yıl, merkez, yöne, kes, hesap, kanun, t...",246,13,8,9
4465,Fuat Oktay,Regular,notMP,AKP,Coalition,Right,M,2019-12-09,Orta Doğu’daki petrol üreticilerine yönelik je...,"[or, doğu, petrol, üretici, yönelik, jeopoli, ...",238,9,11,18
4868,Fuat Oktay,Regular,notMP,AKP,Coalition,Right,M,2020-12-07,"2021 Yılı Merkezi Yönetim Bütçe Kanun Teklifi,...","[yıl, merkez, yöne, bütç, kanu, teklif, plan, ...",230,13,11,18
1654,Ayşenur Külahlıoğlu İslam,Regular,MP,AKP,Coalition,Right,F,2014-04-29,Sayın Reşat Doğru’nun sorusuyla başlıyorum. Sa...,"[say, doğr, soru, başlıyor, say, noter, ücret,...",230,8,9,12
5221,Fuat Oktay,Regular,notMP,AKP,Coalition,Right,M,2021-12-06,Bütçelerimiz Cumhurbaşkanımız liderliğinde ülk...,"[bütçe, cumhurbaşkan, liderlik, ülke, bugün, e...",225,8,9,8
3,Recep Tayyip Erdoğan,Regular,MP,AKP,Coalition,Right,M,2011-07-08,"Bu vesileyle, başta cumhuriyetimizin kurucusu ...","[vesile, baş, cumhuriyet, kurucu, gazi, atatür...",190,20,17,20
1402,Mehmet Şimşek,Regular,MP,AKP,Coalition,Right,M,2013-12-10,"Devletin teşkilat yapısına, hazine birliği ilk...","[devl, teşkilat, yapı, hazin, birlik, ilke, ge...",189,11,4,6


#### GOVERNANCE TOKENS FOR TURKEY


In [39]:
turkish_gov = [
    "müktesebat", "ihale", " politika", "yasa", "kamu", "önerge", "komisyon", "ortak", 'rapor',
    "kriter", "demokrasi","demokrat", "kazanım", "referandum", "kanun", "hukuki", "mahkeme", "direktif",
    "kriter", "yasama", "yolsuzluk", "veto", "kabine", "şartname", "otokrat", "arabulucu", "anayasa", 
    "diplomasi", "önkoşul", "denetim", "yönetim", "koalisyon", "bürokrasi", "şeffaflık", "dava", "tüzük",
     "seçim", "sivil","rejim", "düzenleme","adalet", "egemenlik", "yasadışı" 
]

In [None]:
eu_references_tr['gov_counts'] = eu_references_tr['tokens'].apply(lambda x: count_tokens(x, turkish_gov))


#### IDENTITY TOKENS FOR TURKEY


In [41]:
turkish_id = [
    "kürt", "millî", "halk", "toplumsal", "müslüman", "islam", "aile", "avrupacılık", "insanlık",
    "uyum", "entegrasyon", "yabancı", "gelenek", "dil", "azınlık", "diaspora", "entegrasyon",
    "toplum", "ulus", "tarih", "insan", "tarihi", "vatandaş", "sınır", "kültür", "sosyalist", "faşist",
    "milliyetçi", "mülteci", "sığınma", "göçmen", "bayrak", "ırkçılık", "vize", "laik", "adet",
    "din", "kimlik", "dini"
    ]

In [None]:
eu_references_tr['id_counts'] = eu_references_tr['tokens'].apply(lambda x: count_tokens(x, turkish_id))

#### SECURITY TOKENS FOR TURKEY

In [43]:
turkish_sec = [
    "terör", "örgüt", "suriye", "silah", "kuvvet", "sınır", "devlet", "asker", "irak", "güvenlik", 
    "askerî", "güç", "saldırı", "silahlı", "darbe", "çatışma", "ordu", "özgürlük", "cezaevi", 
    "hapishane", "şehit", "gözaltı", "fail", "katliam",  "suç", "dava", "cinayet", "fetö", 
    "savaş", "barış", "güvenlik", "stabil", "dayanıklılık", "savunma", "ekstrem", "tehdit", 
    "suç", "suçlu", "instabil", "hapis", "birlik", "emniyet", "bomba", "koruma", "soykırım",
    "saldırgan", "tutuklu", "katil", "şiddet", "ateşkes", "istila", "soykırım", 
    "savaşmak", "düşman", "Geneva", "un", "nato", "terörist", "kaçak", "kaçakçılık",  "işgal"
]

In [None]:
eu_references_tr['sec_counts'] = eu_references_tr['tokens'].apply(lambda x: count_tokens(x, turkish_sec))
display(eu_references_tr.sort_values(by='sec_counts', ascending=False).head(10))

In [60]:
#write as csv file
eu_references_tr.drop(columns=['tokens']).to_csv('../data/tr_minutes_tks.csv', index=False)

In [4]:
non_eu_turkish = pd.read_csv('../data/diff_df.csv')

In [49]:
non_eu_turkish['id_counts'] = non_eu_turkish['tokens'].apply(lambda x: count_tokens(x, turkish_id))
non_eu_turkish['econ_counts'] = non_eu_turkish['tokens'].apply(lambda x: count_tokens(x, turkish_econ))
non_eu_turkish['gov_counts'] = non_eu_turkish['tokens'].apply(lambda x: count_tokens(x, turkish_gov))
non_eu_turkish['sec_counts'] = non_eu_turkish['tokens'].apply(lambda x: count_tokens(x, turkish_sec))
non_eu_turkish.drop(columns=['tokens']).to_csv('../data/non_eu_turkish.csv', index=False)

In [None]:
import pandas as pd
import re
import stanza
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load stopwords
stopwords_file = '../data/serbian.txt'
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords_list_rs = set(line.strip().lower() for line in f)  # Ensure stopwords are in lowercase

# Initialize Stanza pipeline
nlp = stanza.Pipeline(lang='sr', processors='tokenize,lemma')

def preprocess_text(text, stopwords_list, nlp_pipeline):
    # Check if text is NaN
    if pd.isna(text):
        return []
    
    # Remove punctuation and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Process text with Stanza
    doc = nlp_pipeline(text)
    
    # Extract lemmas and remove stopwords
    tokens = [
        word.lemma.lower() for sentence in doc.sentences
        for word in sentence.words
        if word.lemma.lower() not in stopwords_list
    ]
    return tokens

def process_texts_batch(texts, stopwords_list, nlp_pipeline):
    return [preprocess_text(text, stopwords_list, nlp_pipeline) for text in texts]

def process_rows(df, stopwords_list, nlp_pipeline, batch_size=100):
    num_batches = (len(df) + batch_size - 1) // batch_size
    results = []
    
    # Process each batch in parallel
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_texts_batch, df['speech'].iloc[i*batch_size:(i+1)*batch_size].tolist(), stopwords_list, nlp_pipeline)
                   for i in range(num_batches)]
        
        for future in as_completed(futures):
            results.extend(future.result())
    
    df['processed_speech'] = results
    return df

# Read the first 10 rows for demonstration
non_eu_serbian = pd.read_csv('../data/non_eu_speeches_rs.csv')
non_eu_serbian = non_eu_serbian[non_eu_serbian['speech'].str.len() > 100]
# sample non_eu_serbian as the size of eu_references_rs 
sample_size = len(eu_references_rs)

# Randomly sample rows from non_eu_serbian
sampled_non_eu_serbian = non_eu_serbian.sample(n=sample_size, random_state=42)
# Process rows with parallel processing
sampled_non_eu_serbian = process_rows(sampled_non_eu_serbian, stopwords_list_rs, nlp)

In [9]:
sampled_non_eu_serbian['id_counts'] = sampled_non_eu_serbian['processed_speech'].apply(lambda x: count_tokens(x, serbian_id))
sampled_non_eu_serbian['econ_counts'] = sampled_non_eu_serbian['processed_speech'].apply(lambda x: count_tokens(x, serbian_econ))
sampled_non_eu_serbian['gov_counts'] = sampled_non_eu_serbian['processed_speech'].apply(lambda x: count_tokens(x, serbian_gov))
sampled_non_eu_serbian['sec_counts'] = sampled_non_eu_serbian['processed_speech'].apply(lambda x: count_tokens(x, serbian_sec))

In [None]:
sampled_non_eu_serbian.drop(columns=['processed_speech']).to_csv('../data/non_eu_serbian.csv', index=False)

In [15]:
import pandas as pd
import plotly.graph_objects as go
from statsmodels.nonparametric.smoothers_lowess import lowess

# Read CSV file
tr_mins = pd.read_csv('../data/tr_minutes_tks.csv')

# Convert 'Date' column to datetime
tr_mins['Date'] = pd.to_datetime(tr_mins['Date'])
tr_mins['MonthYear'] = tr_mins['Date'].dt.to_period('M')

# Total speeches per month
total_speeches = tr_mins.groupby(['MonthYear']).size().reset_index(name='Total Speeches')

# Filter for IPA speeches
ipa_speeches = tr_mins[
    tr_mins['speech'].str.contains('IPA', regex=True) | tr_mins['speech'].str.contains('Erasmus', regex=True) | tr_mins['speech'].str.contains('Horizon', regex=True) |
    tr_mins['speech'].str.contains('Katılım Öncesi Mali Yardım Aracı', regex=True)
]
ipa_speech_count = ipa_speeches.groupby(['MonthYear']).size().reset_index(name='IPA Speech Count')

# Merge IPA speech count with total speech count
speech_count = pd.merge(total_speeches, ipa_speech_count, on='MonthYear', how='left').fillna(0)

# Calculate the ratio of IPA references to total speeches
speech_count['IPA Ratio'] = speech_count['IPA Speech Count'] / speech_count['Total Speeches']

# Convert MonthYear back to datetime for plotting
speech_count['MonthYear'] = speech_count['MonthYear'].dt.to_timestamp()

# Create a new figure for ratio of IPA references
fig3 = go.Figure()

# Add trace for ratio of IPA references
fig3.add_trace(go.Scatter(x=speech_count['MonthYear'], y=speech_count['IPA Ratio'], 
                          mode='lines', name='IPA Reference Ratio',
                          line=dict(color='blue', dash='dash')))

# Define xaxis range and ticks
xaxis_range = [pd.to_datetime('2010-10-01'), pd.to_datetime('2022-08-30')]
xaxis_ticks = [pd.to_datetime(f'{year}-01-01') for year in range(2011, 2023)]


fig3.update_layout(title={
    'text': 'IPA References Ratio in the Turkish Parliament (2011-2022)', 
    'x': 0.5,
    'xanchor': 'center', 
    'yanchor': 'top' 
}, 
    xaxis_title='Year', 
    yaxis_title='Number of IPA-related Speeches',
    xaxis=dict(
        range=xaxis_range,
        tickmode='array',
        tickvals=xaxis_ticks,
        ticktext=[str(year) for year in range(2011, 2023)]
    ),
    showlegend=True,
    height=600, width=1000
)

fig3.update_xaxes(nticks=20)

fig3.show()


In [16]:
import pandas as pd
import plotly.graph_objects as go
from statsmodels.nonparametric.smoothers_lowess import lowess

# Read CSV file
rs_mins = pd.read_csv('../data/rs_minutes_tks.csv')

# Convert 'Date' column to datetime
rs_mins['Date'] = pd.to_datetime(rs_mins['Date'])
rs_mins['MonthYear'] = rs_mins['Date'].dt.to_period('M')

# Total speeches per month
total_speeches = rs_mins.groupby(['MonthYear']).size().reset_index(name='Total Speeches')

# Filter for IPA speeches
ipa_speeches = rs_mins[
    rs_mins['speech'].str.contains('IPA', regex=True) | rs_mins['speech'].str.contains('Erasmus', regex=True) | rs_mins['speech'].str.contains('Horizon', regex=True) |
    rs_mins['speech'].str.contains('Instrument pretpristupne finansijske pomoći', regex=True)
]
ipa_speech_count = ipa_speeches.groupby(['MonthYear']).size().reset_index(name='IPA Speech Count')

# Merge IPA speech count with total speech count
speech_count = pd.merge(total_speeches, ipa_speech_count, on='MonthYear', how='left').fillna(0)

# Calculate the ratio of IPA references to total speeches
speech_count['IPA Ratio'] = speech_count['IPA Speech Count'] / speech_count['Total Speeches']

# Convert MonthYear back to datetime for plotting
speech_count['MonthYear'] = speech_count['MonthYear'].dt.to_timestamp()

# Create a new figure for ratio of IPA references
fig3 = go.Figure()

# Add trace for ratio of IPA references
fig3.add_trace(go.Scatter(x=speech_count['MonthYear'], y=speech_count['IPA Ratio'], 
                          mode='lines', name='IPA Reference Ratio',
                          line=dict(color='blue', dash='dash')))

# Define xaxis range and ticks
xaxis_range = [pd.to_datetime('2010-10-01'), pd.to_datetime('2022-08-30')]
xaxis_ticks = [pd.to_datetime(f'{year}-01-01') for year in range(2011, 2023)]

# Update layout for fig3
fig3.update_layout(title={
    'text': 'IPA References Ratio in the Serbian Parliament (2011-2022)', 
    'x': 0.5,  # Center horizontally
    'xanchor': 'center',  # Anchor the title at its center
    'yanchor': 'top'  # Anchor the title at the top
}, 
    xaxis_title='Year', 
    yaxis_title='Number of IPA-related Speeches',  # Fixed unmatched single quote
    xaxis=dict(
        range=xaxis_range,
        tickmode='array',
        tickvals=xaxis_ticks,
        ticktext=[str(year) for year in range(2011, 2023)]
    ),
    showlegend=True,
    height=600, width=1000
)

fig3.update_xaxes(nticks=20)

fig3.show()
