In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
alli_df = pd.read_csv("alliance_results.csv")
info_df = pd.read_csv("info.csv")
party_df = pd.read_csv("party_results.csv")
pres_df = pd.read_csv("presidency_results.csv")
polls = pd.read_csv("polls_for_2023.csv")
polls_2018 = pd.read_csv("polls_2018.csv")
pres_polls = pd.read_csv("presidency_polls_2023.csv")
second_round = pd.read_csv("second_round.csv")

In [3]:
pres_polls.head()

Unnamed: 0,Tarih,Anket şirketi,Örneklem,Erdoğan,İnce,Kılıçdaroğlu,Oğan
0,Mayıs ayı ortalaması,Mayıs ayı ortalaması,40000,4717,343,4673,267
1,28 Nisan-2 Mayıs,İvem,4156,478,33,459,30
2,27 Nisan-2 Mayıs,ASAL,2523,491,27,463,19
3,29 Nisan-1 Mayıs,ORC,3950,446,43,480,31
4,Nisan ayı ortalaması,Nisan ayı ortalaması,40000,4506,544,4717,233


In [4]:
pres_polls.drop(pres_polls.columns[[0,1]], axis=1, inplace=True)
pres_polls.head(), pres_polls.columns

(   Örneklem Erdoğan  İnce Kılıçdaroğlu  Oğan
 0     40000   47,17  3,43        46,73  2,67
 1      4156    47,8   3,3         45,9   3,0
 2      2523    49,1   2,7         46,3   1,9
 3      3950    44,6   4,3         48,0   3,1
 4     40000   45,06  5,44        47,17  2,33,
 Index(['Örneklem', 'Erdoğan', 'İnce', 'Kılıçdaroğlu', 'Oğan'], dtype='object'))

In [5]:
for column in pres_polls.columns:
    if pres_polls[column].dtype == 'object':  # Check if column contains string values
        pres_polls[column] = pres_polls[column].str.replace("+", "").str.replace(",", ".").astype(float)
    else:
        pres_polls[column] = pres_polls[column].astype(float)
pres_polls.head()

  pres_polls[column] = pres_polls[column].str.replace("+", "").str.replace(",", ".").astype(float)


Unnamed: 0,Örneklem,Erdoğan,İnce,Kılıçdaroğlu,Oğan
0,40000.0,47.17,3.43,46.73,2.67
1,4156.0,47.8,3.3,45.9,3.0
2,2523.0,49.1,2.7,46.3,1.9
3,3950.0,44.6,4.3,48.0,3.1
4,40000.0,45.06,5.44,47.17,2.33


In [10]:
def calculate_weight(row, prev_participants=47239370, threshold=1.5, max_weight=1.5):

    # Decrease the weight for every datapoint
    weight = 0.95 - row.name * 0.001

    # Multiply the weight by a factor that increases as the number of participants increases
    weight *= row["Örneklem"] / pres_polls["Örneklem"].mean()

    # Calculate the ratio of the number of participants in the current poll to the previous election
    ratio = row["Örneklem"] / prev_participants

    # Cap the ratio at the threshold value if it is greater than that value
    if ratio > threshold:
        ratio = threshold

    # Add a term that adjusts the weight based on the ratio of the number of participants in the current poll to the previous election
    weight *= 1 + np.log(ratio + 1) * 0.001

    # Cap the weight adjustment term at a maximum value
    if weight > max_weight:
        weight = max_weight

    return weight


# Apply the calculate_weight function to the index of the DataFrame to calculate the weight of each datapoint
#polls["weight"] = polls.index.map(calculate_weight)
prev_participants = 47239370
# Replace with the number of participants in the previous election

pres_polls["weight"] = pres_polls.apply(calculate_weight, args=(prev_participants,), axis=1)

pres_polls["weight"] = pres_polls["weight"].astype(float)



In [11]:
weighted_avg_pres_polls_2023 = pd.DataFrame(index=["weighted_avg"])

for candidate in pres_polls.columns[1:-1]:
    # Exclude rows with missing values or zero values for the current party column
    valid_rows = (pres_polls[candidate].notnull()) & (pres_polls[candidate] != 0)
    # Calculate the weighted average for the valid rows only
    if valid_rows.any():
        weighted_avg = (pres_polls.loc[valid_rows, candidate] * pres_polls.loc[valid_rows, "weight"]).sum() / pres_polls.loc[valid_rows, "weight"].sum()
    else:
        # Calculate the average value of the party column across all rows
        avg_value = pres_polls[candidate].mean()
        # Calculate the average weight of rows where the party column has a value
        avg_weight = pres_polls.loc[pres_polls[candidate].notnull(), "weight"].mean()
        # Calculate the half-weighted average for the missing rows
        weighted_avg = 0.5 * (avg_value + (avg_weight * weighted_avg_pres_polls_2023.loc["weighted_avg", candidate]))
    weighted_avg_pres_polls_2023.loc["weighted_avg", candidate] = weighted_avg

# Normalize the values to ensure the total percentage is 100%
total_percentage = weighted_avg_pres_polls_2023.loc["weighted_avg"].sum()
weighted_avg_pres_polls_2023.loc["weighted_avg"] = (100 / total_percentage) * weighted_avg_pres_polls_2023.loc["weighted_avg"]


In [12]:
weighted_avg_pres_polls_2023

Unnamed: 0,Erdoğan,İnce,Kılıçdaroğlu,Oğan
weighted_avg,46.046905,5.210839,46.519451,2.222805


In [13]:
second_round.head()

Unnamed: 0,Tarih,Anket şirketi,Örneklem,Erdoğan,Kılıçdaroğlu
0,Mayıs ayı ortalaması,Mayıs ayı ortalaması,2573,508,492
1,27 Nisan-2 Mayıs,ASAL,2573,508,492
2,Nisan ayı ortalaması,Nisan ayı ortalaması,40000,4879,5121
3,25-29 Nisan,Optimar[a 1],3005,514,486
4,26-28 Nisan,Aksoy[a 2],1537,458,542


In [14]:
second_round.drop(second_round.columns[[0,1]], axis=1, inplace=True)
second_round.head(), second_round.columns

(   Örneklem Erdoğan Kılıçdaroğlu
 0      2573    50,8         49,2
 1      2573    50,8         49,2
 2     40000   48,79        51,21
 3      3005    51,4         48,6
 4      1537    45,8         54,2,
 Index(['Örneklem', 'Erdoğan', 'Kılıçdaroğlu'], dtype='object'))

In [15]:
for column in second_round.columns:
    if second_round[column].dtype == 'object':  # Check if column contains string values
        second_round[column] = second_round[column].str.replace("+", "").str.replace(",", ".").astype(float)
    else:
        second_round[column] = second_round[column].astype(float)
second_round.head()

  second_round[column] = second_round[column].str.replace("+", "").str.replace(",", ".").astype(float)


Unnamed: 0,Örneklem,Erdoğan,Kılıçdaroğlu
0,2573.0,50.8,49.2
1,2573.0,50.8,49.2
2,40000.0,48.79,51.21
3,3005.0,51.4,48.6
4,1537.0,45.8,54.2


In [16]:
def calculate_weight(row, prev_participants=47239370, threshold=1.5, max_weight=1.5):

    # Decrease the weight for every datapoint
    weight = 0.95 - row.name * 0.001

    # Multiply the weight by a factor that increases as the number of participants increases
    weight *= row["Örneklem"] / pres_polls["Örneklem"].mean()

    # Calculate the ratio of the number of participants in the current poll to the previous election
    ratio = row["Örneklem"] / prev_participants

    # Cap the ratio at the threshold value if it is greater than that value
    if ratio > threshold:
        ratio = threshold

    # Add a term that adjusts the weight based on the ratio of the number of participants in the current poll to the previous election
    weight *= 1 + np.log(ratio + 1) * 0.001

    # Cap the weight adjustment term at a maximum value
    if weight > max_weight:
        weight = max_weight

    return weight


# Apply the calculate_weight function to the index of the DataFrame to calculate the weight of each datapoint
#polls["weight"] = polls.index.map(calculate_weight)
prev_participants = 47239370
# Replace with the number of participants in the previous election

second_round["weight"] = second_round.apply(calculate_weight, args=(prev_participants,), axis=1)

second_round["weight"] = second_round["weight"].astype(float)



In [17]:
weighted_avg_second_round = pd.DataFrame(index=["weighted_avg"])

for candidate in second_round.columns[1:-1]:
    # Exclude rows with missing values or zero values for the current party column
    valid_rows = (second_round[candidate].notnull()) & (second_round[candidate] != 0)
    # Calculate the weighted average for the valid rows only
    if valid_rows.any():
        weighted_avg = (second_round.loc[valid_rows, candidate] * second_round.loc[valid_rows, "weight"]).sum() / second_round.loc[valid_rows, "weight"].sum()
    else:
        # Calculate the average value of the party column across all rows
        avg_value = second_round[candidate].mean()
        # Calculate the average weight of rows where the party column has a value
        avg_weight = second_round.loc[second_round[candidate].notnull(), "weight"].mean()
        # Calculate the half-weighted average for the missing rows
        weighted_avg = 0.5 * (avg_value + (avg_weight * weighted_avg_second_round.loc["weighted_avg", candidate]))
    weighted_avg_second_round.loc["weighted_avg", candidate] = weighted_avg

# Normalize the values to ensure the total percentage is 100%
total_percentage = weighted_avg_second_round.loc["weighted_avg"].sum()
weighted_avg_second_round.loc["weighted_avg"] = (100 / total_percentage) * weighted_avg_second_round.loc["weighted_avg"]


In [18]:
weighted_avg_second_round

Unnamed: 0,Erdoğan,Kılıçdaroğlu
weighted_avg,49.235175,50.764825
