<a href="https://colab.research.google.com/github/andraroman12/PMP-2024/blob/main/Examen/ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import pymc as pm
import arviz as az
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from google.colab import files


uploaded = files.upload()

file_name = list(uploaded.keys())[0]
data = pd.read_csv(file_name)


data['Varsta'] = (data['Varsta'] - data['Varsta'].mean()) / data['Varsta'].std()
data['Venit'] = (data['Venit'] - data['Venit'].mean()) / data['Venit'].std()

X = data[['Varsta', 'Sex', 'Educatie', 'Venit']]
y = data['Vot']

# Subpunctul a: Definirea modelului complet pentru toate variabilele independente
with pm.Model() as full_model:
    beta = pm.Normal("beta", mu=0, sigma=10, shape=X.shape[1])
    intercept = pm.Normal("intercept", mu=0, sigma=10)

    logits = pm.math.dot(X, beta) + intercept
    p = pm.Deterministic("p", pm.math.sigmoid(logits))

    y_obs = pm.Bernoulli("y_obs", p=p, observed=y)

    full_trace = pm.sample(2000, tune=1000, return_inferencedata=True)

summary_full = az.summary(full_trace, var_names=["beta"], hdi_prob=0.94)
print("Rezumatul coeficienților pentru modelul complet:\n", summary_full)

# Subpunctul b: Identificarea celor mai influente două variabile
coefficients = summary_full["mean"]
most_influential_indices = coefficients.abs().nlargest(2).index.tolist()
most_influential_vars = [X.columns[int(idx.split("_")[-1])] for idx in most_influential_indices]
print("\nCele mai influente două variabile sunt:", most_influential_vars)

X_reduced = data[most_influential_vars]

# Subpunctul c: Construirea modelului redus cu cele mai influente două variabile
with pm.Model() as reduced_model:
    beta = pm.Normal("beta", mu=0, sigma=10, shape=X_reduced.shape[1])
    intercept = pm.Normal("intercept", mu=0, sigma=10)

    logits = pm.math.dot(X_reduced, beta) + intercept
    p = pm.Deterministic("p", pm.math.sigmoid(logits))

    y_obs = pm.Bernoulli("y_obs", p=p, observed=y)

    reduced_trace = pm.sample(2000, tune=1000, return_inferencedata=True)

summary_reduced = az.summary(reduced_trace, var_names=["beta"], hdi_prob=0.94)
print("\nRezumatul coeficienților pentru modelul redus:\n", summary_reduced)

# Graficul graniței de decizie
x1 = np.linspace(X_reduced.iloc[:, 0].min(), X_reduced.iloc[:, 0].max(), 100)
x2 = np.linspace(X_reduced.iloc[:, 1].min(), X_reduced.iloc[:, 1].max(), 100)
xx1, xx2 = np.meshgrid(x1, x2)
logits_grid = (
    summary_reduced.loc["beta[0]", "mean"] * xx1
    + summary_reduced.loc["beta[1]", "mean"] * xx2
    + summary_reduced.loc["intercept", "mean"]
)
p_grid = 1 / (1 + np.exp(-logits_grid))

plt.figure(figsize=(10, 6))
plt.contourf(xx1, xx2, p_grid, levels=[0, 0.5, 1], alpha=0.3, colors=["blue", "red"])
plt.scatter(X_reduced.iloc[:, 0], X_reduced.iloc[:, 1], c=y, cmap="bwr", edgecolor="k")
plt.title("Granița de decizie și regiunea HDI (94%)")
plt.xlabel(most_influential_vars[0])
plt.ylabel(most_influential_vars[1])
plt.colorbar(label="Probabilitatea de a vota candidatul A")
plt.grid()
plt.show()

# Subpunctul d: Compararea modelelor folosind WAIC și LOO
waic_full = az.waic(full_trace)
waic_reduced = az.waic(reduced_trace)

loo_full = az.loo(full_trace)
loo_reduced = az.loo(reduced_trace)

print("\nCompararea modelelor folosind WAIC:")
print(f"Model complet: WAIC = {waic_full.waic:.2f}")
print(f"Model redus: WAIC = {waic_reduced.waic:.2f}")

print("\nCompararea modelelor folosind LOO:")
print(f"Model complet: LOO = {loo_full.loo:.2f}")
print(f"Model redus: LOO = {loo_reduced.loo:.2f}")

if waic_full.waic < waic_reduced.waic:
    print("\nConform WAIC, modelul complet este mai bun.")
else:
    print("\nConform WAIC, modelul redus este mai bun.")

if loo_full.loo < loo_reduced.loo:
    print("Conform LOO, modelul complet este mai bun.")
else:
    print("Conform LOO, modelul redus este mai bun.")


Te rog să încarci fișierul 'date_alegeri_turul2.csv'.


Saving date_alegeri_turul2.csv to date_alegeri_turul2 (5).csv


Output()

Output()

Rezumatul coeficienților pentru modelul complet:
           mean     sd  hdi_3%  hdi_97%  mcse_mean  mcse_sd  ess_bulk  \
beta[0]  0.290  0.128   0.047    0.525      0.002    0.001    4534.0   
beta[1] -0.844  0.250  -1.310   -0.371      0.004    0.003    3628.0   
beta[2]  0.615  0.155   0.327    0.903      0.002    0.002    4109.0   
beta[3]  1.004  0.141   0.747    1.275      0.002    0.001    5191.0   

         ess_tail  r_hat  
beta[0]    2838.0    1.0  
beta[1]    3154.0    1.0  
beta[2]    3004.0    1.0  
beta[3]    3342.0    1.0  


ValueError: invalid literal for int() with base 10: 'beta[3]'