# Aluna: Alana Viana
# Tarefa Análise de Votos

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import xgboost as xgb
from scipy import stats
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn import svm, preprocessing
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

%config InlineBackend.figure_format = 'retina' # set 'png' here when working on notebook
%matplotlib inline

In [None]:
# Read both election CSV files, using the "nome" column as the row index.
df_2006_2010, df_2014 = (
    pd.read_csv(csv_path, index_col="nome")
    for csv_path in ("data/eleicoes_2006_a_2010.csv", "data/eleicoes_2014.csv")
)

# Stack the two frames and discard the "sequencial_candidato" column.
# "ano" is kept on purpose: it is the key used further down to split the
# rows into train and test sets.
all_data = pd.concat([df_2006_2010, df_2014]).drop(columns=["sequencial_candidato"])

## Pré-Processamento

(tratamento de valores ausentes, variáveis categóricas e normalização, quando for o caso).

#### Normalização 

In [None]:
# Apply log1p to every strongly right-skewed numeric column (skewness > 0.75).
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewness = all_data[numeric_feats].apply(lambda col: skew(col.dropna()))  # NaNs ignored
# "ano" is the key used later to split the rows by election year
# (ano == 2006 / 2010 / 2014), so it must never be rewritten by a
# data-dependent transform; it is removed from the candidates explicitly.
skewed_feats = skewness[skewness > 0.75].index.drop("ano", errors="ignore")
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

#### Dummies e NA

In [None]:
# One-hot encode the non-numeric columns of the combined frame, then replace
# the remaining missing values with the mean of their column.
encoded = pd.get_dummies(all_data)
all_data = encoded.fillna(encoded.mean())

In [None]:
# Row masks for each split, all based on the election year column.
is_2006 = all_data["ano"] == 2006
is_2010 = all_data["ano"] == 2010
is_2014 = all_data["ano"] == 2014
not_2014 = all_data["ano"] != 2014

# Columns that are never model inputs: the target and the split key.
non_features = ["votos", "ano"]

# Scenario 1: train on 2006, test on 2010.
X_train_2006 = all_data[is_2006].drop(columns=non_features)
y_train_2006 = all_data.loc[is_2006, "votos"]

X_test_2010 = all_data[is_2010].drop(columns=non_features)
y_test_2010 = all_data.loc[is_2010, "votos"]

# Scenario 2: train on everything except 2014, test on 2014.
X_train_2006_2010 = all_data[not_2014].drop(columns=non_features)
y_train_2006_2010 = all_data.loc[not_2014, "votos"]

X_test_2014 = all_data[is_2014].drop(columns=non_features)
y_test_2014 = all_data.loc[is_2014, "votos"]

###  Variáveis e métodos auxiliares

In [None]:
def rmse_cv(model, X, y, cv=5):
    """Return the cross-validated RMSE of ``model`` for each fold.

    Args:
        model: Estimator passed unchanged to ``cross_val_score``.
        X: Feature matrix.
        y: Target values.
        cv: Cross-validation setting forwarded to ``cross_val_score``;
            defaults to 5 folds, the value previously hard-coded here.

    Returns:
        Array with one RMSE value per fold.
    """
    # The scorer yields negated MSE values; flip the sign before the root.
    neg_mse = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=cv)
    return np.sqrt(-neg_mse)

In [None]:
# Empty summary table; each evaluated model later contributes one row.
result_columns = [
    "modelo",
    "r2_treino",
    "r2_teste",
    "rmse_cross_treino",
    "rmse_cross_treino_std",
    "rmse_cross_teste",
    "rmse_cross_teste_std",
]
df_resultado = pd.DataFrame(columns=result_columns)

## Modelos de ML 

Modelos treinados com dados de 2006, testados com dados de 2010 e comparados entre si

### 1 Modelo: Ridge

In [None]:
# Candidate regularization strengths, in ascending order and without repeats
# (the previous list contained 0.1 twice and 0.05 out of order, which produced
# a duplicated index label and a redundant cross-validation run).
# The higher the regularization, the less prone the model is to overfit.
alphas = [0.00001, 0.0001, 0.001, 0.01, 0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 60, 65]

# Mean cross-validated RMSE on the 2006 training rows for every alpha.
cv_rmse_ridge = pd.Series(
    [rmse_cv(Ridge(alpha=alpha), X_train_2006, y_train_2006).mean() for alpha in alphas],
    index=alphas,
)

In [None]:
cv_ridge = pd.Series(cv_rmse_ridge, index = alphas)
cv_ridge.plot(title = "Validation - rmse x alpha")
plt.xlabel("alpha")
plt.ylabel("rmse")

In [None]:
print("rmse Ridge: {}".format(cv_ridge.min()))

In [None]:
# Alpha value(s) with the lowest cross-validated RMSE.
# The boolean mask is used directly as the index: wrapping it in an extra
# list (index[[mask]]) is rejected by recent NumPy versions.
print("alpha com menor RMSE: {}".format(cv_ridge.index[cv_ridge.values == cv_ridge.min()]))

In [None]:
# Final Ridge model, fitted on the 2006 rows. alpha=10 is hard-coded here
# rather than read from the alpha search results.
model_ridge = Ridge(alpha=10)
model_ridge.fit(X_train_2006, y_train_2006)

In [None]:
# Predictions for the 2010 rows.
y_pred_ridge = model_ridge.predict(X_test_2010)

In [None]:
# Score of the fitted model on the rows it was trained on.
r2_treino = model_ridge.score(X_train_2006, y_train_2006)
print("Ridge - R2- Treino: {}".format(r2_treino))

In [None]:
# Score of the fitted model on the 2010 rows.
r2_teste = model_ridge.score(X_test_2010, y_test_2010)
print("Ridge - R2 - Teste: {}".format(r2_teste))

In [None]:
# 5-fold cross-validation on the 2006 rows.
# NOTE(review): no square root is taken, so these are mean squared errors
# even though the variable name and the printed label say RMSE.
rmse_cross_treino = -cross_val_score(model_ridge, X_train_2006, y_train_2006, scoring = "neg_mean_squared_error", cv = 5 )
print("Ridge - RMSE - Cross validation - no treino: {} ".format(rmse_cross_treino.mean()))

In [None]:
# 5-fold cross-validation run on the 2010 rows only; cross_val_score fits
# the estimator on folds of the data it is given, so the 2006 fit above is
# not what is being scored here. Same MSE-vs-RMSE caveat as above.
rmse_cross_teste = -cross_val_score(model_ridge, X_test_2010, y_test_2010, scoring = "neg_mean_squared_error", cv = 5 )
print("Ridge - RMSE - Cross validation - no teste: {} ".format(rmse_cross_teste.mean()))

In [None]:
# Add the Ridge metrics as a new row of the summary table.
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat, which also works on older pandas versions.
ridge_row = {
    "modelo": "ridge",
    "r2_treino": r2_treino,
    "r2_teste": r2_teste,
    "rmse_cross_treino": rmse_cross_treino.mean(),
    "rmse_cross_treino_std": rmse_cross_treino.std(),
    "rmse_cross_teste": rmse_cross_teste.mean(),
    "rmse_cross_teste_std": rmse_cross_teste.std(),
}
df_resultado = pd.concat([df_resultado, pd.DataFrame([ridge_row])], ignore_index=True)

In [None]:
df_resultado

#### Ridge Coeficientes 

In [None]:
coef_ridge = pd.Series(model_ridge.coef_, index = X_train_2006.columns)
print("Ridge picked " + str(sum(coef_ridge != 0)) + " variables and eliminated the other " +  str(sum(coef_ridge == 0)) + " variables")

In [None]:
imp_coef_ridge = pd.concat([coef_ridge.sort_values().head(10),
                     coef_ridge.sort_values().tail(10)])

matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef_ridge.plot(kind = "barh")
plt.title("Coefficients in the Ridge Model")

In [None]:
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":model_ridge.predict(X_train_2006), "true": y_train_2006})
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x = "preds", y = "residuals", kind = "scatter", title = "Ridge - residuals x preds")

### 2 Modelo: Lasso

In [None]:
model_lasso = LassoCV(alphas = [1, 0.1, 0.001, 0.0005], max_iter=60000000).fit(X_train_2006, y_train_2006)

In [None]:
r2_treino_lasso = model_lasso.score(X_train_2006, y_train_2006)
print("Lasso - R2 - treino: {}".format(r2_treino_lasso))

In [None]:
r2_teste_lasso = model_lasso.score(X_test_2010, y_test_2010)
print("Lasso - R2 - Teste: {}".format(r2_teste_lasso))

In [None]:
rmse_cross_treino_lasso = -cross_val_score(model_lasso, X_train_2006, y_train_2006, scoring = "neg_mean_squared_error", cv = 5 )
print("Lasso - RMSE - Cross validation - Treino: {} ".format(rmse_cross_treino_lasso.mean()))

In [None]:
rmse_cross_teste_lasso = -cross_val_score(model_lasso, X_test_2010, y_test_2010, scoring = "neg_mean_squared_error", cv = 5 )
print("Lasso - RMSE - Cross validation - Treino: {} ".format(rmse_cross_teste_lasso.mean()))

In [None]:
# Add the Lasso metrics as a new row of the summary table.
# Two fixes: the test std now comes from the Lasso scores (it previously
# used the Ridge variable rmse_cross_teste), and the row is added with
# pd.concat because DataFrame.append was removed in pandas 2.0.
lasso_row = {
    "modelo": "lasso",
    "r2_treino": r2_treino_lasso,
    "r2_teste": r2_teste_lasso,
    "rmse_cross_treino": rmse_cross_treino_lasso.mean(),
    "rmse_cross_treino_std": rmse_cross_treino_lasso.std(),
    "rmse_cross_teste": rmse_cross_teste_lasso.mean(),
    "rmse_cross_teste_std": rmse_cross_teste_lasso.std(),
}
df_resultado = pd.concat([df_resultado, pd.DataFrame([lasso_row])], ignore_index=True)

In [None]:
df_resultado

#### Lasso Coeficientes

In [None]:
coef_lasso = pd.Series(model_lasso.coef_, index = X_train_2006.columns)

In [None]:
print("Lasso picked " + str(sum(coef_lasso != 0)) + " variables and eliminated the other " + str(sum(coef_lasso == 0)) + " variables")

In [None]:
imp_coef_lasso = pd.concat([coef_lasso.sort_values().head(10), coef_lasso.sort_values().tail(10)])

In [None]:
matplotlib.rcParams['figure.figsize'] = (6.0, 8.0)
imp_coef_lasso.plot(kind = "barh")
plt.title("Coefficients in the Ridge Model")

In [None]:
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":model_lasso.predict(X_train_2006), "true": y_train_2006})
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x = "preds", y = "residuals", kind = "scatter", title = "Ridge - residuals x preds")

### 3 Modelo: SVR

In [None]:
# commented out because it takes a while to run

#rmse_lista = []
#c_lista = []
#for i in range(1,3):
#    model_svr = svm.SVR(kernel='rbf', C = i)
#    model_svr.fit(X_train_2006, y_train_2006)
#    score = model_svr.score(X_train_2006, y_train_2006)
#    rmse_lista.append(model_svr.score(X_train_2006, y_train_2006))
#    c_lista.append(i)
#    
#s = pd.Series(rmse_lista, index = c_lista)

In [None]:
#plt.xlabel("Parâmetro C")
#plt.ylabel("RMSE")
#plt.title("RMSE x Parametro C")
#plt.plot(s.index, s.values)

O parâmetro C = 1 foi o que teve o menor RMSE no treino

In [None]:
#def testar_kernels(kernels):
#    for kernel in kernels:
#        classifier = svm.SVR(kernel=kernel, C=1)
#        cv_svr = rmse_cv(classifier, X_train_2006, y_train_2006).mean()
#        print("SVR RMSE CROSS: {}".format(cv_svr))
        
#['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
#kernels = ['poly', 'rbf', 'sigmoid'] # o tempo de processamento do kernel linear é alto em relação aos demais
#testar_kernels(kernels)

O kernel rbf foi o que teve o menor RMSE no treino e o parâmetro c = 1

In [None]:
model_svr = svm.SVR(kernel='rbf', C = 1)
model_svr.fit(X_train_2006, y_train_2006)

In [None]:
r2_treino_svr = model_svr.score(X_train_2006, y_train_2006)
print("SVR - R2 - Treino: {}".format(r2_treino_svr)) 

In [None]:
r2_teste_svr = model_svr.score(X_test_2010, y_test_2010)
print("SVR - R2 - Teste: {}".format(r2_teste_svr)) 

In [None]:
rmse_cross_treino_svr = -cross_val_score(model_svr, X_train_2006, y_train_2006, scoring = "neg_mean_squared_error", cv = 5 )
print("SVR - RMSE - Cross validation - Treino: {} ".format(rmse_cross_treino_svr.mean()))

In [None]:
rmse_cross_teste_svr = -cross_val_score(model_svr, X_test_2010, y_test_2010, scoring = "neg_mean_squared_error", cv = 5 )
print("SVR - RMSE - Cross validation - Treino: {} ".format(rmse_cross_teste_svr.mean()))

In [None]:
# Add the SVR metrics as a new row of the summary table.
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat, which also works on older pandas versions.
svr_row = {
    "modelo": "svr",
    "r2_treino": r2_treino_svr,
    "r2_teste": r2_teste_svr,
    "rmse_cross_treino": rmse_cross_treino_svr.mean(),
    "rmse_cross_treino_std": rmse_cross_treino_svr.std(),
    "rmse_cross_teste": rmse_cross_teste_svr.mean(),
    "rmse_cross_teste_std": rmse_cross_teste_svr.std(),
}
df_resultado = pd.concat([df_resultado, pd.DataFrame([svr_row])], ignore_index=True)

In [None]:
df_resultado

In [None]:
y_pred_svr = model_svr.predict(X_test_2010)

In [None]:
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":model_svr.predict(X_train_2006), "true": y_train_2006})
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x = "preds", y = "residuals", kind = "scatter", title="SVR - Residuals x preds")

### 4 Modelo: RandomForestRegressor

In [None]:
model_random = RandomForestRegressor(n_estimators = 50, random_state = 43, max_depth = 8)
model_random.fit(X_train_2006, y_train_2006)

In [None]:
r2_treino_random = model_random.score(X_train_2006, y_train_2006)
print("RandoForest - R2 - Treino: {}".format(r2_treino_random))

In [None]:
r2_teste_random = model_random.score(X_test_2010, y_test_2010)
print("RandoForest - RMSE - Teste: {}".format(r2_teste_random))

In [None]:
rmse_cross_treino_random = -cross_val_score(model_random, X_train_2006, y_train_2006, scoring = "neg_mean_squared_error", cv = 5 )
print("RandoForest - RMSE - Cross - Treino: {}".format(rmse_cross_treino_random.mean()))

In [None]:
rmse_cross_teste_random = -cross_val_score(model_random, X_test_2010, y_test_2010, scoring = "neg_mean_squared_error", cv = 5 )
print("RandoForest - RMSE - Cross - Teste: {}".format(rmse_cross_teste_random.mean()))

In [None]:
# Add the random forest metrics as a new row of the summary table.
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat, which also works on older pandas versions.
random_row = {
    "modelo": "random",
    "r2_treino": r2_treino_random,
    "r2_teste": r2_teste_random,
    "rmse_cross_treino": rmse_cross_treino_random.mean(),
    "rmse_cross_treino_std": rmse_cross_treino_random.std(),
    "rmse_cross_teste": rmse_cross_teste_random.mean(),
    "rmse_cross_teste_std": rmse_cross_teste_random.std(),
}
df_resultado = pd.concat([df_resultado, pd.DataFrame([random_row])], ignore_index=True)

In [None]:
df_resultado

In [None]:
y_pred_random = model_random.predict(X_test_2010) 

In [None]:
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":model_random.predict(X_train_2006), "true": y_train_2006})
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x = "preds", y = "residuals", kind = "scatter")

### 5 Modelo: KNN 

In [None]:
# K-nearest-neighbours regressor with 5 neighbours, fitted on the 2006 rows.
model_knn = KNeighborsRegressor(n_neighbors=5)  
model_knn.fit(X_train_2006, y_train_2006)  

In [None]:
# Score of the fitted model on the rows it was trained on.
r2_treino_knn = model_knn.score(X_train_2006, y_train_2006)
print("KNN - R2 - Treino: {} ".format(r2_treino_knn))

In [None]:
# Score of the fitted model on the 2010 rows.
r2_teste_knn = model_knn.score(X_test_2010, y_test_2010)
print("KNN - R2 - Teste: {} ".format(r2_teste_knn))

In [None]:
# 5-fold cross-validation on the 2006 rows.
# NOTE(review): no square root is taken, so these are mean squared errors
# even though the variable name says rmse (the label spells it "RSME").
rmse_cross_treino_knn = -cross_val_score(model_knn, X_train_2006, y_train_2006, scoring = "neg_mean_squared_error", cv = 5 )
print("KNN - RSME - Cross - Treino: {} ".format(rmse_cross_treino_knn.mean()))

In [None]:
# 5-fold cross-validation on the 2010 rows (same MSE-vs-RMSE caveat).
rmse_cross_teste_knn = -cross_val_score(model_knn, X_test_2010, y_test_2010, scoring = "neg_mean_squared_error", cv = 5 )
print("KNN - RSME - Cross - Teste: {} ".format(rmse_cross_teste_knn.mean()))

In [None]:
# Add the KNN metrics as a new row of the summary table.
# DataFrame.append was removed in pandas 2.0, so the row is added with
# pd.concat, which also works on older pandas versions.
knn_row = {
    "modelo": "knn",
    "r2_treino": r2_treino_knn,
    "r2_teste": r2_teste_knn,
    "rmse_cross_treino": rmse_cross_treino_knn.mean(),
    "rmse_cross_treino_std": rmse_cross_treino_knn.std(),
    "rmse_cross_teste": rmse_cross_teste_knn.mean(),
    "rmse_cross_teste_std": rmse_cross_teste_knn.std(),
}
df_resultado = pd.concat([df_resultado, pd.DataFrame([knn_row])], ignore_index=True)

In [None]:
df_resultado

In [None]:
# Residuals (true - predicted) of the KNN model on the training rows.
matplotlib.rcParams["figure.figsize"] = (6.0, 6.0)
fitted_knn = model_knn.predict(X_train_2006)
preds = pd.DataFrame({"preds": fitted_knn, "true": y_train_2006})
preds["residuals"] = preds["true"].sub(preds["preds"])
preds.plot.scatter(x="preds", y="residuals")

## Análise dos Resultados

In [None]:
df_resultado

Foram avaliados 5 modelos: Ridge, Lasso, SVR, RandomForest e KNN. Analisando o RMSE de treino e de teste, os melhores modelos são o SVR (0.90) e o RandomForest (0.89) no treino e, no teste, o SVR (0.96) e o RandomForest (0.95).

# Re-treino do Modelo SVR com todos os dados (2006 e 2010) e validação com 2014

In [None]:
model_svr_2014 = svm.SVR(kernel='rbf', C = 1)
model_svr_2014.fit(X_train_2006_2010, y_train_2006_2010)

In [None]:
r2_treino_svr_2014 = model_svr_2014.score(X_train_2006_2010, y_train_2006_2010)
print("R2 - SVR - Treino: {}".format(r2_treino_svr_2014))

In [None]:
r2_test_svr_2014 = model_svr_2014.score(X_test_2014, y_test_2014)
print("R2 - SVR - Teste: {}".format(r2_test_svr_2014))

In [None]:
rmse_cross_treino_svr_2014 = -cross_val_score(model_svr_2014, X_train_2006_2010, y_train_2006_2010, scoring = "neg_mean_squared_error", cv = 5 )
print("SVR - RMSE - Cross validation - Treino: {} ".format(rmse_cross_treino_svr_2014.mean()))

In [None]:
rmse_cross_teste_svr_2014 = -cross_val_score(model_svr_2014, X_test_2014, y_test_2014, scoring = "neg_mean_squared_error", cv = 5 )
print("KNN - RSME - Cross - Teste: {} ".format(rmse_cross_teste_svr_2014.mean()))

In [None]:
# Add the metrics of the SVR re-fitted on 2006+2010 as a new summary row.
# The row now uses the *_svr_2014 values: it previously referenced the
# undefined names rmse_treino_svr / rmse_teste_svr (NameError) and the
# cross-validation scores of the 2006-only SVR.
# DataFrame.append was removed in pandas 2.0, hence pd.concat.
svr_2014_row = {
    "modelo": "svr_2014",
    "r2_treino": r2_treino_svr_2014,
    "r2_teste": r2_test_svr_2014,
    "rmse_cross_treino": rmse_cross_treino_svr_2014.mean(),
    "rmse_cross_treino_std": rmse_cross_treino_svr_2014.std(),
    "rmse_cross_teste": rmse_cross_teste_svr_2014.mean(),
    "rmse_cross_teste_std": rmse_cross_teste_svr_2014.std(),
}
df_resultado = pd.concat([df_resultado, pd.DataFrame([svr_2014_row])], ignore_index=True)

In [None]:
df_resultado

In [None]:
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds":model_svr.predict(X_train_2006_2010), "true": y_train_2006_2010})
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x = "preds", y = "residuals", kind = "scatter")

In [None]:
y_pred_svr_model = model_svr.predict(X_test_2014)

In [None]:
solution = pd.DataFrame({"nome": X_test_2014.index,"votos preditos":np.expm1(y_pred_svr_model).astype(int)})
solution.to_csv("pred_2014.csv")

In [None]:
solution = pd.DataFrame({"nome": X_test_2014.index,"votos":np.expm1(y_test_2014).astype(int)})
solution.to_csv("real_2014.csv")

In [None]:
solution = pd.DataFrame({"nome": y_test_2014.index,"votos preditos":np.expm1(y_pred_svr_model).astype(int),"votos real":np.expm1(y_test_2014).astype(int)})
solution.to_csv("pred_e_real_2014.csv")