<a href="https://colab.research.google.com/github/bmnds/uea-data-science-03-statistics/blob/main/%5BEstat%C3%ADstica04%5D_Trabalho_Final_Bruno.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Trabalho Final da Disciplina de Estatística do Curso de Pós-Graduação em Ciência de Dados da UEA-AM
**Grupo:** Alberto, Bruno, Lelson e Levi

## O Trabalho
* Definir um tema e os objetivos a serem analisados 
* Escolher um data frame com no mínimo 25 linhas e cinco campos, dos quais pelo menos dois devem ser numéricos
* Realizar um levantamento amostral atendendo a todos os critérios de avaliação

## A Avaliação
1. [ ] Construir tabelas e gráficos apropriados para cada variável do data frame
2. [ ] Calcular a média, mediana, moda, desvio-padrão, coeficiente de variação
e simetria das variáveis quantitativas
3. [ ] Analisar a normalidade dos dados numéricos e verificar se há
outliers
4. [ ] Comparar as variáveis quantitativas e descrever qual é mais homogênea
5. [ ] Calcular os respectivos Intervalos de Confiança ao nível de 95% para cada
campo do data frame
6. [ ] Comparar a média de um campo numérico em relação a um campo
categórico e responder se existe diferença estatística ao nível de 5% de significância
7. [ ] Realizar um cruzamento de dois campos categóricos e responder se existe
diferença estatística ao nível de 5% de significância.


In [1]:
# Pacotes necessários
from six.moves import urllib
from scipy import stats
from scipy.stats import binom, nbinom, poisson, uniform, expon, norm 
import pandas as pd
import statsmodels.stats.proportion as smp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sea
import statistics as st

  import pandas.util.testing as tm


In [2]:
# Pin NumPy's global random seed so repeated runs of the notebook give the same results
np.random.seed(seed=20201109)

# Dataset das Eleições Presidenciais dos Estados Unidos 2020
Fontes dos dados:
* https://www.kaggle.com/daithibhard/us-electoral-college-votes-per-state-17882020
* https://www.kaggle.com/unanimad/us-election-2020

Explicação sobre o funcionamento do processo eleitoral dos EUA:
* https://www.usa.gov/election

Sugestões de análises:
* https://www.nbcnews.com/politics/2020-elections/president-results
* https://www.kaggle.com/amitkumarmanjhi/us-election-2020-result-analysis

In [3]:
# Download the dataset archive into the working directory as 'us_elections.tar.gz'
urllib.request.urlretrieve('https://drive.google.com/uc?export=download&id=1I4Hvc3wEsYuRNkinkAyY7LqcpVOLF9-O','us_elections.tar.gz')
# IPython shell escapes (not plain Python): unpack the archive verbosely, then delete it
!tar -zxvf us_elections.tar.gz
!rm us_elections.tar.gz

us_elections/
us_elections/Electoral_College.csv
us_elections/fontes.txt
us_elections/governors_county.csv
us_elections/governors_county_candidate.csv
us_elections/governors_state.csv
us_elections/house_county.csv
us_elections/house_county_candidate.csv
us_elections/house_state.csv
us_elections/president_county.csv
us_elections/president_county_candidate.csv
us_elections/president_state.csv
us_elections/senate_county.csv
us_elections/senate_county_candidate.csv
us_elections/senate_state.csv


# Bruno

In [4]:
# In the US, each state has a 'weight' in the presidential vote given by its
# number of Electoral College electors. The file's own header row is replaced
# by the names below as the data is loaded.
db_electoral = pd.read_csv(
    'us_elections/Electoral_College.csv',
    header=0,
    names=['year', 'state', 'electoral_votes'],
)
db_electoral
# Idea kept for later: each state's share (in percent) of the electoral votes of its year.
# NOTE(review): the line below still uses the column names from before the rename (Year/Votes).
#db_electoral['Pct'] = (100* db_electoral.Votes / db_electoral.groupby('Year').Votes.transform(sum))

Unnamed: 0,year,state,electoral_votes
0,1788,Alabama,
1,1792,Alabama,
2,1796,Alabama,
3,1800,Alabama,
4,1804,Alabama,
...,...,...,...
3004,2004,Wyoming,3.0
3005,2008,Wyoming,3.0
3006,2012,Wyoming,3.0
3007,2016,Wyoming,3.0


In [5]:
# Standardize the District of Columbia name on row label 471.
# NOTE(review): presumably row 471 is the D.C. entry for 2020; only this single
# row is renamed, so the hard-coded label should be confirmed against the CSV.
db_electoral.at[471, 'state'] = 'District of Columbia'
# Only the 2020 values are of interest: keep those rows, ordered by state,
# and renumber the index from zero.
db_electoral_2020 = (
    db_electoral.loc[db_electoral['year'] == 2020]
    .sort_values(by='state')
    .reset_index(drop=True)
)
db_electoral_2020.head()

Unnamed: 0,year,state,electoral_votes
0,2020,Alabama,9.0
1,2020,Alaska,3.0
2,2020,Arizona,11.0
3,2020,Arkansas,6.0
4,2020,California,55.0


In [6]:
# Presidential results with one row per (state, county, candidate), including party and votes
db_president_county_candidate = pd.read_csv(
    filepath_or_buffer='us_elections/president_county_candidate.csv',
)
db_president_county_candidate

Unnamed: 0,state,county,candidate,party,votes
0,Delaware,Kent County,Joe Biden,DEM,44518
1,Delaware,Kent County,Donald Trump,REP,40976
2,Delaware,Kent County,Jo Jorgensen,LIB,1044
3,Delaware,Kent County,Howie Hawkins,GRN,420
4,Delaware,Kent County,Write-ins,WRI,0
...,...,...,...,...,...
31162,Arizona,Maricopa County,Donald Trump,REP,980494
31163,Arizona,Maricopa County,Jo Jorgensen,LIB,31069
31164,Arizona,Mohave County,Donald Trump,REP,77724
31165,Arizona,Mohave County,Joe Biden,DEM,24687


In [7]:
# State-level totals: discard the file's 'electoral_vote' column and give the
# two remaining columns the names used throughout the rest of the notebook.
db_president_state = (
    pd.read_csv('us_elections/president_state.csv')
    .drop(columns='electoral_vote')
)
db_president_state.columns = ['state', 'total_votes']
db_president_state.head()

Unnamed: 0,state,total_votes
0,Delaware,502384
1,District of Columbia,279152
2,Florida,11075706
3,Georgia,4983735
4,Hawaii,573854


In [8]:
# Attach the 2020 electoral votes to each state's total votes. The left join
# keeps every state from db_president_state; 'year' is dropped after the merge.
db_state_electoral = (
    db_president_state
    .merge(db_electoral_2020, how='left', on='state')
    .drop(columns='year')
)
db_state_electoral.head()

Unnamed: 0,state,total_votes,electoral_votes
0,Delaware,502384,3.0
1,District of Columbia,279152,3.0
2,Florida,11075706,29.0
3,Georgia,4983735,16.0
4,Hawaii,573854,4.0


In [16]:
# Find the most-voted candidate in each state.
# Steps: add up county votes per (state, candidate, party), order each state's
# candidates by votes (highest first), then keep the first row of every state.
# The 'votes' column is selected explicitly: the positional argument of
# GroupBy.sum() is `numeric_only`, not a column name, so `.sum('votes')` only
# worked because a non-empty string is truthy.
db_top_voted_president_state = (
    db_president_county_candidate
    .groupby(['state', 'candidate', 'party'], as_index=False)['votes']
    .sum()
    .sort_values(by=['state', 'votes'], ascending=[True, False])
    .groupby('state')
    .head(1)
    .reset_index(drop=True)
)
db_top_voted_president_state.head()

Unnamed: 0,state,candidate,party,votes
0,Alabama,Donald Trump,REP,1434159
1,Alaska,Donald Trump,REP,80999
2,Arizona,Joe Biden,DEM,1643664
3,Arkansas,Donald Trump,REP,761251
4,California,Joe Biden,DEM,9315259


In [17]:
# Combine each state's top-voted candidate with the state's total votes and
# 2020 electoral votes (left join on 'state').
db_top_voted = db_top_voted_president_state.merge(
    db_state_electoral,
    how='left',
    on='state',
)
# 'majority' is the top candidate's share of all votes cast in the state
db_top_voted['majority'] = db_top_voted['votes'].div(db_top_voted['total_votes'])
db_top_voted.head()

Unnamed: 0,state,candidate,party,votes,total_votes,electoral_votes,majority
0,Alabama,Donald Trump,REP,1434159,2309900,9.0,0.620875
1,Alaska,Donald Trump,REP,80999,172031,3.0,0.47084
2,Arizona,Joe Biden,DEM,1643664,3322535,11.0,0.494702
3,Arkansas,Donald Trump,REP,761251,1216818,6.0,0.625608
4,California,Joe Biden,DEM,9315259,14414296,55.0,0.646251


In [22]:
# Electoral-vote tally per candidate, counting only states where the leading
# candidate's vote share is at least 0.501.
# NOTE(review): states whose leader is below that share are left out of the
# tally entirely; confirm the 0.501 cut-off is intended.
# Only 'votes' and 'electoral_votes' are summed. The previous
# `.sum('electoral_votes')` passed the string as `numeric_only`, summed every
# numeric column, and then deleted the unwanted ones.
db_president_results = (
    db_top_voted.loc[db_top_voted['majority'] >= 0.501]
    .groupby(['candidate', 'party'])[['votes', 'electoral_votes']]
    .sum()
    .sort_values(by='electoral_votes', ascending=False)
    .reset_index()
)
db_president_results

Unnamed: 0,candidate,party,votes,electoral_votes
0,Joe Biden,DEM,39434043,249.0
1,Donald Trump,REP,32869168,214.0


In [None]:
# Column sums over the states whose leading candidate holds more than half of the votes.
# NOTE(review): this value is not the cell's last expression, so the notebook does not display it.
db_top_voted.loc[db_top_voted['majority'] > 0.5].sum()
# States whose leading candidate holds at most half of the votes (displayed as the cell output)
db_top_voted.loc[db_top_voted['majority'] <= 0.5]

In [None]:
# Total votes per state, summed over all counties and candidates.
# 'votes' is selected explicitly because GroupBy.sum()'s positional argument is
# `numeric_only`, not a column name.
db_president_county_candidate.groupby('state')[['votes']].sum()