## Read the files 2017_german_election_overall.csv and 2017_german_election_party.csv from the german-election-2017 dataset.

In [None]:
# Importing Pandas as pd
import pandas as pd

To write the correct path to the dataset, I first need to check which files are in the current directory.

In [None]:
# IPython shell escape: list the files in the current working directory.
!ls

In [None]:
# Load the overall-results CSV; the relative path assumes the file sits in
# the current working directory.
df_overall = pd.read_csv("2017_german_election_overall.csv")

In [None]:
# Load the per-party results CSV; the relative path assumes the file sits in
# the current working directory.
df_party = pd.read_csv("2017_german_election_party.csv")

In [None]:
# List the column names (attributes) of the overall-results dataframe
df_overall.columns

In [None]:
# List the column names (attributes) of the per-party dataframe
df_party.columns

In [None]:
# Count the missing values in the 'invalid_first_votes' column
# (only this one column is checked, not the whole dataframe).
df_overall['invalid_first_votes'].isna().sum()

In [None]:
# Preview the first five rows of the overall-results dataframe
df_overall.head(n=5)

In [None]:
# Preview the first five rows of the per-party dataframe
df_party.head(n=5)

## For each area, compute the percentage of the voters over the registered voters

In [None]:
# Add a new column named 'Voters_Percentages_Areas' that holds, row by row,
# the total votes as a percentage of the registered voters (2 decimals).
# Do not reason as "for each row, look up and compute" (the typical plain
# Python approach): the arithmetic is applied to whole columns at once.
df_overall['Voters_Percentages_Areas'] = (
    df_overall['total_votes']
    .div(df_overall['registered.voters'])
    .mul(100)
    .round(2)
)

Show only the relevant columns

In [None]:
# Keep every row, but only the three columns relevant to this exercise
df_overall.loc[:, ['area_names', 'state', 'Voters_Percentages_Areas']]

## For each state, compute the total number of registered voters

Since we do not know (yet) how to group rows, but we know how to select some rows, we can compute the set of all states, then for each state we can compute the total number of voters.

Hence we start by computing the states

In [None]:
# Build the list of distinct states.
# The list is sorted so that its order is reproducible: the iteration order
# of a plain set of strings can change from one interpreter run to the next,
# which would shuffle every dict and printout derived from `states`.
# Missing values are dropped first: they cannot be sorted together with
# strings and would never match a row in an equality filter anyway.
states = sorted(df_overall['state'].dropna().unique())
states

Then we compute the total number of votes cast in each state (not needed for now, but it will be useful later on)

In [None]:
# Total votes per state: for every state, select only its rows and add up
# their 'total_votes' values.
votes = {
    state: sum(df_overall.loc[df_overall['state'] == state, 'total_votes'])
    for state in states
}
votes

Then we build a dict `registered` where for each state we compute the number of registered voters

In [None]:
# Registered voters per state. `states` holds each state exactly once, so
# every key is written a single time and no membership check is needed.
registered = {
    state: sum(df_overall.loc[df_overall['state'] == state, 'registered.voters'])
    for state in states
}
registered

## How many registered voters are there in Bayern or Saarland? (Compute the voters in each state and the sum of the two numbers.)

Since we already have the relevant data in the `registered` dict, we can simply sum those values 

In [None]:
# Add up the registered voters already stored for the two states
sum(registered[state] for state in ('Bayern', 'Saarland'))

Now we will solve again the exercise, without using the dict.
Instead we will select the rows of each state.

First we check if we can select the rows regarding Bayern

In [None]:
# Select only the rows whose state is Bayern
df_overall.loc[df_overall['state'] == 'Bayern']

Then we sum the registered voters. Once the overall procedure is completed, we can (and should) delete the previous cell.

In [None]:
# Registered voters in Bayern: filter the rows and pick the column in one step
df_overall.loc[df_overall['state'] == 'Bayern', 'registered.voters'].sum()

Finally we solve the exercise

In [None]:
# Registered voters in Bayern plus those in Saarland: compute each state's
# subtotal directly from its rows, then add the subtotals together.
somma = sum(
    df_overall.loc[df_overall['state'] == state, 'registered.voters'].sum()
    for state in ['Bayern', 'Saarland']
)

somma

## For each state, compute the number of votes (first vote) for each party

In this case, we also need the list of parties.

In [None]:
# Build the list of distinct parties.
# The list is sorted so that its order is reproducible: the iteration order
# of a plain set of strings can change from one interpreter run to the next.
# Missing values are dropped first: they cannot be sorted together with
# strings and would never match a row in an equality filter anyway.
parties = sorted(df_party['party'].dropna().unique())

In [None]:
# First-vote total for every (state, party) pair, keyed by that tuple.
# States form the outer loop and parties the inner one.
votes_party = {
    (state, party): df_party.loc[
        (df_party['state'] == state) & (df_party['party'] == party),
        'votes_first_vote',
    ].sum()
    for state in states
    for party in parties
}

votes_party

#### Same results either with "set(df_party['party'])" or "unique()":

In [None]:
# Same (state, party) -> first-vote total mapping, but taking the distinct
# states and parties straight from df_party with unique().
votes_party = {
    (state, party): df_party.loc[
        (df_party['state'] == state) & (df_party['party'] == party),
        'votes_first_vote',
    ].sum()
    for state in df_party['state'].unique()
    for party in df_party['party'].unique()
}

votes_party

The votes for each party are stored in the `df_party` dataframe.
Again, we do not know how to group rows yet, hence we have to select the rows for each party and each state.

### For each state and each party, compute the area where the party has taken most total votes

In [None]:
# For every (state, party) pair, print the highest first-vote count and the
# area(s) where the party reached it.
for state in states:
    for party in parties:
        # Rows of this party in this state. Deliberately not named `votes`,
        # so the per-state `votes` dict built earlier is not overwritten.
        party_rows = df_party[(df_party['state'] == state) & (df_party['party'] == party)]
        if party_rows.empty:
            # No rows for this combination, so there is no maximum to report.
            continue
        max_votes = party_rows['votes_first_vote'].max()
        # Several areas may tie for the maximum, so keep all of them.
        area_votes = party_rows.loc[party_rows['votes_first_vote'] == max_votes, 'area_name']
        # Print: state, party, the maximum and the areas where it was reached
        print(state, party, max_votes, list(area_votes))