In [81]:
import numpy as np
import pandas as pd

In [82]:
#Vamos criar uma base de dados que armazena nas contas de que ano cada senador aparece.
dfSenators = pd.DataFrame(columns = ['SENATOR', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022'],
                         dtype = str)

In [83]:
dfSenadores

Unnamed: 0,SENATOR,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,ZEQUINHA MARINHO,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Ye


In [84]:
#Vamos passar pelas contas de cada ano e armazenar em que ano cada senadro aparece.
Years = ['2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022']
for year in Years:
    dfContas = pd.read_csv('../data/CEAPS_Data/despesa_ceaps_' + year + '.csv', skiprows = 0, header = 1, sep = ';', encoding = 'latin-1', decimal = ',')
    
    #Pega os nomes unicos dos senadores presentes nas contas deste ano
    Senadores = dfContas['SENADOR'].unique()
    
    #Verica cada nome se ele já se encontra na base; 
    #se já estiver, apenas marca que ele está presente no respectivo ano.
    #se não estiver, cria registra para ele, e marca que ele esta presente no respectivo ano.
    num_novos_senadores = 0
    for senador in Senadores:
        SenadoresExistentes = dfSenators['SENATOR'].values
        
        if senador in SenadoresExistentes:
            #Senador já foi incluido anteriormente
            dfSenators.loc[dfSenators['SENATOR'] == senador, year] = '1'
        else:
            #Senador apareceu pela primeira vez
            registro = np.full((len(Years)), '0')
            registro[int(year) - 2008] = '1'
            registro = np.concatenate(([senador], registro)).reshape(1,16)
            registro = pd.DataFrame(registro, columns = ['SENATOR', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022'])
            
            #adiciona novo senador
            dfSenators = pd.concat([registro, dfSenators], ignore_index = True, axis = 0)
            num_novos_senadores += 1
    print("Total number of senators in " + year + ": ", len(Senadores))
    print("Number of new senators in " + year + ": ", num_novos_senadores)
print("Total number of senators in data: ", dfSenators.shape[0])

Total number of senators in 2008:  85
Number of new senators in 2008:  85
Total number of senators in 2009:  85
Number of new senators in 2009:  7
Total number of senators in 2010:  89
Number of new senators in 2010:  9
Total number of senators in 2011:  117
Number of new senators in 2011:  53
Total number of senators in 2012:  90
Number of new senators in 2012:  9
Total number of senators in 2013:  84
Number of new senators in 2013:  1
Total number of senators in 2014:  88
Number of new senators in 2014:  5
Total number of senators in 2015:  101
Number of new senators in 2015:  29
Total number of senators in 2016:  93
Number of new senators in 2016:  8
Total number of senators in 2017:  88
Number of new senators in 2017:  5
Total number of senators in 2018:  91
Number of new senators in 2018:  7
Total number of senators in 2019:  127
Number of new senators in 2019:  49
Total number of senators in 2020:  84
Number of new senators in 2020:  4
Total number of senators in 2021:  86
Number

In [85]:
#Sort all names in alphabetic order.
dfSenators.sort_values('SENATOR', inplace = True)

In [86]:
pd.set_option('display.max_rows', None) #Show all lines
dfSenators

Unnamed: 0,SENATOR,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
192,ACIR GURGACZ,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1
277,ADA MELLO,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
276,ADELMIR SANTANA,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
71,AIRTON SANDOVAL,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
59,ALESSANDRO VIEIRA,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1
0,ALEXANDRE SILVEIRA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
185,ALFREDO COTAIT,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
184,ALFREDO NASCIMENTO,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0
275,ALMEIDA LIMA,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
274,ALOIZIO MERCADANTE,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0


In [87]:
#Concatenate all data in only one file
first = True
for year in Years:
    dfYear = pd.read_csv('../data/CEAPS_Data/despesa_ceaps_' + year + '.csv', skiprows = 0, header = 1, sep = ';', encoding = 'latin-1', decimal = ',')
    if first == True:
        dfAllData = dfYear
        first = False
    else:
        dfAllData = pd.concat([dfAllData, dfYear], ignore_index = True, axis = 0)

In [92]:
#Data about the year and month interval that we want to predict
target_year = 2018
initial_month = 1
final_month = 12

#Get the names of all senators that appear in target_year
Names = dfSenators.loc[dfSenators[str(target_year)] == '1', 'SENATOR'].values

In [None]:
#Train model to predict total expenditure in target_year between initial_month and final_month
#Only data prior to target_year, between initial_month and final_month, and from the senators in variable Names will be taken into account in training.
dfTrain
for year in Years:
    if year == target_year:
        break
    dfYear = pd.read_csv('../data/CEAPS_Data/despesa_ceaps_' + year + '.csv', skiprows = 0, header = 1, sep = ';', encoding = 'latin-1', decimal = ',')