# CEDE Preprocessing
The data from CEDE is collected and filtered so that it can be joined to the main table.

### TO DO: 
* on final table rename columns so they make sense

In [1]:
import pandas as pd
import numpy as np

In [2]:
# read the raw CEDE panels (Stata .dta files) from the "RAW DATA" folder
#   forward slashes are used on purpose: a backslash followed by "P" is an
#   invalid escape sequence in a normal string literal, and "/" is accepted
#   as a path separator on Windows as well as on Linux/macOS
agriculture = pd.read_stata('RAW DATA/PANEL_AGRICULTURA_Y_TIERRA(2023).dta')
violence = pd.read_stata('RAW DATA/PANEL_CONFLICTO_Y_VIOLENCIA(2022).dta')
chars = pd.read_stata('RAW DATA/PANEL_CARACTERISTICAS_GENERALES(2022).dta')
gov = pd.read_stata('RAW DATA/PANEL_BUEN_GOBIERNO(2023).dta')

## 1. Coffee cultivation & production
* Only coverage 2007-2022
* Can one assume NaN at this level is 0?

In [3]:
# coffee table: municipality code, year, and the two coffee columns,
# without the rows where the municipality or the year is missing
coffee = (
    agriculture[['codmpio', 'ano', 'ac_cafe', 'p_cafe']]
    .dropna(subset=['codmpio', 'ano'])
)


In [4]:
# number of non-missing values per column, year by year
coffee.groupby('ano').count()

Unnamed: 0_level_0,codmpio,ac_cafe,p_cafe
ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2003,1106,0,0
2004,1106,0,0
2005,1106,0,0
2006,1106,0,0
2007,1110,591,591
2008,1110,598,598
2009,1110,610,610
2010,1107,618,618
2011,1107,625,625
2012,1109,600,600


In [5]:
# the coffee columns are empty before 2007, so only later years are kept;
# every remaining missing value is then set to 0
#   TO DO: Check if filling with 0 makes sense
coffee = coffee.loc[coffee['ano'] > 2006].fillna(0)

In [6]:
# preview the first five rows of the coffee table
coffee.head()

Unnamed: 0,codmpio,ano,ac_cafe,p_cafe
4,5001.0,2007,1027.0,821.599976
5,5001.0,2008,1027.0,821.599976
6,5001.0,2009,1027.0,800.0
7,5001.0,2010,1078.0,862.400024
8,5001.0,2011,1058.0,846.400024


## 2. Coca cultivation
### TO DO:
* Validate with my data + with Dube & Vargas so that one gets as much coverage as possible
* Find out what is each variable --> For now H_coca

In [7]:
# all coca-related columns of the violence panel, plus the merge keys
coca_columns = [
    'codmpio', 'ano',
    'coca', 'H_Coca_menor3', 'H_Coca_mayor3',
    'H_coca', 'lotes_coca', 'P_Coca_menor3', 'P_Coca_mayor3',
]
coca = violence[coca_columns]

In [8]:
# number of non-missing values per column, year by year
coca.groupby('ano').count()

Unnamed: 0_level_0,codmpio,coca,H_Coca_menor3,H_Coca_mayor3,H_coca,lotes_coca,P_Coca_menor3,P_Coca_mayor3
ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1993.0,1124,0,0,0,0,0,0,0
1994.0,1123,0,0,0,0,0,0,0
1995.0,1125,0,0,0,0,0,0,0
1996.0,1124,0,0,0,0,0,0,0
1997.0,1124,0,0,0,0,0,0,0
1998.0,1123,0,0,0,0,0,0,0
1999.0,1124,91,0,0,91,0,0,0
2000.0,1124,178,0,0,178,191,0,0
2001.0,1125,164,160,134,164,166,160,134
2002.0,1125,168,165,125,168,169,165,125


In [9]:
# 1) drop rows without a municipality code or a year
# 2) keep the merge keys and H_coca only
#      TO DO: See if col choice makes sense
# 3) set the remaining missing values to 0
#      TO DO: Check if this makes sense
coca = (
    coca
    .dropna(subset=['codmpio', 'ano'])
    [['codmpio', 'ano', 'H_coca']]
    .fillna(0)
)


In [10]:
# preview the first five rows of the coca table
coca.head()

Unnamed: 0,codmpio,ano,H_coca
0,5107.0,2021.0,363.09
1,5120.0,2021.0,1869.62
2,5134.0,2021.0,326.09
3,5234.0,2021.0,0.0
4,5250.0,2021.0,1152.82


## 3. Population 
This information was imputed since there is no registration system as in Germany.

In [11]:
# population table: municipality code, year and total population,
# without the rows where the municipality or the year is missing
pop = (
    chars[['codmpio', 'ano', 'pobl_tot']]
    .dropna(subset=['codmpio', 'ano'])
)


In [12]:
# number of non-missing values per column, year by year
#   all seem to be complete
pop.groupby('ano').count()

Unnamed: 0_level_0,codmpio,pobl_tot
ano,Unnamed: 1_level_1,Unnamed: 2_level_1
1993.0,1122,1122
1994.0,1122,1122
1995.0,1122,1122
1996.0,1122,1122
1997.0,1122,1122
1998.0,1122,1122
1999.0,1122,1122
2000.0,1122,1122
2001.0,1122,1122
2002.0,1122,1122


In [13]:
# a population of 0 is treated as a missing value
#   only the population column is touched, so a 0 in the merge keys
#   (codmpio / ano) can never be turned into NaN by this step
pop['pobl_tot'] = pop['pobl_tot'].replace(0, np.nan)

In [14]:
# list the rows whose total population is missing
pop.loc[pop['pobl_tot'].isna()]

Unnamed: 0,codmpio,ano,pobl_tot
1856,5390.0,1993.0,
1857,5390.0,1994.0,
1858,5390.0,1995.0,
1859,5390.0,1996.0,
4379,13030.0,1993.0,
...,...,...,...
31755,91536.0,1993.0,
31756,91536.0,1994.0,
31757,91536.0,1995.0,
31758,91536.0,1996.0,


In [15]:
# lpop: natural log of (total population + 1) divided by one million
#   the +1 guards against taking the log of 0
pop = pop.assign(lpop=np.log((pop['pobl_tot'] + 1) / 1_000_000))

In [16]:
# preview the first five rows of the population table
pop.head()

Unnamed: 0,codmpio,ano,pobl_tot,lpop
0,5001.0,1993.0,1793491.0,0.584165
1,5001.0,1994.0,1832197.0,0.605516
2,5001.0,1995.0,1872241.0,0.627137
3,5001.0,1996.0,1912860.0,0.6486
4,5001.0,1997.0,1953293.0,0.669517


## 4. Capital Revenue 
Unclear if this is helpful. Not helpful in case we use TerriData.

### TO DO: 
* Get documentation of CEDE

In [17]:
# government table: merge keys plus the two royalties ("regalias") columns,
# without the rows where the municipality or the year is missing
gov = (
    gov[['codmpio', 'ano', 'y_cap_regalias', 'IGA_regalias']]
    .dropna(subset=['codmpio', 'ano'])
)

In [18]:
# number of non-missing values per column, year by year
#   y cap regalias seems better but still unclear if this is what we need
gov.groupby('ano').count()

Unnamed: 0_level_0,codmpio,y_cap_regalias,IGA_regalias
ano,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1984.0,1098,1098,0
1985.0,1098,1098,0
1986.0,1098,1098,0
1987.0,1098,1098,0
1988.0,1098,1098,0
1989.0,1098,1098,0
1990.0,1098,1098,0
1991.0,1098,1098,0
1992.0,1098,1098,0
1993.0,1122,1098,0


In [19]:
# keep the chosen column (y_cap_regalias) next to the merge keys and
# set the remaining missing values to 0
#   TBD if the column choice makes sense
#   TO DO: Check if filling with 0 makes sense
gov = gov[['codmpio', 'ano', 'y_cap_regalias']].fillna(0)

In [20]:
# preview the first five rows of the government table
gov.head()

Unnamed: 0,codmpio,ano,y_cap_regalias
0,5001.0,1984.0,0.0
1,5001.0,1985.0,0.0
2,5001.0,1986.0,0.0
3,5001.0,1987.0,0.0
4,5001.0,1988.0,0.0


# JOIN TABLES

In [21]:
# outer-join coca, population and government data onto the coffee table,
# one after the other, on municipality code and year
#   validate='1:1' makes pandas raise if a key pair is duplicated on either side
cede = coffee
for table in (coca, pop, gov):
    cede = cede.merge(table, how='outer', on=['codmpio', 'ano'], validate='1:1')

In [22]:
# give the two key columns their final names
rename_dict = dict(codmpio='muncode', ano='year')
cede = cede.rename(columns=rename_dict)
cede

Unnamed: 0,muncode,year,ac_cafe,p_cafe,H_coca,pobl_tot,lpop,y_cap_regalias
0,5001.0,2007.0,1027.0,821.599976,0.0,2265244.0,0.817683,33.031
1,5001.0,2008.0,1027.0,821.599976,0.0,2291378.0,0.829154,50.457
2,5001.0,2009.0,1027.0,800.000000,0.0,2317336.0,0.840419,126.854
3,5001.0,2010.0,1078.0,862.400024,0.0,2343049.0,0.851453,47.348
4,5001.0,2011.0,1058.0,846.400024,0.0,2368282.0,0.862165,50.911
...,...,...,...,...,...,...,...,...
43605,99773.0,1988.0,,,,,,0.000
43606,99773.0,1989.0,,,,,,0.000
43607,99773.0,1990.0,,,,,,0.000
43608,99773.0,1991.0,,,,,,0.000


In [23]:
# write the joined table to disk; the row index is written as the first column
cede.to_csv('CEDE_data.csv', index=True)