# Data Preparation 
In this notebook the final table that will be used for the analysis is prepared. The preparation includes: 
* only taking rural municipalities
* Create interaction terms
* Create dummy variables

#### TO DO:
* Eventually drop missing values in the analysis part
* Clarify oil and coffee production

In [2]:
# import necessary modules 
import pandas as pd 
import numpy as np
from HelpFunctions import create_interaction

In [None]:
# Load the two data sources that are combined into the analysis table.

# Own data: conflict panel written by the preprocessing step
base = pd.read_csv('Preprocessed data/BaseConflictFilled.csv', index_col=0)

# Dube & Vargas replication data
DV = pd.read_stata("Dube & Vargas/origmun_violence_commodities.dta", convert_categoricals=False)

# Keep only the variables that are needed. `.copy()` makes DV an independent
# frame, so adding columns below does not trigger SettingWithCopyWarning.
DV = DV[['year', 'region', 'origmun', 'oilprod88', 'cofint', 'rainfall', 'temperature']].copy()

# Drop combined municipalities.
# Previously `origmun` was cast to int first and rows equal to -2147483648 were
# removed afterwards. That value is what a missing or out-of-range code becomes
# when it is forced into a 32-bit integer, so the filter only worked where the
# default int is 32-bit (and the cast emitted a runtime warning). Selecting the
# codes that fit into the 32-bit range *before* casting gives the same rows on
# every platform. NaN compares as False and is therefore dropped as well.
# NOTE(review): assumes the combined municipalities are exactly the rows with a
# missing/overflowing `origmun` — confirm against the Dube & Vargas codebook.
origmun_codes = DV['origmun']
fits_int32 = (origmun_codes > -(2 ** 31)) & (origmun_codes < 2 ** 31)
DV = DV.loc[fits_int32].copy()

# Integer municipality code used as merge key; the raw column is no longer needed.
DV['muncode'] = DV['origmun'].astype('int64')
DV = DV.drop(columns=['origmun'])

  return values.astype(dtype, copy=copy)


In [4]:
# List the columns of the base table to see which variables are available
# before renaming and merging.
base.columns

Index(['year', 'muncode', 'depcode', 'Nombre', 'munname', 'clashes',
       'govattacks', 'guerrattacks', 'parattacks', 'posdattacks', 'parmass',
       'guerrmass', 'posdmass', 'parsec', 'guerrsec', 'posdsec', 'n_parsec',
       'n_guerrsec', 'n_posdsec', 'causalities', 'posdsec_HRDAG',
       'parsec_HRDAG', 'guersec_HRDAG', 'ac_cafe', 'p_cafe', 'H_coca',
       'pobl_tot', '_prcmean', '_tempmean', 'regalias_cop',
       'prod_gravable_bls_kpc', 'oil_prod', 'oil_production', 'CapitalRevenue',
       'caprev_2006_percapita', 'caprev_2006', 'top3cof', 'pcoffee2006',
       'poil2006', 'coca99', 'linternalp', 'lop', 'lpop', 'ltop3cof',
       'lcaprev'],
      dtype='object')

In [5]:
# Harmonise column names across the two sources:
#  - Dube & Vargas variables get a `_dv` suffix so they stay distinguishable after the merge
#  - the own weather columns get readable names
dv_renames = {name: f'{name}_dv' for name in ('oilprod88', 'cofint', 'rainfall', 'temperature')}
DV.rename(columns=dv_renames, inplace=True)

weather_renames = {'_prcmean': 'rainfall', '_tempmean': 'temperature'}
base.rename(columns=weather_renames, inplace=True)

In [6]:
# Display the Dube & Vargas table (pandas truncates long frames) to check the
# renamed columns and the derived muncode values.
DV

Unnamed: 0,year,region,oilprod88_dv,cofint_dv,rainfall_dv,temperature_dv,muncode
0,1988.0,1.0,0.0,4.3209,2047.0,16.500,5002
1,1989.0,1.0,0.0,4.3209,2047.0,16.500,5002
2,1990.0,1.0,0.0,4.3209,2047.0,16.500,5002
3,1991.0,1.0,0.0,4.3209,2047.0,16.500,5002
4,1992.0,1.0,0.0,4.3209,2047.0,16.500,5002
...,...,...,...,...,...,...,...
17851,2001.0,2.0,0.0,0.0000,1900.0,26.975,1366713600
17852,2002.0,2.0,0.0,0.0000,1900.0,26.975,1366713600
17853,2003.0,2.0,0.0,0.0000,1900.0,26.975,1366713600
17854,2004.0,2.0,0.0,0.0000,1900.0,26.975,1366713600


In [7]:
# Only municipality-level information is taken from Dube & Vargas, so the
# panel is collapsed to one row per municipality by averaging over the years.
# NOTE(review): this also averages `region`; presumably it is constant within
# a municipality — confirm.
municipality_means = DV.groupby('muncode').mean()
municipality_means = municipality_means.drop(columns='year')
DV = municipality_means.reset_index()

## 1. Merge my data with Dube and Vargas 
First join the information to my dataset. Then include only the municipalities included in the study of Dube & Vargas.

In [8]:
# Join the municipality-level Dube & Vargas variables onto the panel and keep
# only the municipalities that appear in their study.
#
# An inner join does both at once: rows of `base` whose muncode is absent from
# DV are dropped, all others receive the DV columns, and the row order of
# `base` is preserved. This replaces the earlier left join followed by a second
# inner join on the de-duplicated muncode list, which produced the same table.
# `validate='m:1'` raises if muncode is not unique in DV, so no separate
# de-duplication step is needed.
base = base.merge(DV, how='inner', on=['muncode'], validate='m:1')

## 2. Create Interaction terms
### To Do: 
* Create interactions for oil and coffee production

In [9]:
# Summary statistics of both weather sources (own data vs. Dube & Vargas) and
# of ltop3cof, as a sanity check before building the interaction terms.
# NOTE(review): the own rainfall/temperature columns appear to be on a different
# scale than the *_dv columns and have fewer non-missing rows — confirm units.
base[['rainfall','temperature','rainfall_dv','temperature_dv','ltop3cof']].describe()

Unnamed: 0,rainfall,temperature,rainfall_dv,temperature_dv,ltop3cof
count,12675.0,12675.0,33150.0,33150.0,33150.0
mean,22152.570903,2935.679374,1866.675903,21.277052,20.981994
std,11289.483665,54.280205,987.432739,4.960412,3.141998
min,773.701757,2802.832219,160.0,3.9,3.12149
25%,14461.645466,2890.007044,1151.0,17.6,20.992356
50%,20006.007514,2936.594508,1637.0,21.6,21.513867
75%,27291.813574,2986.119706,2350.0,26.200001,21.943499
max,97082.722322,3036.3812,9200.0,28.900002,22.109754


In [10]:
# Rescale coffee production by 1/1000.
# NOTE(review): the unit of p_cafe is not stated here — confirm.
# Caution: this overwrites the column, so running the cell twice rescales twice.
base['p_cafe'] = base['p_cafe'].div(1000)

In [11]:
# `create_interaction` comes from HelpFunctions; judging by the resulting
# column names it adds a column called '<first>x<second>' to `base` in place
# (NOTE(review): confirm against HelpFunctions).

# Commodity intensity x price, for own data and for the Dube & Vargas
# measures, plus coca99 x year.
commodity_pairs = [
    ('p_cafe', 'linternalp'),
    ('oil_production', 'lop'),
    ('cofint_dv', 'linternalp'),
    ('oilprod88_dv', 'lop'),
    ('coca99', 'year'),
]
for first, second in commodity_pairs:
    create_interaction(base, first, second)

# Weather interactions with ltop3cof, first for the Dube & Vargas weather
# variables ('_dv' suffix), then for the own collected data (no suffix).
# The temperature x rainfall product is renamed to 'rt<suffix>' before it is
# interacted with ltop3cof as well.
for suffix in ('_dv', ''):
    rain_col = f'rainfall{suffix}'
    temp_col = f'temperature{suffix}'
    joint_col = f'rt{suffix}'

    create_interaction(base, rain_col, 'ltop3cof')
    create_interaction(base, temp_col, 'ltop3cof')
    create_interaction(base, temp_col, rain_col)
    base.rename(columns={f'{temp_col}x{rain_col}': joint_col}, inplace=True)
    create_interaction(base, joint_col, 'ltop3cof')




## 3. Create Dummy variables

In [12]:
# Region dummies and region x year terms.
# `region` holds float codes, so the dummy columns are named '_Rreg_1.0' ... '_Rreg_4.0'.
base = pd.get_dummies(base, columns=['region'], prefix='_Rreg')

region_ids = range(1, 5)
for region_id in region_ids:
    dummy_col = f'_Rreg_{region_id}.0'
    base[f'_RregXyear_{region_id}'] = base['year'].mul(base[dummy_col])

In [13]:
# Demobilization periods by year:
#   2005-2015       -> 'afterAUC'
#   2016 and later  -> 'AfterFARC'
#   everything else -> 'beforeDEM'
# np.select picks the first condition that holds, mirroring the order of the
# original if/else chain.
years = base['year']
period_conditions = [(years > 2004) & (years < 2016), years > 2015]
period_labels = ['afterAUC', 'AfterFARC']
base['period'] = np.select(period_conditions, period_labels, default='beforeDEM')

## 4. Prepare output file

In [14]:
# Each (year, muncode) pair should occur once. Show every row whose key is
# repeated; an empty frame means the panel key is unique.
key_is_repeated = base.duplicated(subset=['year', 'muncode'], keep=False)
base[key_is_repeated]

Unnamed: 0,year,muncode,depcode,Nombre,munname,clashes,govattacks,guerrattacks,parattacks,posdattacks,...,rtxltop3cof,_Rreg_1.0,_Rreg_2.0,_Rreg_3.0,_Rreg_4.0,_RregXyear_1,_RregXyear_2,_RregXyear_3,_RregXyear_4,period


In [15]:
# Remove the columns that are not carried into the analysis file, then list
# what remains.
# Caution: this cell is not re-runnable without reloading, because the columns
# are gone after the first run.
unused_columns = [
    'parsec', 'guerrsec', 'posdsec',
    'pobl_tot', 'regalias_cop',
    'CapitalRevenue', 'caprev_2006_percapita', 'caprev_2006',
    'pcoffee2006', 'poil2006',
]
base.drop(columns=unused_columns, inplace=True)
base.columns

Index(['year', 'muncode', 'depcode', 'Nombre', 'munname', 'clashes',
       'govattacks', 'guerrattacks', 'parattacks', 'posdattacks', 'parmass',
       'guerrmass', 'posdmass', 'n_parsec', 'n_guerrsec', 'n_posdsec',
       'causalities', 'posdsec_HRDAG', 'parsec_HRDAG', 'guersec_HRDAG',
       'ac_cafe', 'p_cafe', 'H_coca', 'rainfall', 'temperature',
       'prod_gravable_bls_kpc', 'oil_prod', 'oil_production', 'top3cof',
       'coca99', 'linternalp', 'lop', 'lpop', 'ltop3cof', 'lcaprev',
       'oilprod88_dv', 'cofint_dv', 'rainfall_dv', 'temperature_dv',
       'p_cafexlinternalp', 'oil_productionxlop', 'cofint_dvxlinternalp',
       'oilprod88_dvxlop', 'coca99xyear', 'rainfall_dvxltop3cof',
       'temperature_dvxltop3cof', 'rt_dv', 'rt_dvxltop3cof',
       'rainfallxltop3cof', 'temperaturexltop3cof', 'rt', 'rtxltop3cof',
       '_Rreg_1.0', '_Rreg_2.0', '_Rreg_3.0', '_Rreg_4.0', '_RregXyear_1',
       '_RregXyear_2', '_RregXyear_3', '_RregXyear_4', 'period'],
      dtype='object'

In [None]:
# Write the full analysis table to disk.
base.to_csv('Preprocessed data/BaseAnalysis.csv')

In [17]:
# Show all columns from here on (this changes the global pandas display
# setting), then count the non-missing values per variable and year to see
# where data coverage starts and ends.
pd.options.display.max_columns = None
yearly_counts = base.groupby('year').count()
yearly_counts

Unnamed: 0_level_0,muncode,depcode,Nombre,munname,clashes,govattacks,guerrattacks,parattacks,posdattacks,parmass,guerrmass,posdmass,n_parsec,n_guerrsec,n_posdsec,causalities,posdsec_HRDAG,parsec_HRDAG,guersec_HRDAG,ac_cafe,p_cafe,H_coca,rainfall,temperature,prod_gravable_bls_kpc,oil_prod,oil_production,top3cof,coca99,linternalp,lop,lpop,ltop3cof,lcaprev,oilprod88_dv,cofint_dv,rainfall_dv,temperature_dv,p_cafexlinternalp,oil_productionxlop,cofint_dvxlinternalp,oilprod88_dvxlop,coca99xyear,rainfall_dvxltop3cof,temperature_dvxltop3cof,rt_dv,rt_dvxltop3cof,rainfallxltop3cof,temperaturexltop3cof,rt,rtxltop3cof,_Rreg_1.0,_Rreg_2.0,_Rreg_3.0,_Rreg_4.0,_RregXyear_1,_RregXyear_2,_RregXyear_3,_RregXyear_4,period
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
1988,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,0,0,0,0,0,975,975,975,0,975,975,975,975,975,227,975,956,975,975,0,975,956,975,975,975,975,975,975,0,0,0,0,975,975,975,975,975,975,975,975,975
1989,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,0,0,0,0,0,975,975,975,975,975,975,975,975,975,254,975,956,975,975,0,975,956,975,975,975,975,975,975,0,0,0,0,975,975,975,975,975,975,975,975,975
1990,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,0,0,0,0,0,975,975,975,975,975,975,975,975,975,335,975,956,975,975,0,975,956,975,975,975,975,975,975,0,0,0,0,975,975,975,975,975,975,975,975,975
1991,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,0,0,0,0,0,975,975,975,975,975,975,975,975,975,311,975,956,975,975,0,975,956,975,975,975,975,975,975,0,0,0,0,975,975,975,975,975,975,975,975,975
1992,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,0,0,0,0,0,975,975,975,975,975,975,975,975,975,385,975,956,975,975,0,975,956,975,975,975,975,975,975,0,0,0,0,975,975,975,975,975,975,975,975,975
1993,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,0,0,975,0,0,975,975,975,975,975,975,975,975,975,410,975,956,975,975,0,975,956,975,975,975,975,975,975,0,0,0,0,975,975,975,975,975,975,975,975,975
1994,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,0,0,975,0,0,975,975,975,975,975,975,975,975,975,431,975,956,975,975,0,975,956,975,975,975,975,975,975,0,0,0,0,975,975,975,975,975,975,975,975,975
1995,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,0,0,975,0,0,975,975,975,975,975,975,975,975,975,525,975,956,975,975,0,975,956,975,975,975,975,975,975,0,0,0,0,975,975,975,975,975,975,975,975,975
1996,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,0,0,975,0,0,975,975,975,975,975,975,975,975,975,832,975,956,975,975,0,975,956,975,975,975,975,975,975,0,0,0,0,975,975,975,975,975,975,975,975,975
1997,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,975,0,0,975,0,0,975,975,975,975,975,975,975,975,975,730,975,956,975,975,0,975,956,975,975,975,975,975,975,0,0,0,0,975,975,975,975,975,975,975,975,975


# Create output dataset without municipalities that produce both commodities

In [18]:
# Recap of the columns in the analysis table before building the restricted sample.
base.columns

Index(['year', 'muncode', 'depcode', 'Nombre', 'munname', 'clashes',
       'govattacks', 'guerrattacks', 'parattacks', 'posdattacks', 'parmass',
       'guerrmass', 'posdmass', 'n_parsec', 'n_guerrsec', 'n_posdsec',
       'causalities', 'posdsec_HRDAG', 'parsec_HRDAG', 'guersec_HRDAG',
       'ac_cafe', 'p_cafe', 'H_coca', 'rainfall', 'temperature',
       'prod_gravable_bls_kpc', 'oil_prod', 'oil_production', 'top3cof',
       'coca99', 'linternalp', 'lop', 'lpop', 'ltop3cof', 'lcaprev',
       'oilprod88_dv', 'cofint_dv', 'rainfall_dv', 'temperature_dv',
       'p_cafexlinternalp', 'oil_productionxlop', 'cofint_dvxlinternalp',
       'oilprod88_dvxlop', 'coca99xyear', 'rainfall_dvxltop3cof',
       'temperature_dvxltop3cof', 'rt_dv', 'rt_dvxltop3cof',
       'rainfallxltop3cof', 'temperaturexltop3cof', 'rt', 'rtxltop3cof',
       '_Rreg_1.0', '_Rreg_2.0', '_Rreg_3.0', '_Rreg_4.0', '_RregXyear_1',
       '_RregXyear_2', '_RregXyear_3', '_RregXyear_4', 'period'],
      dtype='object'

In [20]:
# Keep municipalities that produce at most one of the two commodities
# according to the Dube & Vargas measures: a row stays if oil production
# or coffee intensity is zero. Missing values never equal zero, so a row
# with a missing measure stays only if the other measure is zero.
no_oil = base['oilprod88_dv'].eq(0)
no_coffee = base['cofint_dv'].eq(0)
base_f = base.loc[no_oil | no_coffee]

# Non-missing counts per variable and year for the restricted sample
base_f.groupby('year').count()

Unnamed: 0_level_0,muncode,depcode,Nombre,munname,clashes,govattacks,guerrattacks,parattacks,posdattacks,parmass,guerrmass,posdmass,n_parsec,n_guerrsec,n_posdsec,causalities,posdsec_HRDAG,parsec_HRDAG,guersec_HRDAG,ac_cafe,p_cafe,H_coca,rainfall,temperature,prod_gravable_bls_kpc,oil_prod,oil_production,top3cof,coca99,linternalp,lop,lpop,ltop3cof,lcaprev,oilprod88_dv,cofint_dv,rainfall_dv,temperature_dv,p_cafexlinternalp,oil_productionxlop,cofint_dvxlinternalp,oilprod88_dvxlop,coca99xyear,rainfall_dvxltop3cof,temperature_dvxltop3cof,rt_dv,rt_dvxltop3cof,rainfallxltop3cof,temperaturexltop3cof,rt,rtxltop3cof,_Rreg_1.0,_Rreg_2.0,_Rreg_3.0,_Rreg_4.0,_RregXyear_1,_RregXyear_2,_RregXyear_3,_RregXyear_4,period
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
1988,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,0,0,0,0,0,961,961,961,0,961,961,961,961,961,220,961,942,961,961,0,961,942,961,961,961,961,961,961,0,0,0,0,961,961,961,961,961,961,961,961,961
1989,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,0,0,0,0,0,961,961,961,961,961,961,961,961,961,247,961,942,961,961,0,961,942,961,961,961,961,961,961,0,0,0,0,961,961,961,961,961,961,961,961,961
1990,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,0,0,0,0,0,961,961,961,961,961,961,961,961,961,324,961,942,961,961,0,961,942,961,961,961,961,961,961,0,0,0,0,961,961,961,961,961,961,961,961,961
1991,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,0,0,0,0,0,961,961,961,961,961,961,961,961,961,306,961,942,961,961,0,961,942,961,961,961,961,961,961,0,0,0,0,961,961,961,961,961,961,961,961,961
1992,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,0,0,0,0,0,961,961,961,961,961,961,961,961,961,378,961,942,961,961,0,961,942,961,961,961,961,961,961,0,0,0,0,961,961,961,961,961,961,961,961,961
1993,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,0,0,961,0,0,961,961,961,961,961,961,961,961,961,402,961,942,961,961,0,961,942,961,961,961,961,961,961,0,0,0,0,961,961,961,961,961,961,961,961,961
1994,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,0,0,961,0,0,961,961,961,961,961,961,961,961,961,426,961,942,961,961,0,961,942,961,961,961,961,961,961,0,0,0,0,961,961,961,961,961,961,961,961,961
1995,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,0,0,961,0,0,961,961,961,961,961,961,961,961,961,519,961,942,961,961,0,961,942,961,961,961,961,961,961,0,0,0,0,961,961,961,961,961,961,961,961,961
1996,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,0,0,961,0,0,961,961,961,961,961,961,961,961,961,822,961,942,961,961,0,961,942,961,961,961,961,961,961,0,0,0,0,961,961,961,961,961,961,961,961,961
1997,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,0,0,961,0,0,961,961,961,961,961,961,961,961,961,718,961,942,961,961,0,961,942,961,961,961,961,961,961,0,0,0,0,961,961,961,961,961,961,961,961,961


In [None]:
# Write the restricted sample (at most one commodity per municipality) to disk.
base_f.to_csv('Preprocessed data/BaseAnalysis_nospill.csv')