# Validation & Filling against Dube & Vargas
The idea is to take overlapping time periods and validate my data against the data from Dube & Vargas. 
In a first step the correlation is computed and in a later step a 1 to 1 validation is performed if the source is the same. 

### Notes
* Kidnappings cannot be compared since DV use political kidnappings and I use a broader category 
* Rainfall and Tempreature could be compared ate some point but for now my data has 2007-2019 and DV have 1995

In [3]:
import pandas as pd 
import numpy as np

# Validation

### Define functions

In [4]:
def get_correlation(val_df,col_dv,col_base):

    # only take entries where both values are available 
    sub_df=val_df[[f'{col_dv}',f'{col_base}','_merge']][val_df._merge=='both']
    sub_df.dropna(inplace=True)

    # take correlation 
    corr= sub_df[f'{col_dv}'].corr(sub_df[f'{col_base}'])

    return corr


### Load and manipulate data to get it in the same shape

In [None]:
# load datasets
DV=pd.read_stata("Dube & Vargas/origmun_violence_commodities.dta", convert_categoricals=False)
base=pd.read_csv('Preprocessed data/BaseConflict.csv',index_col=0)

In [6]:
# create log variables
base['linternalp']=np.log(base['pcoffee2006'])
base['ltop3cof']=np.log(base['top3cof'])
base['lop']=np.log(base['poil2006'])

In [7]:
# do adjustments for municipality code
DV['muncode']=DV['origmun'].astype(int)
# from not needed columns 
DV.drop(columns=['origmun'],inplace=True)

# drop combined municipalities 
DV=DV[DV['muncode']!=-2147483648]

  return values.astype(dtype, copy=copy)


In [8]:
val_df=DV.merge(base, how='outer',on=['muncode','year'],suffixes=('_dv','_base'),validate='1:1',indicator=True)

In [9]:
val_df.head(2)

Unnamed: 0,year,region,department,department_name,origmun_name,multsplit,gueratt,paratt,clashes_dv,casualties,...,caprev_2006,lcaprev_base,top3cof,pcoffee2006,poil2006,coca99,linternalp_base,ltop3cof_base,lop_base,_merge
0,1988.0,1.0,5.0,5.0,5002.0,0.0,0.0,0.0,0.0,0.0,...,,,,709790.986715,64102.847584,0.0,13.472726,,11.068244,both
1,1989.0,1.0,5.0,5.0,5002.0,0.0,0.0,0.0,0.0,0.0,...,,,1308774000.0,712126.733642,79626.343583,0.0,13.476011,20.992356,11.2851,both


## Correlations
To get an overview and a feeling the correlation between our variables and the variables from dube a Vargas is computed.

In [10]:
# create object that stores names of columns to be compared
col_pairs=[('lpop_dv','lpop_base'),
           ('coca','H_coca'),
           ('lcaprev_dv','lcaprev_base'),
           ('linternalp_dv','linternalp_base'),
           ('lop_dv','lop_base'),
           ('ltop3cof_dv','ltop3cof_base'),
           ('rainfall','_prcmean'),
           ('temperature','_tempmean')]
# create list to store json result
correlations=[]
for pair in col_pairs:
    corr={}
    corr['colmun_name']=pair[1]
    corr['DV_name']=pair[0]
    corr['correlation']=get_correlation(val_df,pair[0],pair[1])
    correlations.append(corr)


In [11]:
val_df[['lcaprev_dv','lcaprev_base']].describe()

Unnamed: 0,lcaprev_dv,lcaprev_base
count,11726.0,23781.0
mean,7.052247,7.241722
std,2.482476,12.038167
min,-6.907755,-6.907755
25%,6.863835,-6.907755
50%,7.790505,14.965575
75%,8.393613,17.955411
max,11.45597,23.832676


In [12]:
base[base['munname']=='AGUAZUL']

Unnamed: 0,year,muncode,depcode,Nombre,munname,clashes,govattacks,guerrattacks,parattacks,posdattacks,...,caprev_2006_percapita,caprev_2006,lcaprev,top3cof,pcoffee2006,poil2006,coca99,linternalp,ltop3cof,lop
35904,1988,85010,85,CASANARE,AGUAZUL,0.0,0.0,0.0,0.0,0.0,...,,,,,709790.986715,64102.847584,0,13.472726,,11.068244
35905,1989,85010,85,CASANARE,AGUAZUL,0.0,0.0,1.0,0.0,0.0,...,,,,1308774000.0,712126.733642,79626.343583,0,13.476011,20.992356,11.2851
35906,1990,85010,85,CASANARE,AGUAZUL,0.0,0.0,1.0,0.0,0.0,...,,,,1270728000.0,680456.30396,105496.209859,0,13.430519,20.962856,11.56643
35907,1991,85010,85,CASANARE,AGUAZUL,0.0,0.0,0.0,0.0,0.0,...,,,,1467416000.0,634592.062209,85210.960679,0,13.360738,21.106769,11.352885
35908,1992,85010,85,CASANARE,AGUAZUL,2.0,0.0,1.0,0.0,0.0,...,,,,1259467000.0,485647.559162,70169.310523,0,13.093238,20.953955,11.158666
35909,1993,85010,85,CASANARE,AGUAZUL,0.0,0.0,1.0,0.0,0.0,...,,,,1272489000.0,412476.909329,58313.75627,0,12.929936,20.964241,10.973593
35910,1994,85010,85,CASANARE,AGUAZUL,6.0,0.0,5.0,0.0,0.0,...,,,,1138411000.0,560167.370612,46521.928106,0,13.235991,20.852899,10.747679
35911,1995,85010,85,CASANARE,AGUAZUL,5.0,0.0,3.0,0.0,0.0,...,,,,947652300.0,587874.166504,45611.498806,0,13.284268,20.669498,10.727915
35912,1996,85010,85,CASANARE,AGUAZUL,6.0,0.0,7.0,0.0,0.0,...,,,,1140792000.0,510084.877598,51995.531698,0,13.142332,20.854989,10.858913
35913,1997,85010,85,CASANARE,AGUAZUL,3.0,0.0,2.0,0.0,0.0,...,,,,1176321000.0,679003.425082,44731.965203,0,13.428381,20.885657,10.708444


In [13]:
val_df[['year','oil_prod','oilprod88','muncode']][val_df['oilprod88']!=0]

Unnamed: 0,year,oil_prod,oilprod88,muncode
1386,1988.0,0.000000,0.141147,5585
1387,1989.0,3.884259,0.141147,5585
1388,1990.0,4.327162,0.141147,5585
1389,1991.0,4.444736,0.141147,5585
1390,1992.0,3.465684,0.141147,5585
...,...,...,...,...
37803,2017.0,0.000000,,99773
37804,2018.0,0.000000,,99773
37805,2019.0,0.000000,,99773
37806,2020.0,0.000000,,99773


In [14]:
pd.DataFrame(correlations)

Unnamed: 0,colmun_name,DV_name,correlation
0,lpop_base,lpop_dv,0.995119
1,H_coca,coca,0.929948
2,lcaprev_base,lcaprev_dv,0.323822
3,linternalp_base,linternalp_dv,0.99907
4,lop_base,lop_dv,0.983295
5,ltop3cof_base,ltop3cof_dv,0.950776
6,_prcmean,rainfall,
7,_tempmean,temperature,


## Coca exploration
Since coca intensity is a bit lower I look further into it.

In [15]:
a=('H_coca','coca')
col_dv=a[0]
col_base=a[1]
sub_df=val_df[[f'{col_dv}',f'{col_base}','_merge']][val_df._merge=='both']
sub_df.dropna(inplace=True)

In [16]:
sub_df['coca1000']=round(sub_df['coca']*1000,2)
sub_df.corr()

  sub_df.corr()


Unnamed: 0,H_coca,coca,coca1000
H_coca,1.0,0.929948,0.929948
coca,0.929948,1.0,1.0
coca1000,0.929948,1.0,1.0


In [17]:
sub_df

Unnamed: 0,H_coca,coca,_merge,coca1000
6,0.00,0.00000,both,0.00
11,0.00,0.00000,both,0.00
12,0.00,0.00000,both,0.00
13,0.00,0.00000,both,0.00
14,0.00,0.00000,both,0.00
...,...,...,...,...
17815,7943.17,7.94317,both,7943.17
17816,4846.00,4.84600,both,4846.00
17817,3758.00,3.75800,both,3758.00
17818,4629.00,4.62900,both,4629.00


In [18]:
# identify how many have same observations
print('Proportion of same observation:',(sub_df['H_coca']==sub_df['coca1000']).mean())
print('Proportion of 0 in my data:',(sub_df['H_coca']==0).mean())
print('Proportion of 0 in Duve and Vargas:',(sub_df['coca1000']==0).mean())

Proportion of same observation: 0.9437179487179487
Proportion of 0 in my data: 0.8811538461538462
Proportion of 0 in Duve and Vargas: 0.8732051282051282


In [19]:
# see from the non zero how many are equal 
no_zero=sub_df[(sub_df['H_coca']!=0)|(sub_df['coca']!=0)]
print('Proportion of same observation (without zeroes):',(no_zero['H_coca']==no_zero['coca1000']).mean())

# do the same but with more wasy rounding 
no_zero['coca1000_1']=round(no_zero['coca1000'])
no_zero['H_coca_1']=round(no_zero['H_coca'])
print('Proportion of same observation (without zeroes, 1 dc round):',(no_zero['H_coca_1']==no_zero['coca1000_1']).mean())

Proportion of same observation (without zeroes): 0.56187624750499
Proportion of same observation (without zeroes, 1 dc round): 0.8193612774451098


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_zero['coca1000_1']=round(no_zero['coca1000'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_zero['H_coca_1']=round(no_zero['H_coca'])


# Check coverage

In [20]:
col_pairs

[('lpop_dv', 'lpop_base'),
 ('coca', 'H_coca'),
 ('lcaprev_dv', 'lcaprev_base'),
 ('linternalp_dv', 'linternalp_base'),
 ('lop_dv', 'lop_base'),
 ('ltop3cof_dv', 'ltop3cof_base'),
 ('rainfall', '_prcmean'),
 ('temperature', '_tempmean')]

In [21]:
val_df.groupby('year').count()[['lpop_dv', 'lpop_base']]

Unnamed: 0_level_0,lpop_dv,lpop_base
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1988.0,992,0
1989.0,992,0
1990.0,992,0
1991.0,992,0
1992.0,992,0
1993.0,992,1043
1994.0,992,1044
1995.0,992,1051
1996.0,992,1061
1997.0,992,1070


In [22]:
val_df.groupby('year').count()[['ltop3cof_dv', 'ltop3cof_base']]

Unnamed: 0_level_0,ltop3cof_dv,ltop3cof_base
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1988.0,992,0
1989.0,992,1103
1990.0,992,1103
1991.0,992,1103
1992.0,992,1103
1993.0,992,1103
1994.0,992,1103
1995.0,992,1103
1996.0,992,1103
1997.0,992,1103


In [23]:
val_df.groupby('year').count()[['lcaprev_dv','lcaprev_base']]

Unnamed: 0_level_0,lcaprev_dv,lcaprev_base
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1988.0,228,0
1989.0,255,0
1990.0,336,0
1991.0,312,0
1992.0,386,0
1993.0,411,0
1994.0,432,0
1995.0,526,0
1996.0,834,0
1997.0,731,0


# Fill in missings 
Where correlation was high the data from Dube and Vargas will be used to fill in missing values. The variables are:
* lpop: log population in millions
* ltop3cof: log of exports from other top 3 coffee exporters 

In [24]:
# use info from dube & vargas for lpop
DV_pop=DV[['year','muncode','lpop','ltop3cof','lcaprev']]

# merge 
base=base.merge(DV_pop,how='left',on=['year','muncode'],suffixes=('_base','_dv'),validate='m:1')

In [25]:
base.head(2)

Unnamed: 0,year,muncode,depcode,Nombre,munname,clashes,govattacks,guerrattacks,parattacks,posdattacks,...,top3cof,pcoffee2006,poil2006,coca99,linternalp,ltop3cof_base,lop,lpop_dv,ltop3cof_dv,lcaprev_dv
0,1988,5001,5,ANTIOQUIA,MEDELLÍN,9.0,1.0,7.0,0.0,0.0,...,,709790.986715,64102.847584,0,13.472726,,11.068244,,,
1,1989,5001,5,ANTIOQUIA,MEDELLÍN,9.0,0.0,0.0,0.0,0.0,...,1308774000.0,712126.733642,79626.343583,0,13.476011,20.992356,11.2851,,,


### Population

In [26]:
# fill population
miss=base.lpop_base.isna().sum()
print('number of missings before filling:', miss)
# check correlation 
corr=base['lpop_base'].corr(base['lpop_dv'])
print('correlation between both datasets:',corr)

# fill nans
base['lpop']=base['lpop_base'].fillna(base['lpop_dv'])

# check for missings 
miss=base.lpop.isna().sum()
print('number of missings after filling:', miss)

# drop uselles columns 
base.drop(['lpop_base','lpop_dv'],axis=1,inplace=True)

number of missings before filling: 5865
correlation between both datasets: 0.995119192392697
number of missings after filling: 990


### Top 3 coffee exporters 

In [27]:
# fill top3coffee
miss=base.ltop3cof_base.isna().sum()
print('number of missings before filling:', miss)
# check correlation 
corr=base['ltop3cof_base'].corr(base['ltop3cof_dv'])
print('correlation between both datasets:',corr)

# fill nans
base['ltop3cof']=base['ltop3cof_base'].fillna(base['ltop3cof_dv'])

# check for missings 
miss=base.ltop3cof.isna().sum()
print('number of missings after filling:', miss)

# drop uselles columns 
base.drop(['ltop3cof_base','ltop3cof_dv'],axis=1,inplace=True)

number of missings before filling: 1103
correlation between both datasets: 0.950776103986499
number of missings after filling: 128


### Capital Revenue

In [28]:
# fill top3coffee
miss=base.lcaprev_base.isna().sum()
print('number of missings before filling:', miss)
# check correlation 
corr=base['lcaprev_base'].corr(base['lcaprev_dv'])
print('correlation between both datasets:',corr)

# fill nans
base['lcaprev']=base['lcaprev_base'].fillna(base['lcaprev_dv'])

# check for missings 
miss=base.ltop3cof.isna().sum()
print('number of missings after filling:', miss)

# drop uselles columns 
base.drop(['lcaprev_base','lcaprev_dv'],axis=1,inplace=True)

number of missings before filling: 13721
correlation between both datasets: 0.3238223665732423
number of missings after filling: 128


In [29]:
base.head(2)

Unnamed: 0,year,muncode,depcode,Nombre,munname,clashes,govattacks,guerrattacks,parattacks,posdattacks,...,caprev_2006,top3cof,pcoffee2006,poil2006,coca99,linternalp,lop,lpop,ltop3cof,lcaprev
0,1988,5001,5,ANTIOQUIA,MEDELLÍN,9.0,1.0,7.0,0.0,0.0,...,,,709790.986715,64102.847584,0,13.472726,11.068244,,,
1,1989,5001,5,ANTIOQUIA,MEDELLÍN,9.0,0.0,0.0,0.0,0.0,...,,1308774000.0,712126.733642,79626.343583,0,13.476011,11.2851,,20.992356,


In [None]:
base.to_csv('Preprocessed data/BaseConflictFilled.csv')