# ANH Preprocessing
In this script the oil production data from the ANH (Agencia Nacional de Hidrocarburos) is preprocessed. Municipality names are matched with their municipality codes.

In [1]:
import pandas as pd 
import os
from unidecode import unidecode
from ANHHelpFunctions import split_municipios
from ANHHelpFunctions import update_department

In [2]:
# set current directory to master thesis folder (the parent of the folder
# the notebook is started from).
# The target is resolved only once per kernel session: a bare os.chdir('..')
# would climb one more directory level every time this cell is re-run.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath(os.pardir)
os.chdir(PROJECT_ROOT)

## 1. Read Data 
* Read table with municipality code to join the oil production data 
* Read the oil production data 

In [3]:
# read municipality table, skipping the first 10 rows of the sheet.
# A forward slash is used as separator: "\D" is an invalid escape sequence in a
# normal string literal and a backslash path only resolves on Windows.
muni_raw = pd.read_excel("DANE/DIVIPOLA_Municipios.xlsx", skiprows=10)

# map the raw column headers to the names used in the rest of the notebook
# (note: the 'Código ' keys carry a trailing space, as in the source sheet)
rename_dict = {
    'Nombre': 'deptname',
    'Código ': 'depcode',
    'Código .1': 'muncode',
    'Nombre.1': 'munname',
}

# Build the cleaned table in one chain so every step works on a fresh frame
# (avoids assigning into a filtered slice, which can raise SettingWithCopyWarning):
#   1. keep only the rows flagged as 'Municipio'
#   2. rename the columns
#   3. keep only the relevant columns
#   4. code muncode as integer
muni = (
    muni_raw.loc[muni_raw['Unnamed: 4'] == 'Municipio']
    .rename(columns=rename_dict)
    .loc[:, ['depcode', 'deptname', 'muncode', 'munname']]
    .astype({'muncode': int})
)

In [4]:
# quick look at the first five municipalities (head() defaults to 5 rows)
muni.head()

Unnamed: 0,depcode,deptname,muncode,munname
0,5,ANTIOQUIA,5001,MEDELLÍN
1,5,ANTIOQUIA,5002,ABEJORRAL
2,5,ANTIOQUIA,5004,ABRIAQUÍ
3,5,ANTIOQUIA,5021,ALEJANDRÍA
4,5,ANTIOQUIA,5030,AMAGÁ


In [5]:
# load the oil production workbook, skipping the first two rows of the sheet
oil_workbook = "AHN/1202531060007582_00002.xlsx"
oil = pd.read_excel(oil_workbook, skiprows=2)

## 2. Prepare Data 
For the names to match, it is necessary to have all names in the same case and to remove accents. Further manual adjustments are performed afterwards. Additionally, some production is listed for a union of municipalities. For these municipalities I will split the production in 2 and assign half to each municipality.

In [6]:
# oil table: upper-case both name columns, then strip accents.
# The municipality column is cast to str after upper-casing so that every
# entry (including missing ones) is a string before unidecode is applied.
oil['DEPARTAMENTO(S)'] = oil['DEPARTAMENTO(S)'].str.upper().map(unidecode)
oil['MUNICIPIO(S)'] = oil['MUNICIPIO(S)'].str.upper().astype(str).map(unidecode)

# municipality table: strip accents from both name columns
for name_col in ('munname', 'deptname'):
    muni[name_col] = muni[name_col].map(unidecode)

In [7]:
oil.head(3)

Unnamed: 0,DEPARTAMENTO(S),MUNICIPIO(S),1989,1990,1991,1992,1993,1994,1995,1996,...,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
0,ANTIOQUIA,PUERTO BERRIO,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,36,0
1,ANTIOQUIA,PUERTO NARE,1284426,1805465,1846555,1562545,1346559,1095867,1057995,998005,...,748603,697850,708517,599364,479528,417605,389030,363113,327788,268635
2,ANTIOQUIA,PUERTO NARE - PUERTO TRIUNFO,5199665,5043394,5196362,3806278,3727494,3888420,3442626,3042644,...,2240221,1973449,1706914,1491128,1075554,990189,915522,774455,654888,510399


In [8]:
# Rows that list several municipalities are expanded by split_municipios,
# which returns the replacement row(s) for each input row; all returned rows
# are collected into one frame with a fresh 0..n-1 index.
oil_expanded = pd.DataFrame(
    [piece for _, record in oil.iterrows() for piece in split_municipios(record)]
).reset_index(drop=True)



In [9]:
# check which columns are available in the municipality table for the merge
muni.columns

Index(['depcode', 'deptname', 'muncode', 'munname'], dtype='object')

In [10]:
# Municipalities that came out of a multi-municipality entry spanning two
# departments need their department corrected.
# Each tuple is passed to update_department as (municipality, 3rd arg, 4th arg);
# judging by the printed output the 4th argument is the department the row
# ends up with - presumably the 3rd is the one being replaced (TODO confirm).
department_fixes = [
    ("PUERTO WILCHES", "BOLIVAR", "SANTANDER"),
    ("PUERTO NARE", "BOYACA", "ANTIOQUIA"),
]
for municipality, old_dept, new_dept in department_fixes:
    oil_expanded = update_department(oil_expanded, municipality, old_dept, new_dept)

Updated the following entries:
   DEPARTAMENTO(S)    MUNICIPIO(S)
13       SANTANDER  PUERTO WILCHES
Updated the following entries:
   DEPARTAMENTO(S) MUNICIPIO(S)
25       ANTIOQUIA  PUERTO NARE


In [11]:
# A municipality can now appear several times (as a single entry and as part
# of a split multi-municipality entry): add those rows up.
oil_grouped = (
    oil_expanded
    .groupby(['DEPARTAMENTO(S)', 'MUNICIPIO(S)'])
    .sum()
    .reset_index()
)

# manual name adjustments: spelling in the oil data -> spelling in the municipality table
name_fixes = {
    'MOMPOS': 'SANTA CRUZ DE MOMPOX',
    'SAN JOSE DE FRAGUA': 'SAN JOSE DEL FRAGUA',
    'SAN CARLOS GUAROA': 'SAN CARLOS DE GUAROA',
    'VISTA HERMOSA': 'VISTAHERMOSA',
    'CUCUTA': 'SAN JOSE DE CUCUTA',
    'VALLE DE GUAMUEZ': 'VALLE DEL GUAMUEZ',
}
oil_grouped = oil_grouped.replace(name_fixes)

In [12]:
# Attach the municipality codes by (department, municipality) name.
# how='left' keeps oil rows without a match, indicator=True adds the '_merge'
# column used to find them, and validate='1:1' raises if the keys are not
# unique on either side.
oil_keys = ['DEPARTAMENTO(S)', 'MUNICIPIO(S)']
muni_keys = ['deptname', 'munname']
oil_m = oil_grouped.merge(
    muni,
    how='left',
    left_on=oil_keys,
    right_on=muni_keys,
    indicator=True,
    validate='1:1',
)

In [13]:
# oil rows for which no municipality code was found
oil_m.loc[oil_m['_merge'].eq('left_only')]

Unnamed: 0,DEPARTAMENTO(S),MUNICIPIO(S),1989,1990,1991,1992,1993,1994,1995,1996,...,2006,2007,2008,2009,2010,depcode,deptname,muncode,munname,_merge
39,HUILA,(BLANK),0.0,0.0,424.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,,,,left_only
59,META,N.D.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,,,,,left_only
67,N.N.,N.N.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10080.0,11858.0,17812.0,110283.0,232131.0,,,,,left_only
107,TOTAL,,147511008.0,160119815.0,155034549.0,160549278.0,165751598.0,166189769.0,212692584.0,228770784.0,...,192504185.0,193849282.0,215082927.0,244770181.0,286840579.0,,,,,left_only


In [14]:
# first three rows of the merged table
oil_m.head(n=3)

Unnamed: 0,DEPARTAMENTO(S),MUNICIPIO(S),1989,1990,1991,1992,1993,1994,1995,1996,...,2006,2007,2008,2009,2010,depcode,deptname,muncode,munname,_merge
0,ANTIOQUIA,PUERTO BERRIO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,36.0,0.0,5,ANTIOQUIA,5579.0,PUERTO BERRIO,both
1,ANTIOQUIA,PUERTO NARE,3884258.5,4327162.0,4444736.0,3465684.0,3210306.0,3040077.0,2779308.0,2519327.0,...,1212343.5,1127259.0,968243.5,828907.5,664223.0,5,ANTIOQUIA,5585.0,PUERTO NARE,both
2,ANTIOQUIA,PUERTO TRIUNFO,2599832.5,2521697.0,2598181.0,1903139.0,1863747.0,1944210.0,1721313.0,1521322.0,...,495094.5,457761.0,387227.5,327444.0,255199.5,5,ANTIOQUIA,5591.0,PUERTO TRIUNFO,both


In [15]:
# keep only relevant entries: rows that found a municipality code, and of the
# identifying columns only 'muncode' (the year columns are kept as they are).
# The left merge turned 'muncode' into float because unmatched rows hold NaN
# there; once those rows are dropped the column is cast back to integer so the
# exported codes read 5579 rather than 5579.0.
oil_m = (
    oil_m.loc[oil_m['_merge'] == 'both']
    .drop(columns=['DEPARTAMENTO(S)', 'MUNICIPIO(S)', '_merge', 'depcode', 'deptname', 'munname'])
    .astype({'muncode': int})
)

In [16]:
# make every column label a string so the year columns can be addressed by name
oil_m = oil_m.rename(columns=str)

# reshape from one column per year (1989-2010) to one row per municipality and year
year_columns = list(map(str, range(1989, 2011)))
oil_m_long = oil_m.melt(
    id_vars=['muncode'],
    value_vars=year_columns,
    var_name='year',
    value_name='oil_prod',
)

In [19]:
# express production in millions of the source unit
# (careful: re-running this cell alone divides the column again)
oil_m_long['oil_prod'] = oil_m_long['oil_prod'].div(1000000)

In [20]:
# write the long table to disk; the row index is written as the first (unnamed) column
oil_m_long.to_csv('AHN/oil_1989_2010.csv', index=True)