In [1]:
import csv
import io

import pandas as pd
import requests

In [2]:
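# Head-of-account code columns (e.g. major head) get parsed as numbers, which should not
# happen because it drops their leading zeros; they are restored to zero-padded strings below.
# Columns affected: MH, SMH, MINH, GSH, SH, DH, SDH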
# data.columns[2:9]

In [3]:
# Zero-padded width applied to each head-of-account code column.
fill_config = dict(
    MH=4,
    SMH=2,
    MINH=3,
    GSH=2,
    SH=2,
    DH=3,
    SDH=3,
)

In [4]:
def fill_process_col(data, n_fill):
    """Return `data` as integer strings left-padded with '0' to `n_fill` characters.

    The values are cast to int before formatting, so the column is expected to
    contain no nulls; NaN values are deliberately not handled here.
    """
    def _pad(text):
        # Values already at or beyond the target width come back unchanged.
        return text.strip().rjust(n_fill, '0')

    as_text = data.astype(int).astype(str)
    return as_text.apply(_pad)
    

In [5]:
def get_ap_treasury_data(year, district_code, timeout=60):
    """Download one district's treasury report for a financial year as a DataFrame.

    Parameters
    ----------
    year : str
        Sent as the `fin_year` query parameter (the notebook passes values
        such as '2016-17').
    district_code : str
        Sent as the `district` query parameter (e.g. '0100').
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a request
        that never completes cannot block the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The first HTML table of the response, without the rows whose
        'MH TYPE' is 'TOTAL'. It is an independent copy, so callers may
        assign columns on it.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within `timeout` seconds.
    """
    url = 'https://treasury.ap.gov.in/budreq/index.php?service=REPORTN'
    # The form is submitted with every month and every code column requested.
    params = {
        'fin_year': year,
        'district': district_code,
        'chkall': '',
        'april': '04',
        'may': '05',
        'june': '06',
        'july': '07',
        'aug': '08',
        'sep': '09',
        'oct': '10',
        'nov': '11',
        'dec': '12',
        'jan': '01',
        'feb': '02',
        'mar': '03',
        'mh': 'MH',
        'smh': 'SMH',
        'minh': 'MINH',
        'gsh': 'GSH',
        'sh': 'SH',
        'dh': 'DH',
        'sdh': 'SDH',
        'npn': 'NPN',
        'dist': 'DIST',
        'sub': 'Submit',
    }
    resp = requests.get(url=url, params=params, timeout=timeout)
    print(resp.status_code)
    # Fail here with a clear HTTP error instead of a confusing parse failure below.
    resp.raise_for_status()
    # Wrap the markup in a file-like object: passing literal HTML text to
    # read_html is deprecated in recent pandas releases.
    parsed_data = pd.read_html(io.StringIO(resp.text), header=[0])
    data = parsed_data[0]
    # .copy() detaches the result from `data`, so later column assignment by the
    # caller does not trigger SettingWithCopyWarning.
    return data[data['MH TYPE'] != 'TOTAL'].copy()

In [6]:
# Head-of-account columns kept for the merge: each code level with its
# description column, followed by the 'cv', 'mhtype' and 'hoa' columns.
hoas_selected_cols = [
    column
    for level in ('mh', 'smh', 'minh', 'gsh', 'sh', 'dh', 'sdh')
    for column in (level, level + '_desc')
] + ['cv', 'mhtype', 'hoa']

def get_haos_data(path='hoa_2018.csv'):
    """Load the head-of-account reference table with every column as a string.

    The real column names are stored in the first data row of the file, so
    that row is promoted to the header and then removed, together with the
    column that ends up labelled '0'.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file; defaults to 'hoa_2018.csv' in the working
        directory.

    Returns
    -------
    pandas.DataFrame
        The cleaned table; its index starts at 1 because row 0 (the header
        row) has been dropped.
    """
    # dtype=str keeps code values such as '0020' from being parsed as numbers.
    hoas = pd.read_csv(path, dtype=str)
    hoas.columns = hoas.iloc[0]
    hoas_processed = hoas.drop('0', axis=1).drop(0)
    return hoas_processed

def _months_to_long(data, cat_cols, time_cols):
    """Unpivot the month columns into one (MONTH, AMOUNT) row per source row and month."""
    row_key = '_source_row'
    keyed = data.reset_index(drop=True)
    keyed[row_key] = range(len(keyed))
    long_data = pd.melt(keyed, id_vars=[row_key] + list(cat_cols),
                        value_vars=list(time_cols),
                        var_name='MONTH', value_name='AMOUNT')
    # melt emits all rows of one month before the next month; a stable sort on
    # the source-row key restores row-major order with months in column order.
    long_data = long_data.sort_values(row_key, kind='mergesort').drop(row_key, axis=1)
    # Alphabetical column order, matching what the row-wise construction in
    # this notebook displayed (AMOUNT, DH, DISTRICT, ...).
    long_data = long_data[sorted(long_data.columns)]
    return long_data.reset_index(drop=True)


def extract_ap_districts_treasury_data(districts, years):
    """Download, reshape and save the treasury data for each district and year.

    For every (district, year) pair the report is fetched, its code columns
    are zero-padded according to `fill_config`, the month columns are
    unpivoted into MONTH/AMOUNT rows, head-of-account descriptions are
    left-joined on the seven code columns, and the result is written to
    '../csvs/ditrict_treasuries/ap/<year>/<district name>.csv'.

    Parameters
    ----------
    districts : iterable of str
        District codes passed to `get_ap_treasury_data`.
    years : iterable of str
        Financial years passed to `get_ap_treasury_data`.

    Returns
    -------
    None
        Results are written to disk; progress is printed.
    """
    # The head-of-account lookup and the join keys do not depend on the
    # district or year, so they are prepared once instead of on every iteration.
    data_group_cols = list(fill_config)
    hoas_group_cols = [key.lower() for key in fill_config]
    hoas_unique = get_haos_data().drop_duplicates('hoa')
    # NOTE(review): de-duplication is on 'hoa' but the join below uses the seven
    # code columns; if several 'hoa' values share the same codes, treasury rows
    # will be repeated in the output — confirm whether that is intended.
    hoas_lookup = hoas_unique[hoas_selected_cols]

    for district in districts:
        for year in years:
            # Work on a copy so the column assignments below never hit a slice.
            data = get_ap_treasury_data(year, district).copy()
            if data.empty:
                # No rows means there is no district name to build the file name from.
                print('No rows returned for ', district, year)
                continue
            for col in fill_config:
                data[col] = fill_process_col(data[col], fill_config[col])
            # Positional layout of the report: columns 1-10 describe the row,
            # columns 11-22 hold the monthly amounts.
            time_cols = data.columns[11:23]
            cat_cols = data.columns[1:11]
            # A single melt replaces the row-by-row DataFrame.append loop:
            # append was removed in pandas 2.0 and re-copied the frame per row.
            # NOTE(review): AMOUNT keeps the dtype the month columns were parsed
            # with; confirm consumers do not rely on a float rendering.
            processed_data = _months_to_long(data, cat_cols, time_cols)
            combined_data = pd.merge(left=processed_data, left_on=data_group_cols,
                                     right=hoas_lookup, right_on=hoas_group_cols,
                                     how='left')
            combined_data = combined_data.drop(hoas_group_cols, axis=1)
            district_name = combined_data['DISTRICT'].iloc[0]
            # NOTE(review): 'ditrict_treasuries' looks like a typo, but it is an
            # on-disk path and is therefore left unchanged.
            combined_data.to_csv('../csvs/ditrict_treasuries/ap/{}/{}.csv'.format(year, district_name),
                                 index=False, quoting=csv.QUOTE_ALL)
            print('Completed for ', district, year)

In [7]:
# Run the extraction for four district codes over two financial years.
extract_ap_districts_treasury_data(
    districts=['0100', '0200', '0300', '0400'],
    years=['2016-17', '2017-18'],
)

200
Completed for  0100 2016-17
200
Completed for  0100 2017-18
200


KeyboardInterrupt: 

In [73]:
# Fetch a single district/year report to walk through the processing steps by hand.
data = get_ap_treasury_data(year='2016-17', district_code='0500')

200


In [74]:
# Zero-pad every code column to its configured width.
for col, width in fill_config.items():
    data[col] = fill_process_col(data[col], width)

In [75]:
# Inspect the seven code columns (positions 2-8) after padding.
data.iloc[:, 2:9]

Unnamed: 0,MH,SMH,MINH,GSH,SH,DH,SDH
0,0028,00,107,00,01,000,000
2,0028,00,107,00,04,000,000
4,0029,00,101,00,01,000,000
6,0029,00,101,00,02,000,000
8,0029,00,106,00,01,000,000
10,0029,00,107,00,01,000,000
12,0029,00,800,00,01,000,000
14,0029,00,800,00,81,000,000
16,0030,01,101,00,01,000,000
18,0030,01,102,00,01,000,000


In [76]:
# Next step: reshape the data into a long format that is usable as a time series.
# The month columns sit at positions 11-22.
data.columns[11:23]

Index(['APRIL', 'MAY', 'JUNE', 'JULY', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
       'JAN', 'FEB', 'MAR'],
      dtype='object')

In [77]:
# Descriptive columns (positions 1-10) and month columns (positions 11-22).
cat_cols, time_cols = data.columns[1:11], data.columns[11:23]

In [78]:
# Start from an empty frame; the following cell populates `processed_data`
# with one row per source row and month.
processed_data = pd.DataFrame()

In [79]:
# Unpivot the month columns into one (MONTH, AMOUNT) row per source row and month.
# A single melt is used rather than appending row by row: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every call.
_keyed = data.reset_index(drop=True)
_keyed['_source_row'] = range(len(_keyed))
processed_data = pd.melt(_keyed, id_vars=['_source_row'] + list(cat_cols),
                         value_vars=list(time_cols),
                         var_name='MONTH', value_name='AMOUNT')
# melt emits all rows of one month before the next month; a stable sort on the
# source-row key restores row-major order with months in column order.
processed_data = processed_data.sort_values('_source_row', kind='mergesort').drop('_source_row', axis=1)
# Alphabetical column order (AMOUNT, DH, DISTRICT, ...).
processed_data = processed_data[sorted(processed_data.columns)]

In [80]:
# Replace the index with a clean 0..n-1 range and show the result.
processed_data = processed_data.reset_index(drop=True)
processed_data

Unnamed: 0,AMOUNT,DH,DISTRICT,GSH,MH,MH TYPE,MINH,MONTH,NPN,SDH,SH,SMH
0,11315565.0,000,KRISHNA,00,0028,R,107,APRIL,N,000,01,00
1,12981956.0,000,KRISHNA,00,0028,R,107,MAY,N,000,01,00
2,12462097.0,000,KRISHNA,00,0028,R,107,JUNE,N,000,01,00
3,14215216.0,000,KRISHNA,00,0028,R,107,JULY,N,000,01,00
4,12888154.0,000,KRISHNA,00,0028,R,107,AUG,N,000,01,00
5,13221445.0,000,KRISHNA,00,0028,R,107,SEP,N,000,01,00
6,12939059.0,000,KRISHNA,00,0028,R,107,OCT,N,000,01,00
7,12977087.0,000,KRISHNA,00,0028,R,107,NOV,N,000,01,00
8,12488969.0,000,KRISHNA,00,0028,R,107,DEC,N,000,01,00
9,13189411.0,000,KRISHNA,00,0028,R,107,JAN,N,000,01,00


In [81]:
# Load the head-of-account (HOA) reference table. Every column is read as a
# string so that code values such as '0020' keep their leading zeros.
hoas = pd.read_csv('hoa_2018.csv', dtype=str)

In [82]:
# The real header is stored in the first data row: promote it to the column
# labels, then discard that row and the column that ends up labelled '0'.
hoas.columns = hoas.iloc[0]
hoas_processed = hoas.drop(columns='0').drop(index=0)
hoas_processed

Unnamed: 0,demand,demandname,sdept,dept_code,sdeptname,mh,mh_desc,smh,smh_desc,minh,...,dh_desc,sdh,sdh_desc,cv,accts_1516,be_1617,re_1617,be_1718,mhtype,hoa
1,,,,,,0020,Corporation tax,00,Not Applicable,901,...,Not Applicable,000,Not Applicable,V,68897100,77293400,80499006,85837400,1,0020009010001000000NVN
2,,,,,,0021,Taxes on Income Other Than Corporation Tax,00,Not Applicable,901,...,Not Applicable,000,Not Applicable,V,47920700,59900200,60842567,75044200,1,0021009010001000000NVN
3,,,,,,0028,Other Taxes on Income and Expenditure,00,Not Applicable,107,...,Not Applicable,000,Not Applicable,V,2229564,3444158,3516485,3691306,1,0028001070001000000NVN
4,,,,,,0028,Other Taxes on Income and Expenditure,00,Not Applicable,107,...,Not Applicable,000,Not Applicable,V,123089,0,0,203788,1,0028001070003000000NVN
5,,,,,,0028,Other Taxes on Income and Expenditure,00,Not Applicable,107,...,Not Applicable,000,Not Applicable,V,150511,0,0,249188,1,0028001070004000000NVN
6,,,,,,0028,Other Taxes on Income and Expenditure,00,Not Applicable,901,...,Not Applicable,000,Not Applicable,V,1500,0,0,2484,1,0028009010001000000NVN
7,,,,,,0029,Land Revenue,00,Not Applicable,101,...,Not Applicable,000,Not Applicable,V,129171,0,0,0,1,0029001010001000000NVN
8,,,,,,0029,Land Revenue,00,Not Applicable,101,...,Not Applicable,000,Not Applicable,V,294,0,0,0,1,0029001010002000000NVN
9,,,,,,0029,Land Revenue,00,Not Applicable,103,...,Not Applicable,000,Not Applicable,V,493,0,0,0,1,0029001030001000000NVN
10,,,,,,0029,Land Revenue,00,Not Applicable,104,...,Not Applicable,000,Not Applicable,V,138,2590,2590,1231,1,0029001040001000000NVN


In [83]:
# Columns available on the long-format treasury data (left side of the join).
processed_data.columns

Index(['AMOUNT', 'DH', 'DISTRICT', 'GSH', 'MH', 'MH TYPE', 'MINH', 'MONTH',
       'NPN', 'SDH', 'SH', 'SMH'],
      dtype='object')

In [84]:
# HOA-side join keys: the lower-case form of the code column names.
hoas_group_cols = list(map(str.lower, fill_config))
hoas_group_cols

['mh', 'smh', 'minh', 'gsh', 'sh', 'dh', 'sdh']

In [85]:
# Keep the first row for each distinct 'hoa' value.
hoas_unique = hoas_processed.drop_duplicates(subset=['hoa'])

In [86]:
# Treasury-side join keys: the upper-case code column names taken from fill_config.
data_group_cols = fill_config.keys()
data_group_cols

dict_keys(['MH', 'SMH', 'MINH', 'GSH', 'SH', 'DH', 'SDH'])

In [87]:
# Head-of-account columns kept for the merge: each code level with its
# description column, followed by the 'cv', 'mhtype' and 'hoa' columns.
hoas_selected_cols = [
    column
    for level in ('mh', 'smh', 'minh', 'gsh', 'sh', 'dh', 'sdh')
    for column in (level, level + '_desc')
] + ['cv', 'mhtype', 'hoa']

In [88]:
# Left-join the HOA descriptions onto the treasury rows using the seven code
# columns (upper-case names on the left, lower-case names on the right).
combined_data = processed_data.merge(
    hoas_unique[hoas_selected_cols],
    how='left',
    left_on=list(data_group_cols),
    right_on=list(hoas_group_cols),
)

In [89]:
# Re-display the lower-case HOA join keys before they are dropped from the merged frame.
hoas_group_cols

['mh', 'smh', 'minh', 'gsh', 'sh', 'dh', 'sdh']

In [90]:
# Drop the lower-case HOA join keys; they are redundant with the upper-case
# treasury code columns they were matched against.
# errors='ignore' plus rebinding (instead of inplace=True) lets this cell be
# re-run without raising KeyError once the columns are already gone.
combined_data = combined_data.drop(hoas_group_cols, axis=1, errors='ignore')

In [91]:
# Write the merged Krishna 2016-17 data without the index; QUOTE_ALL quotes every field.
# NOTE(review): 'ditrict_treasuries' looks like a typo for 'district_treasuries' — left
# unchanged because it is a filesystem path; confirm the directory name on disk.
combined_data.to_csv('../csvs/ditrict_treasuries/ap/2016-17/krishna.csv', index=False, quoting=csv.QUOTE_ALL)

In [38]:
# Export only the rows whose major head code is '2202'.
combined_data.loc[combined_data['MH'] == '2202'].to_csv(
    'Major_head_2202_Krishna_monthly.csv',
    index=False,
    quoting=csv.QUOTE_ALL,
)

In [35]:
# Export the monthly rows for major heads '4602' and '6202', one file each.
# QUOTE_ALL is passed so these files are quoted the same way as the '2202' export.
for _major_head in ('4602', '6202'):
    combined_data[combined_data.MH == _major_head].to_csv(
        'Major_head_{}_Krishna_monthly.csv'.format(_major_head),
        quoting=csv.QUOTE_ALL, index=False)

In [37]:
# Export the full merged frame with every field quoted.
# NOTE(review): unlike the other exports in this notebook, this call omits index=False,
# so the DataFrame index is written as an extra unnamed first column — confirm intended.
combined_data.to_csv('ap_monthly_treasury_data.csv', quoting=csv.QUOTE_ALL)