In [1]:
import datetime
import gzip
import os
import sys
from collections import defaultdict

import numpy as np
import pandas as pd

#from pylib.base.flags import Flags

def month_diff(b,a):
    '''Signed number of calendar months from ``a`` to ``b``.

    Only the ``year`` and ``month`` attributes of the two date-like objects
    are used; the day of the month is ignored. The result is negative when
    ``b`` falls in an earlier month than ``a``.
    '''
    years_apart = b.year - a.year
    months_apart = b.month - a.month
    return months_apart + years_apart * 12

def pq_lookup(date):
    '''Return ``(period, quarter, qnum)`` for a date.

    ``qnum`` is a running three-month index anchored on April 2016:
    April-June 2016 is 1, the three months after that are 2, and so on
    (dates before April 2016 give 0 or negative values). ``period`` and
    ``quarter`` are looked up from ``qnum`` in a fixed table covering
    ``qnum`` 0..12; any ``qnum`` outside the table yields ``(0, 0)`` for
    the first two elements while ``qnum`` itself is still returned as
    computed.

    ``date`` is any object exposing integer ``year`` and ``month``
    attributes (e.g. ``datetime.datetime`` or ``pandas.Timestamp``).
    '''
    # three-month index -> (period, quarter)
    quarter_map={0:(0,0), 1:(1,1),2:(1,2),3:(2,3),4:(2,4),5:(3,1),6:(3,2),7:(4,3),8:(4,4),9:(5,1),10:(5,2),11:(6,3),12:(6,4)}
    # pd.datetime was only an alias of datetime.datetime and no longer exists
    # in current pandas, so the anchor is built from the stdlib class directly.
    months_since_anchor = month_diff(date, datetime.datetime(2016, 4, 1))
    # Divide by a float so the floor is taken explicitly by np.floor regardless
    # of the interpreter's integer-division rules.
    qnum = 1 + int(np.floor(months_since_anchor / 3.0))
    p, q = quarter_map.get(qnum, (0, 0))
    return p, q, qnum

def add_one_month(Date):
    '''Return midnight on the first day of the month following ``Date``.

    ``Date`` is any object with integer ``year`` and ``month`` attributes.
    December rolls over to January of the next year. The day and time of
    ``Date`` are discarded: the result is always day 1.

    Returns a ``datetime.datetime`` (the type the removed ``pd.datetime``
    alias used to produce).
    '''
    if Date.month < 12:
        year, month = Date.year, Date.month + 1
    else:
        # December -> January of the following year
        year, month = Date.year + 1, 1
    return datetime.datetime(year, month, 1)

def _v2_output_path(filename):
    '''Return ``filename`` with 'v2_' inserted before its last 'za_integration'.'''
    head, marker, tail = filename.rpartition('za_integration')
    if not marker:
        raise ValueError("expected 'za_integration' in input path: %r" % (filename,))
    return head + 'v2_' + marker + tail


def _sum_by_key(frame, strip_prefix=None):
    '''Sum ``val`` per (Real_Date, field, subrecipient).

    When ``strip_prefix`` is given it is treated as a regex and every match is
    removed from ``field`` before grouping, so that e.g. target/spend/budget
    rows end up keyed by the same field name as their indicator.
    '''
    if strip_prefix is not None:
        # Build a new frame instead of replacing in place on a slice of the
        # caller's frame (chained in-place edits are unreliable in pandas).
        stripped = frame['field'].replace(to_replace=strip_prefix, value='', regex=True)
        frame = frame.assign(field=stripped)
    return frame.groupby(['Real_Date', 'field', 'subrecipient']).val.sum().dropna()


def calculation_pandas(filename):
    '''Compute indicator/target, spend/budget and efficiency ratios for one dump.

    Reads ``filename`` (gzip-compressed, one JSON object per line), splits the
    rows whose ``field`` contains 'target' into monthly rows via
    ``interpolate_targets``, and then, for rows with ``qNum`` 1 or 2 only
    (hard-coded), aggregates ``val`` per (Real_Date, field, subrecipient) for
    targets, spend, budget and the remaining '_number_of' indicators.

    The three derived tables (``ind_tar_*``, ``spe_bud_*``, ``efficiency_*``)
    are written, one JSON object per line with the keys Real_Date, field,
    subrecipient and val, to a gzip file next to the input whose name has
    'v2_' inserted before 'za_integration'.

    Raises ``ValueError`` if ``filename`` does not contain 'za_integration'.
    Returns ``None``.
    '''
    # Resolve the output name first so a bad path fails before any work is done.
    out_path = _v2_output_path(filename)

    # Line-delimited JSON inside gzip; let pandas decode it so no manual
    # bytes/str joining is needed.
    DF = pd.read_json(filename, lines=True, compression='gzip')
    DF['Date'] = pd.to_datetime(DF['Real_Date'])
    # Iterating the datetime column yields Timestamp objects, which expose the
    # .year/.month attributes pq_lookup needs.
    DF['qNum'] = [pq_lookup(date)[2] for date in DF['Date']]

    # Replace the target rows by their monthly interpolation.
    is_target_raw = DF['field'].str.contains('target')
    DF_targets_monthly = interpolate_targets(DF.loc[is_target_raw])
    DF = pd.concat([DF.loc[~is_target_raw], DF_targets_monthly])

    # Row selectors on the combined frame. Only quarters 1 and 2 are
    # considered (hard-coded).
    field = DF['field']
    qcut = (DF['qNum'] > 0) & (DF['qNum'] <= 2)
    is_target = field.str.contains('target')
    is_count = field.str.contains('_number_of')
    is_spend = field.str.contains('spend')
    is_budget = field.str.contains('budget')
    is_total_other = field.str.contains('total_other')

    target_values = _sum_by_key(DF.loc[is_target & qcut], r'target_')
    spend_values = _sum_by_key(DF.loc[is_count & is_spend & qcut], r'spend_')
    budget_values = _sum_by_key(DF.loc[is_count & is_budget & qcut], r'budget_')

    # non-unique indicators: every '_number_of' field that is not a target,
    # spend, budget or 'total_other' row
    is_indicator = is_count & ~is_target & ~is_spend & ~is_budget & ~is_total_other
    indicator_values = _sum_by_key(DF.loc[is_indicator & qcut])

    ind2tar = calculate_ratio(indicator_values, target_values, 'ind_tar')
    spend2bud = calculate_ratio(spend_values, budget_values, 'spe_bud')
    efficieny = calculate_efficieny(indicator_values, target_values, spend_values, budget_values)

    # TODO: dropping duplicates on unique_id is not implemented yet.

    # Write only the computed values [Real_Date, field, subrecipient, val].
    # ignore_index renumbers the rows without turning the old per-table
    # positions into an extra 'index' column in the output.
    computed = pd.concat([ind2tar, spend2bud, efficieny], ignore_index=True)
    # Text mode so JSON strings can be written directly; the with-block closes
    # the file even if serialising a row fails.
    with gzip.open(out_path, 'wt') as out:
        for _, row in computed.iterrows():
            out.write(row.to_json())
            out.write('\n')

def calculate_ratio(DF_Num, DF_Denom, prefix):
    '''Element-wise ratio of two index-aligned Series, as a flat DataFrame.

    ``DF_Num`` is divided by ``DF_Denom`` (pandas aligns them on their index).
    Entries that are undefined are dropped: keys present on only one side,
    0/0, and any division by zero (both ``+inf`` and ``-inf``).

    The index levels become ordinary columns and every value in the ``field``
    column is renamed to ``prefix + '_' + field``.
    '''
    ratio = DF_Num / DF_Denom
    # A zero denominator gives +inf or -inf depending on the numerator's sign;
    # treat both as missing.
    ratio = ratio.replace([np.inf, -np.inf], np.nan).dropna()
    data_ratio = ratio.reset_index()
    data_ratio['field'] = [prefix + '_' + name for name in data_ratio['field']]
    return data_ratio
    
def calculate_efficieny(a_num, a_denom, b_num, b_denom, prefix='efficiency'):
    '''Ratio of two ratios, ``(a_num / a_denom) / (b_num / b_denom)``.

    All four arguments are Series that pandas aligns on their index. A key is
    kept only when both inner ratios and the final quotient are finite:
    missing keys, 0/0 and any division by zero (``+inf`` or ``-inf``) are
    dropped at every stage. In particular an infinite ``b`` ratio (zero
    ``b_denom``) removes the key instead of yielding an efficiency of 0.

    The index levels become ordinary columns and every value in the ``field``
    column is renamed to ``prefix + '_' + field``.
    '''
    infinities = [np.inf, -np.inf]
    a_ratio = (a_num / a_denom).replace(infinities, np.nan).dropna()
    b_ratio = (b_num / b_denom).replace(infinities, np.nan).dropna()
    # b_ratio == 0 still produces +/-inf here, so clean once more.
    quotient = (a_ratio / b_ratio).replace(infinities, np.nan).dropna()
    efficiency = quotient.reset_index()
    efficiency['field'] = [prefix + '_' + name for name in efficiency['field']]
    return efficiency
    
def interpolate_targets(DF):
    '''Spread every target row evenly over three consecutive monthly rows.

    Each input row produces three output rows carrying one third of its
    ``val``: one that keeps the row's own ``Real_Date``, and two whose
    ``Real_Date`` is the first day of the next month and of the month after
    that (formatted ``%Y-%m-%d``), derived from the row's ``Date`` column.
    Every other column, including ``Date`` itself, is copied unchanged.

    The result lists all unshifted rows first, then the +1 month rows, then
    the +2 month rows, under a fresh 0..N-1 index. An empty input is returned
    as an empty copy with the same columns.
    '''
    if DF.empty:
        return DF.copy()

    monthly_parts = []
    for months_ahead in range(3):
        part = DF.copy()
        # 3.0 keeps this a true division even for integer targets.
        part['val'] = part['val'] / 3.0
        if months_ahead:
            # MonthBegin(n) lands on the first day of the n-th following
            # month, whatever the day of month of the starting date.
            shifted = pd.to_datetime(part['Date']) + pd.offsets.MonthBegin(months_ahead)
            part['Real_Date'] = shifted.dt.strftime('%Y-%m-%d')
        monthly_parts.append(part)

    return pd.concat(monthly_parts, ignore_index=True)

In [42]:
# Run the ratio calculation on one integration dump.
# NOTE(review): the path below is absolute and machine-specific; point it at a
# local za_integration*.json.gz before re-running this cell.
calculation_pandas('/Users/attiladobi/zenysis/pipeline/out/za/out/shared/20161213/za_integration.json.gz')