In [3]:
from pathlib import Path
import pandas as pd
import json

# Aggregated Data Transactions

In [4]:
def Data_Transactions(root_dir="../data/aggregated/transaction/country/india/state/"):
    """Flatten the aggregated transaction JSON files into a list of row dicts.

    Recursively scans ``root_dir`` for ``*.json`` files. The state and year
    are taken from the two directory names directly above each file
    (``.../<state>/<year>/<quarter>.json``) and the quarter from the file
    stem, prefixed with ``Q``.

    Args:
        root_dir: Directory to scan (``str`` or ``Path``). Defaults to the
            location this notebook has always read from, so calling
            ``Data_Transactions()`` with no argument behaves as before.

    Returns:
        list[dict]: one dict per entry of ``data.transactionData`` in every
        file, with keys ``quarter``, ``year``, ``state``, ``name``, ``type``,
        ``count``, ``amount`` and ``timestamp``. ``type``, ``count`` and
        ``amount`` come from the first element of ``paymentInstruments`` only.
        ``amount`` is a string rounded to two decimals with no leading
        padding.
    """
    rows = []

    # Sorting keeps the row order independent of directory enumeration order.
    for json_path in sorted(Path(root_dir).rglob("*.json")):
        # Explicit UTF-8: without it open() falls back to the locale encoding.
        with open(json_path, "r", encoding="utf-8") as file:
            dataset = json.load(file)

        # <.parent> goes one directory up and <.name> is that directory's name.
        state_name = json_path.parent.parent.name
        year = json_path.parent.name
        # The file stem "1", "2", ... becomes "Q1", "Q2", ...
        quarter = f"Q{json_path.stem}"

        # A null transactionData is treated as "no rows" instead of crashing.
        for transaction in dataset['data']['transactionData'] or []:
            instrument = transaction['paymentInstruments'][0]
            rows.append(dict(quarter=quarter,
                             year=year,
                             state=state_name,
                             name=transaction['name'],
                             type=instrument['type'],
                             count=instrument['count'],
                             # ':.2f', not ': .2f' -- the space flag would
                             # prepend a blank to every non-negative amount.
                             amount=f"{instrument['amount']:.2f}",
                             timestamp=dataset['responseTimestamp']))

    return rows
    

In [5]:
# Materialise the row dicts returned by Data_Transactions() as a DataFrame.
dataTransaction = pd.DataFrame(Data_Transactions())
# Bare last expression: the notebook renders the frame as this cell's output.
dataTransaction

Unnamed: 0,quarter,year,state,name,type,count,amount,timestamp
0,Q1,2018,andaman-&-nicobar-islands,Recharge & bill payments,TOTAL,4200,1845307.47,1630501487199
1,Q1,2018,andaman-&-nicobar-islands,Peer-to-peer payments,TOTAL,1871,12138655.30,1630501487199
2,Q1,2018,andaman-&-nicobar-islands,Merchant payments,TOTAL,298,452507.17,1630501487199
3,Q1,2018,andaman-&-nicobar-islands,Financial Services,TOTAL,33,10601.42,1630501487199
4,Q1,2018,andaman-&-nicobar-islands,Others,TOTAL,256,184689.87,1630501487199
...,...,...,...,...,...,...,...,...
3949,Q2,2023,west-bengal,Merchant payments,TOTAL,245111000,176704613449.00,1692619257660
3950,Q2,2023,west-bengal,Peer-to-peer payments,TOTAL,240347041,797054798425.00,1692619257660
3951,Q2,2023,west-bengal,Recharge & bill payments,TOTAL,58950434,34789238757.00,1692619257660
3952,Q2,2023,west-bengal,Financial Services,TOTAL,327537,317467007.00,1692619257660


# Aggregated Data Users

In [6]:
def Data_Users(root_dir="../data/aggregated/user/country/india/state"):
    """Flatten the aggregated user JSON files into a list of row dicts.

    Recursively scans ``root_dir`` for ``*.json`` files. The state and year
    are taken from the two directory names directly above each file
    (``.../<state>/<year>/<quarter>.json``) and the quarter from the file
    stem, prefixed with ``Q``. Files whose ``data.usersByDevice`` is null
    contribute no rows.

    Args:
        root_dir: Directory to scan (``str`` or ``Path``). Defaults to the
            location this notebook has always read from, so calling
            ``Data_Users()`` with no argument behaves as before.

    Returns:
        list[dict]: one dict per entry of ``data.usersByDevice``, with keys
        ``quarter``, ``year``, ``state``, ``registeredUsers``, ``appOpens``,
        ``brand``, ``count``, ``percentage`` and ``timestamp``.
        ``percentage`` is a string rounded to two decimals with no leading
        padding, or ``None`` when the entry carries no percentage.
    """
    rows = []

    # Sorting keeps the row order independent of directory enumeration order.
    for json_path in sorted(Path(root_dir).rglob("*.json")):
        # Explicit UTF-8: without it open() falls back to the locale encoding.
        with open(json_path, "r", encoding="utf-8") as file:
            dataset = json.load(file)

        users_by_device = dataset['data']['usersByDevice']
        if users_by_device is None:
            continue  # nothing to report for this state/quarter

        state = json_path.parent.parent.name
        year = json_path.parent.name
        # The file stem "1", "2", ... becomes "Q1", "Q2", ...
        quarter = f"Q{json_path.stem}"

        for device in users_by_device:
            aggregated = dataset['data']['aggregated']

            # Formatting None with '.2f' raises TypeError, so a missing
            # percentage is passed through as None instead.
            percentage_value = device.get('percentage')
            if percentage_value is None:
                percentage_formatted = None
            else:
                # ':.2f', not ': .2f' -- the space flag would prepend a
                # blank to every non-negative percentage.
                percentage_formatted = f"{percentage_value:.2f}"

            rows.append(dict(
                quarter=quarter,
                year=year,
                state=state,
                registeredUsers=aggregated.get('registeredUsers', None),
                appOpens=aggregated.get('appOpens', None),
                brand=device.get('brand', None),
                count=device.get('count', None),
                percentage=percentage_formatted,
                timestamp=dataset['responseTimestamp'],
            ))

    return rows


In [7]:
# Materialise the row dicts returned by Data_Users() as a DataFrame.
user = pd.DataFrame(Data_Users())
# Preview at most the first 30 rows as this cell's output.
user.head(30)

Unnamed: 0,quarter,year,state,registeredUsers,appOpens,brand,count,percentage,timestamp
0,Q1,2018,andaman-&-nicobar-islands,6740,0,Xiaomi,1665,0.25,1630501494543
1,Q1,2018,andaman-&-nicobar-islands,6740,0,Samsung,1445,0.21,1630501494543
2,Q1,2018,andaman-&-nicobar-islands,6740,0,Vivo,982,0.15,1630501494543
3,Q1,2018,andaman-&-nicobar-islands,6740,0,Oppo,501,0.07,1630501494543
4,Q1,2018,andaman-&-nicobar-islands,6740,0,OnePlus,332,0.05,1630501494543
5,Q1,2018,andaman-&-nicobar-islands,6740,0,Realme,316,0.05,1630501494543
6,Q1,2018,andaman-&-nicobar-islands,6740,0,Apple,229,0.03,1630501494543
7,Q1,2018,andaman-&-nicobar-islands,6740,0,Motorola,226,0.03,1630501494543
8,Q1,2018,andaman-&-nicobar-islands,6740,0,Lenovo,202,0.03,1630501494543
9,Q1,2018,andaman-&-nicobar-islands,6740,0,Huawei,158,0.02,1630501494543
