In [1]:
import io, os, uuid, yaml
import numpy as np
import pandas as pd
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

In [2]:
with open("./credentials.yaml","r") as c:
    credentials = yaml.safe_load(c)['blob_storage']

In [3]:
blob_service_client = BlobServiceClient.from_connection_string(credentials['conn_string'])

In [4]:
container_name = "ingredion-data"
container_client = blob_service_client.get_container_client(container_name)

In [5]:
blob_list = container_client.list_blobs()

In [6]:
keys = {'name', "size"}
az_storage_blobs = [{k:v for k,v in b.items() if k in keys} for b in blob_list]

In [7]:
az_storage_blobs

[{'name': 'Brazil_file', 'size': 48885145},
 {'name': 'US_file', 'size': 111936526},
 {'name': 'test_file', 'size': 111936526}]

In [8]:
az_storage_blobs[0]['name']

'Brazil_file'

In [17]:
stream = container_client.download_blob(az_storage_blobs[0]['name'])
result = stream.content_as_text()

In [19]:
data = io.StringIO(result)
df = pd.read_csv(data)

In [20]:
df.shape

(136138, 48)

In [21]:
df.head()

Unnamed: 0,fiscal_year_period,company_name,company_code,Ship_to_party_code,ship_to_party,material,commercial_name,sales_qty_total_mt,3p_sales_qty_total_mt,gross_revenue_usd,...,intercompany_financing_cost,financing_costs,fees_and_royalties,pbt,taxes_on_income,net_income,minority_income,adj_minority_income,total_net_income,ing10000_ingr_net_income
0,1.2016,Ingredion Brasil - I,4101.0,10000010,Ad' oro S. A.,13890001AO,CORN GLUTEN MEAL,109.81,109.81,45367.28,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0
1,1.2016,Ingredion Brasil - I,4101.0,10002289,Duas Rodas Nordeste Ind. de Alim.,12370100BN,PURITY GUM 1773,1.0,1.0,5299.17,...,-65.84,-49.22,37.98,2672.309899,258.81,2413.504772,0,0,2413.504772,2413.504772
2,1.2016,Ingredion Brasil - I,4101.0,10002289,Duas Rodas Nordeste Ind. de Alim.,26010001OB,LIQUID SORBITOL CRYSTALIZABLE NF/FCC,2.24,2.24,2124.1,...,-27.24,-20.36,15.71,1050.54565,107.07,943.479691,0,0,943.479691,943.479691
3,1.2016,Ingredion Brasil - I,4101.0,10002304,Abbott Laboratorios do Brasil Ltda.,13742000NG,LIGHT STEEPWATER,0.48,0.48,130.77,...,-1.56,-1.17,0.9,63.114282,6.14,56.971376,0,0,56.971376,56.971376
4,1.2016,Ingredion Brasil - I,4101.0,10002306,Abc Comercio e Representacoes Ltda.,03401000CE,NATIVE FOOD REGULAR CORN STARCH,12.5,12.5,3940.47,...,-39.99,-29.9,23.07,447.149689,157.21,289.942198,0,0,289.942198,289.942198


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136138 entries, 0 to 136137
Data columns (total 48 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   fiscal_year_period               136138 non-null  float64
 1   company_name                     136138 non-null  object 
 2   company_code                     136138 non-null  float64
 3   Ship_to_party_code               136138 non-null  object 
 4   ship_to_party                    136138 non-null  object 
 5   material                         134121 non-null  object 
 6   commercial_name                  136138 non-null  object 
 7   sales_qty_total_mt               136138 non-null  float64
 8   3p_sales_qty_total_mt            127136 non-null  float64
 9   gross_revenue_usd                136138 non-null  float64
 10  discounts_usd                    136138 non-null  int64  
 11  new_net_revenue                  136138 non-null  float64
 12  n3

In [25]:
df.fiscal_year_period.unique()

array([ 1.2016, 10.2016, 11.2016, 12.2016,  2.2016,  3.2016,  4.2016,
        5.2016,  6.2016,  7.2016,  8.2016,  9.2016,  1.2017, 10.2017,
       11.2017, 12.2017,  2.2017,  3.2017,  4.2017,  5.2017,  6.2017,
        7.2017,  8.2017,  9.2017,  1.2018, 10.2018, 11.2018, 12.2018,
        2.2018,  3.2018,  4.2018,  5.2018,  6.2018,  7.2018,  8.2018,
        9.2018,  1.2019, 10.2019, 11.2019, 12.2019,  2.2019,  3.2019,
        4.2019,  5.2019,  6.2019,  7.2019,  8.2019,  9.2019,  1.202 ,
       10.202 , 11.202 , 12.202 ,  2.202 ,  3.202 ,  4.202 ,  5.202 ,
        6.202 ,  7.202 ,  8.202 ,  9.202 ])

In [26]:
df.nunique()

fiscal_year_period                     60
company_name                            1
company_code                            1
Ship_to_party_code                   3155
ship_to_party                        2786
material                              869
commercial_name                       291
sales_qty_total_mt                  16829
3p_sales_qty_total_mt               16675
gross_revenue_usd                  102234
discounts_usd                           1
new_net_revenue                    102253
n3p_net_revenue                     98349
net_corn                            70028
raw_material_other                  19919
utilities                           55393
waste                                   1
repair                              65281
labor                               49807
ohmfg                               40212
supplies_and_packaging              67617
supplies_indirect                   36121
depreciation                        47716
3p_freight_usd                    