In [56]:
import pandas as pd

In [57]:
#Let's build a dataframe by importing the data we have in the excel that Aditya provided
regionDF = pd.read_excel('Vaccination_4_regions.xlsx')

In [58]:
regionDF

Unnamed: 0,Region,People Vaccinated,Population,% Vaccination Rate
0,APAC,334795720,4446069143,0.075302
1,EMEA,148969083,2292807221,0.064972
2,LATAM,80344017,430457607,0.186648
3,NOAM,204497807,587934386,0.347824
4,Total,768606627,7757268357,0.099082


In [59]:
#It is a bit annoying to read those floats. Let's make the data in the % Vaccination Rate column
#look prettier, for example by adding a colum where we display the vaccination rate * 100
#vr = vx rate * 100

vaccinationRate = regionDF['% Vaccination Rate'] * 100



In [60]:
vaccinationRate

0     7.530151
1     6.497235
2    18.664792
3    34.782420
4     9.908212
Name: % Vaccination Rate, dtype: float64

In [61]:
#The vaccinationRate is a string:
type('vaccinationRate')

str

In [62]:
#Now the question is: how do we add a column with these vax rates displayed right next to the % Vaccination Rate column?
# Use Vax Rate as the column name and equate it to the vaccinationRate string
regionDF['Vax Rate'] = vaccinationRate

In [63]:
regionDF

Unnamed: 0,Region,People Vaccinated,Population,% Vaccination Rate,Vax Rate
0,APAC,334795720,4446069143,0.075302,7.530151
1,EMEA,148969083,2292807221,0.064972,6.497235
2,LATAM,80344017,430457607,0.186648,18.664792
3,NOAM,204497807,587934386,0.347824,34.78242
4,Total,768606627,7757268357,0.099082,9.908212


In [64]:
#The type of my regionDF['Vax Rate'] is series
type(regionDF['Vax Rate'])

pandas.core.series.Series

Now we are ready to learn how to filter data. Data Filtering is one of the most frequent data manipulation operation. 
In terms of speed, filtering dataframes with Pandas is one of the smartest and most efficient ways available to date
to handle big sized datasets.

Data filtering or subsetting data is the prep step for building predictive models or reportsfor the executives.
Learn more here: https://www.listendata.com/2019/07/how-to-filter-pandas-dataframe.html

In [65]:
#Here are a few examples of data filtering. 

#N.1: FILTER PANDAS DATAFRAME BY COLUMN VALUE 

#for ex. filter by columns Region and People Vaccinated - the below will output what region with a population of
#4446069143 has 334795720 vaccinated people

newDF = regionDF[(regionDF['People Vaccinated'] == 334795720) & (regionDF['Population'] == 4446069143)]

In [66]:
newDF

Unnamed: 0,Region,People Vaccinated,Population,% Vaccination Rate,Vax Rate
0,APAC,334795720,4446069143,0.075302,7.530151


In [67]:
# N 2: FILTER PANDAS DATAFRAME BY LOC FUNCTION- Returns the same outpout as above
newDF = regionDF.loc[(regionDF['People Vaccinated']== 334795720) & (regionDF['Population'] == 4446069143)]
#newdf = df.loc[(df.origin == "JFK") & (df.carrier == "B6")]


In [68]:
newDF


Unnamed: 0,Region,People Vaccinated,Population,% Vaccination Rate,Vax Rate
0,APAC,334795720,4446069143,0.075302,7.530151


In [77]:
#N 3 FILTER PANDAS DATAFRAME BY QUERY: only works if there is no spacein the column titles - It returns the same output as above

newDF = regionDF.query('Population == 4446069143') #Works fine



In [78]:
newDF

Unnamed: 0,Region,People Vaccinated,Population,% Vaccination Rate,Vax Rate
0,APAC,334795720,4446069143,0.075302,7.530151


In [79]:
#Note that QUERY doesn't work if there is a space in the column title:
#We get TypeError: 'method' object is not subscriptable
newDF = regionDF.query['People Vaccinated == 334795720']

TypeError: 'method' object is not subscriptable

In [80]:
#Filtering by column values and loc gives the same output: which one is faster? Filtering by columns is faster
import timeit
code = """import pandas as pd
regionDF = pd.read_excel('Vaccination_4_regions.xlsx')
newDF = regionDF[(regionDF['People Vaccinated'] == 334795720) & (regionDF['Population'] == 4446069143)]"""

execution_time = timeit.timeit(code, number=1)

print(execution_time)

0.02362859999993816


In [76]:
import timeit
code = """import pandas as pd
regionDF = pd.read_excel('Vaccination_4_regions.xlsx')
newDF = regionDF.loc[(regionDF['People Vaccinated']== 334795720) & (regionDF['Population'] == 4446069143)]"""

execution_time = timeit.timeit(code, number = 1)

print(execution_time)

0.02308020000009492


In [83]:
#N 4 FILTER PANDAS DATAFRAME BY ROW AND COLUMN POSITION WITH iloc
regionDF.iloc[:3,] #First 3 rows

Unnamed: 0,Region,People Vaccinated,Population,% Vaccination Rate,Vax Rate
0,APAC,334795720,4446069143,0.075302,7.530151
1,EMEA,148969083,2292807221,0.064972,6.497235
2,LATAM,80344017,430457607,0.186648,18.664792


In [86]:
regionDF.iloc[1:4,] #From second to fourth row

Unnamed: 0,Region,People Vaccinated,Population,% Vaccination Rate,Vax Rate
1,EMEA,148969083,2292807221,0.064972,6.497235
2,LATAM,80344017,430457607,0.186648,18.664792
3,NOAM,204497807,587934386,0.347824,34.78242


In [88]:
regionDF.iloc[3,0]  #Third row first column

'NOAM'

In [91]:
regionDF.iloc[1:4,0] #Second to fourth row and first column

1     EMEA
2    LATAM
3     NOAM
Name: Region, dtype: object

In [None]:
#For more ways to filter dataframe see https://www.listendata.com/2019/07/how-to-filter-pandas-dataframe.html

In [None]:
#Let's now focus on a method that can be really helpful: pandas.DataFrame.to_json

Learn and apply pandas.DataFrame.to_json to convert an object to a JSON string.

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html#pandas-dataframe-to-json

This is the synthax:

DataFrame.to_json(path_or_buf=None, orient=None, date_format=None, double_precision=10, force_ascii=True, date_unit='ms', default_handler=None, lines=False, compression='infer', index=True, indent=None, storage_options=None)

In [101]:
import json

In [102]:
#path_or_buf: file path or object. If not specified it returns a string.
regionDF.to_json(path_or_buf=None)

'{"Region":{"0":"APAC","1":"EMEA","2":"LATAM","3":"NOAM","4":"Total "},"People Vaccinated":{"0":334795720,"1":148969083,"2":80344017,"3":204497807,"4":768606627},"Population":{"0":4446069143,"1":2292807221,"2":430457607,"3":587934386,"4":7757268357},"% Vaccination Rate":{"0":0.07530151,"1":0.0649723543,"2":0.18664792,"3":0.3478241992,"4":0.0990821242},"Vax Rate":{"0":7.5301509993,"1":6.4972354254,"2":18.6647920012,"3":34.7824199213,"4":9.9082124226}}'

In [103]:
# default is ‘index’, which reads the data horizontally for each index:
#allowed values are: {‘split’, ‘records’, ‘index’, ‘table’}.
    
regionDF.to_json(path_or_buf=None, orient= 'index')

'{"0":{"Region":"APAC","People Vaccinated":334795720,"Population":4446069143,"% Vaccination Rate":0.07530151,"Vax Rate":7.5301509993},"1":{"Region":"EMEA","People Vaccinated":148969083,"Population":2292807221,"% Vaccination Rate":0.0649723543,"Vax Rate":6.4972354254},"2":{"Region":"LATAM","People Vaccinated":80344017,"Population":430457607,"% Vaccination Rate":0.18664792,"Vax Rate":18.6647920012},"3":{"Region":"NOAM","People Vaccinated":204497807,"Population":587934386,"% Vaccination Rate":0.3478241992,"Vax Rate":34.7824199213},"4":{"Region":"Total ","People Vaccinated":768606627,"Population":7757268357,"% Vaccination Rate":0.0990821242,"Vax Rate":9.9082124226}}'

In [104]:
#Split reads horizontally (the names of the columns), vertically (the indeces), and provides the data for each region
regionDF.to_json(path_or_buf = None, orient = 'split')

'{"columns":["Region","People Vaccinated","Population","% Vaccination Rate","Vax Rate"],"index":[0,1,2,3,4],"data":[["APAC",334795720,4446069143,0.07530151,7.5301509993],["EMEA",148969083,2292807221,0.0649723543,6.4972354254],["LATAM",80344017,430457607,0.18664792,18.6647920012],["NOAM",204497807,587934386,0.3478241992,34.7824199213],["Total ",768606627,7757268357,0.0990821242,9.9082124226]]}'

In [105]:
#records reads the data for every region right away
regionDF.to_json(path_or_buf = None, orient = 'records')

'[{"Region":"APAC","People Vaccinated":334795720,"Population":4446069143,"% Vaccination Rate":0.07530151,"Vax Rate":7.5301509993},{"Region":"EMEA","People Vaccinated":148969083,"Population":2292807221,"% Vaccination Rate":0.0649723543,"Vax Rate":6.4972354254},{"Region":"LATAM","People Vaccinated":80344017,"Population":430457607,"% Vaccination Rate":0.18664792,"Vax Rate":18.6647920012},{"Region":"NOAM","People Vaccinated":204497807,"Population":587934386,"% Vaccination Rate":0.3478241992,"Vax Rate":34.7824199213},{"Region":"Total ","People Vaccinated":768606627,"Population":7757268357,"% Vaccination Rate":0.0990821242,"Vax Rate":9.9082124226}]'

In [106]:
#table gives tons of information on the data type as well
regionDF.to_json(path_or_buf = None, orient = 'table')

'{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"Region","type":"string"},{"name":"People Vaccinated","type":"integer"},{"name":"Population","type":"integer"},{"name":"% Vaccination Rate","type":"number"},{"name":"Vax Rate","type":"number"}],"primaryKey":["index"],"pandas_version":"0.20.0"},"data":[{"index":0,"Region":"APAC","People Vaccinated":334795720,"Population":4446069143,"% Vaccination Rate":0.07530151,"Vax Rate":7.5301509993},{"index":1,"Region":"EMEA","People Vaccinated":148969083,"Population":2292807221,"% Vaccination Rate":0.0649723543,"Vax Rate":6.4972354254},{"index":2,"Region":"LATAM","People Vaccinated":80344017,"Population":430457607,"% Vaccination Rate":0.18664792,"Vax Rate":18.6647920012},{"index":3,"Region":"NOAM","People Vaccinated":204497807,"Population":587934386,"% Vaccination Rate":0.3478241992,"Vax Rate":34.7824199213},{"index":4,"Region":"Total ","People Vaccinated":768606627,"Population":7757268357,"% Vaccination Rate":0.0990821242,"Vax Rate":

In [107]:
#columns reads vertically for every column, ignoring the index column
regionDF.to_json(orient = 'columns')

'{"Region":{"0":"APAC","1":"EMEA","2":"LATAM","3":"NOAM","4":"Total "},"People Vaccinated":{"0":334795720,"1":148969083,"2":80344017,"3":204497807,"4":768606627},"Population":{"0":4446069143,"1":2292807221,"2":430457607,"3":587934386,"4":7757268357},"% Vaccination Rate":{"0":0.07530151,"1":0.0649723543,"2":0.18664792,"3":0.3478241992,"4":0.0990821242},"Vax Rate":{"0":7.5301509993,"1":6.4972354254,"2":18.6647920012,"3":34.7824199213,"4":9.9082124226}}'

In [108]:
#values reads horizontally: all the horizontal data for the first region, then all the horizontal data for the second region
#etc.
regionDF.to_json(orient = 'values')

'[["APAC",334795720,4446069143,0.07530151,7.5301509993],["EMEA",148969083,2292807221,0.0649723543,6.4972354254],["LATAM",80344017,430457607,0.18664792,18.6647920012],["NOAM",204497807,587934386,0.3478241992,34.7824199213],["Total ",768606627,7757268357,0.0990821242,9.9082124226]]'

In [109]:
#Let's focus on this for a moment regionDF.to_json(orient = 'records'). Let's assign it to a variable my_string
my_string = regionDF.to_json(orient = 'records')

In [110]:
my_string

'[{"Region":"APAC","People Vaccinated":334795720,"Population":4446069143,"% Vaccination Rate":0.07530151,"Vax Rate":7.5301509993},{"Region":"EMEA","People Vaccinated":148969083,"Population":2292807221,"% Vaccination Rate":0.0649723543,"Vax Rate":6.4972354254},{"Region":"LATAM","People Vaccinated":80344017,"Population":430457607,"% Vaccination Rate":0.18664792,"Vax Rate":18.6647920012},{"Region":"NOAM","People Vaccinated":204497807,"Population":587934386,"% Vaccination Rate":0.3478241992,"Vax Rate":34.7824199213},{"Region":"Total ","People Vaccinated":768606627,"Population":7757268357,"% Vaccination Rate":0.0990821242,"Vax Rate":9.9082124226}]'

LOADS & DUMPS

DUMPS is used when the objects are required to be in string format 
LOADS is used when the objects are required to be in list format 

In [131]:
#Since I'm delaing with a string, I can: 
my_json = json.loads(my_string) #send my string to a list
#my_json = json.dumps(my_string)

In [134]:
my_json

[{'Region': 'APAC',
  'People Vaccinated': 334795720,
  'Population': 4446069143,
  '% Vaccination Rate': 0.07530151,
  'Vax Rate': 7.5301509993},
 {'Region': 'EMEA',
  'People Vaccinated': 148969083,
  'Population': 2292807221,
  '% Vaccination Rate': 0.0649723543,
  'Vax Rate': 6.4972354254},
 {'Region': 'LATAM',
  'People Vaccinated': 80344017,
  'Population': 430457607,
  '% Vaccination Rate': 0.18664792,
  'Vax Rate': 18.6647920012},
 {'Region': 'NOAM',
  'People Vaccinated': 204497807,
  'Population': 587934386,
  '% Vaccination Rate': 0.3478241992,
  'Vax Rate': 34.7824199213},
 {'Region': 'Total ',
  'People Vaccinated': 768606627,
  'Population': 7757268357,
  '% Vaccination Rate': 0.0990821242,
  'Vax Rate': 9.9082124226}]

In [135]:
my_json = json.dumps(my_string) #send my string to a string

In [140]:
my_json

'"[{\\"Region\\":\\"APAC\\",\\"People Vaccinated\\":334795720,\\"Population\\":4446069143,\\"% Vaccination Rate\\":0.07530151,\\"Vax Rate\\":7.5301509993},{\\"Region\\":\\"EMEA\\",\\"People Vaccinated\\":148969083,\\"Population\\":2292807221,\\"% Vaccination Rate\\":0.0649723543,\\"Vax Rate\\":6.4972354254},{\\"Region\\":\\"LATAM\\",\\"People Vaccinated\\":80344017,\\"Population\\":430457607,\\"% Vaccination Rate\\":0.18664792,\\"Vax Rate\\":18.6647920012},{\\"Region\\":\\"NOAM\\",\\"People Vaccinated\\":204497807,\\"Population\\":587934386,\\"% Vaccination Rate\\":0.3478241992,\\"Vax Rate\\":34.7824199213},{\\"Region\\":\\"Total \\",\\"People Vaccinated\\":768606627,\\"Population\\":7757268357,\\"% Vaccination Rate\\":0.0990821242,\\"Vax Rate\\":9.9082124226}]"'

LOAD & DUMP
https://www.youtube.com/watch?v=9N6a-VLBa2I
https://www.geeksforgeeks.org/json-dump-in-python/

DUMP: convert a python object into a json object. It's used when the Python objects have to be stored in a file.

In [150]:
# DUMP, DUMPS and LOAD: https://www.geeksforgeeks.org/json-dump-in-python/
#The dump() needs the json file name in which the output has to be stored as an argument.
#Syntax: json.dump(d, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None)
## the json file where the output must be stored
with open("my_vax.json", "w") as p:
    json.dump(my_json, p, indent = 4, sort_keys = True)

In [151]:
with open('my_vax.json', 'r') as p:
    json.load(p)

json.dump()

json module in Python module provides a method called dump() which converts the Python objects into appropriate
json objects. It is a slight variant of dumps() method, which is 2 times slower.

The dump() method is used when the Python objects have to be stored in a file.

The dumps() is used when the objects are required to be in string format and is used for parsing, printing, etc.

The dump() needs the json file name in which the output has to be stored as an argument.

The dumps() does not require any such file name to be passed.

This method writes in the memory and then command for writing to disk is executed separately

This method directly writes to the json file

In [107]:
#The dumps() is used when the objects are required to be in string format and is used for parsing, printing, etc.
json.dumps(my_json)

'[{"Region": "APAC", "People Vaccinated": 334795720, "Population": 4446069143, "% Vaccination Rate": 0.07530151, "Vax Rate": 7.5301509993}, {"Region": "EMEA", "People Vaccinated": 148969083, "Population": 2292807221, "% Vaccination Rate": 0.0649723543, "Vax Rate": 6.4972354254}, {"Region": "LATAM", "People Vaccinated": 80344017, "Population": 430457607, "% Vaccination Rate": 0.18664792, "Vax Rate": 18.6647920012}, {"Region": "NOAM", "People Vaccinated": 204497807, "Population": 587934386, "% Vaccination Rate": 0.3478241992, "Vax Rate": 34.7824199213}, {"Region": "Total ", "People Vaccinated": 768606627, "Population": 7757268357, "% Vaccination Rate": 0.0990821242, "Vax Rate": 9.9082124226}]'

In [108]:
json.load(my_json)

AttributeError: 'list' object has no attribute 'read'

In [None]:
#[float(x) for x in vr]

In [None]:
#2. QUERY FUNCTION
newDF = regionDF.query('People Vaccinated == 334795720 & Population == 4446069143')

#newdf = df.query('origin == "JFK" & carrier == "B6"')

In [None]:
#NOTE: let's get away from the habit of building dictionaries manually like I used to do below.
#Best practice is to use read_excel if I already have the data into an excel doc

regionData = {
    "Region":
                ["APAC", "EMEA ", "LATAM", "NOAM" ],
    
    "People Vaccinated": 
               [334795720, 148969083, 80344017, 204497807],
                                      
    "Population": 
              [4446069143, 2292807221, 430457607, 587934386],
                                    
    "% Vaccination Rate":
               [7.53, 6.50, 18.66, 34.78],
    
}