## Assignment 07

<br>Name: Anjani Bonda
<br>Date: 04-28-2023

#### Assignment 7.1.a

In [1]:
# Load required libraries
import os
import json
from pathlib import Path
import gzip
import hashlib
import shutil
import pandas as pd
import pygeohash
import s3fs
import uuid
import math

#### Load route dataset

In [3]:
endpoint_url='https://storage.budsc.midwest-datascience.com'
curr_dir = Path(os.getcwd()).absolute()
results_dir = curr_dir.joinpath('results')

if results_dir.exists():
    shutil.rmtree(results_dir)
results_dir.mkdir(parents=True, exist_ok=True)

In [4]:
## read_jsonl_data function to process the json file

def read_jsonl_data():
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    src_data_path = 'data/processed/openflights/routes.jsonl.gz'
    with s3.open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            records = [json.loads(line) for line in f.readlines()]       
    return records

def read_jsonl_data_local():
    '''Create function to read file from local'''
    src_data_path = '/home/jovyan/dsc650/data/processed/openflights/routes.jsonl.gz'
    with open(src_data_path, 'rb') as f_gz:
        with gzip.open(f_gz, 'rb') as f:
            records = [json.loads(line) for line in f.readlines()]       
    return records

In [5]:
## Flatten the dataset
def flatten_record(record):
    flat_record = dict()
    for key, value in record.items():
        if key in ['airline', 'src_airport', 'dst_airport']:
            if isinstance(value, dict):
                for child_key, child_value in value.items():
                    flat_key = '{}_{}'.format(key, child_key)
                    flat_record[flat_key] = child_value
        else:
            flat_record[key] = value   
    return flat_record

def create_flattened_dataset():
    records = read_jsonl_data_local()
    parquet_path = results_dir.joinpath('routes-flattened.parquet')
    return pd.DataFrame.from_records([flatten_record(record) for record in records])

In [6]:
## Create df and the key field 
df = create_flattened_dataset()
df['key'] = df['src_airport_iata'].astype(str) + df['dst_airport_iata'].astype(str) + df['airline_iata'].astype(str)

FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/dsc650/data/processed/openflights/routes.jsonl.gz'

In [10]:
## Check sample records from dataframe
df.head()

Unnamed: 0,airline_airline_id,airline_name,airline_alias,airline_iata,airline_icao,airline_callsign,airline_country,airline_active,src_airport_airport_id,src_airport_name,...,dst_airport_longitude,dst_airport_altitude,dst_airport_timezone,dst_airport_dst,dst_airport_tz_id,dst_airport_type,dst_airport_source,codeshare,equipment,key
0,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2965.0,Sochi International Airport,...,49.278702,411.0,3.0,N,Europe/Moscow,airport,OurAirports,False,[CR2],AERKZN2B
1,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,49.278702,411.0,3.0,N,Europe/Moscow,airport,OurAirports,False,[CR2],ASFKZN2B
2,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,43.081902,1054.0,3.0,N,Europe/Moscow,airport,OurAirports,False,[CR2],ASFMRV2B
3,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,49.278702,411.0,3.0,N,Europe/Moscow,airport,OurAirports,False,[CR2],CEKKZN2B
4,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,82.650703,365.0,7.0,N,Asia/Krasnoyarsk,airport,OurAirports,False,[CR2],CEKOVB2B


Here, we will create 16 partitions to compare to the partitions we create from hashed keys in the next part of the assignment. The partitions will be determined by the 1st letter of the composite key using the below partitions.

In [11]:
## set Partitions 
partitions = (
        ('A', 'A'), ('B', 'B'), ('C', 'D'), ('E', 'F'),
        ('G', 'H'), ('I', 'J'), ('K', 'L'), ('M', 'M'),
        ('N', 'N'), ('O', 'P'), ('Q', 'R'), ('S', 'T'),
        ('U', 'U'), ('V', 'V'), ('W', 'X'), ('Y', 'Z')
        )

Here,('A', 'A') means the folder should contain all of the flight routes whose key starts with A. 
The results/kv directory contains below folders.

In [12]:
# kv
# ├── kv_key=A
# ├── kv_key=B
# ├── kv_key=C-D
# ├── kv_key=E-F
# ├── kv_key=G-H
# ├── kv_key=I-J
# ├── kv_key=K-L
# ├── kv_key=M
# ├── kv_key=N
# ├── kv_key=O-P
# ├── kv_key=Q-R
# ├── kv_key=S-T
# ├── kv_key=U
# ├── kv_key=V
# ├── kv_key=W-X
# └── kv_key=Y-Z

In [13]:
# define dictionary of partitions and kv_keys
partition_dict = {}
for i in partitions:
    if i[0] == i[1]:
        partition_dict[i] = i[0]
    else:
        partition_dict[i] = i[0] + '-' + i[1] 

In [14]:
## Print partition_dict
partition_dict

{('A', 'A'): 'A',
 ('B', 'B'): 'B',
 ('C', 'D'): 'C-D',
 ('E', 'F'): 'E-F',
 ('G', 'H'): 'G-H',
 ('I', 'J'): 'I-J',
 ('K', 'L'): 'K-L',
 ('M', 'M'): 'M',
 ('N', 'N'): 'N',
 ('O', 'P'): 'O-P',
 ('Q', 'R'): 'Q-R',
 ('S', 'T'): 'S-T',
 ('U', 'U'): 'U',
 ('V', 'V'): 'V',
 ('W', 'X'): 'W-X',
 ('Y', 'Z'): 'Y-Z'}

In [15]:
# Generate kv_key from key
def kv_key_gen(data_key):
    for key, val in partition_dict.items():
        if data_key[0] == key[0] or data_key[0] == key[1]:
            return val
    return None

In [16]:
# Add kv_key column to df
df['kv_key'] = df['key'].apply(kv_key_gen)

In [17]:
## Showing sample records from dataframe
df.head()

Unnamed: 0,airline_airline_id,airline_name,airline_alias,airline_iata,airline_icao,airline_callsign,airline_country,airline_active,src_airport_airport_id,src_airport_name,...,dst_airport_altitude,dst_airport_timezone,dst_airport_dst,dst_airport_tz_id,dst_airport_type,dst_airport_source,codeshare,equipment,key,kv_key
0,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2965.0,Sochi International Airport,...,411.0,3.0,N,Europe/Moscow,airport,OurAirports,False,[CR2],AERKZN2B,A
1,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,411.0,3.0,N,Europe/Moscow,airport,OurAirports,False,[CR2],ASFKZN2B,A
2,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,1054.0,3.0,N,Europe/Moscow,airport,OurAirports,False,[CR2],ASFMRV2B,A
3,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,411.0,3.0,N,Europe/Moscow,airport,OurAirports,False,[CR2],CEKKZN2B,C-D
4,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,365.0,7.0,N,Asia/Krasnoyarsk,airport,OurAirports,False,[CR2],CEKOVB2B,C-D


In [22]:
## Showing key and key-value from the dataframe
df[['key', 'kv_key']]

Unnamed: 0,key,kv_key
0,AERKZN2B,A
1,ASFKZN2B,A
2,ASFMRV2B,A
3,CEKKZN2B,C-D
4,CEKOVB2B,C-D
...,...,...
67658,WYAADLZL,W-X
67659,DMEFRUZM,C-D
67660,FRUDMEZM,E-F
67661,FRUOSSZM,E-F


In [24]:
# Saving the dataframe in parquet format using kv_keys
try:
    df.to_parquet(results_dir.joinpath('kv'), partition_cols=['kv_key'])
except:
    print("The dataframe write operation has been failed")
else:
    print("The dataframe write operation to create partition is successful")

The dataframe write operation to create partition is successful


#### Assignment 7.1.b

Next, we are going to partition the dataset again, but this time we will partition by the hash value of the key. The following is a function that will create a SHA256 hash of the input key and return a hexadecimal string representation of the hash.

In [25]:
import hashlib

def hash_key(key):
    m = hashlib.sha256()
    m.update(str(key).encode('utf-8'))
    return m.hexdigest()

We will partition the data using the first character of the hexadecimal hash. As such, there are 16 possible partitions. Create a new column called hashed that is a hashed value of the key column. Next, create a partitioned dataset based on the first character of the hashed key and save the results to results/hash. The directory should contain the following folders.

In [26]:
# hash
# ├── hash_key=0
# ├── hash_key=1
# ├── hash_key=2
# ├── hash_key=3
# ├── hash_key=4
# ├── hash_key=5
# ├── hash_key=6
# ├── hash_key=7
# ├── hash_key=8
# ├── hash_key=9
# ├── hash_key=A
# ├── hash_key=B
# ├── hash_key=C
# ├── hash_key=D
# ├── hash_key=E

In [27]:
# Add hash column to df
df['hashed'] = df['key'].apply(hash_key)

In [28]:
# Add hash key column to df for partitioning
df['hash_key'] = df['hashed'].str[0]

In [29]:
## showing few records from dataframe
df.head()

Unnamed: 0,airline_airline_id,airline_name,airline_alias,airline_iata,airline_icao,airline_callsign,airline_country,airline_active,src_airport_airport_id,src_airport_name,...,dst_airport_dst,dst_airport_tz_id,dst_airport_type,dst_airport_source,codeshare,equipment,key,kv_key,hashed,hash_key
0,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2965.0,Sochi International Airport,...,N,Europe/Moscow,airport,OurAirports,False,[CR2],AERKZN2B,A,652cdec02010381f175efe499e070c8cbaac1522bac59a...,6
1,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,N,Europe/Moscow,airport,OurAirports,False,[CR2],ASFKZN2B,A,9eea5dd88177f8d835b2bb9cb27fb01268122b635b241a...,9
2,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,N,Europe/Moscow,airport,OurAirports,False,[CR2],ASFMRV2B,A,161143856af25bd4475f62c80c19f68936a139f653c1d3...,1
3,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,N,Europe/Moscow,airport,OurAirports,False,[CR2],CEKKZN2B,C-D,39aa99e6ae2757341bede9584473906ef1089e30820c90...,3
4,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,N,Asia/Krasnoyarsk,airport,OurAirports,False,[CR2],CEKOVB2B,C-D,143b3389bce68eea3a13ac26a9c76c1fa583ec2bd26ea8...,1


In [31]:
# Partition dataset using hk_hash
try:
    df.to_parquet(results_dir.joinpath('hash'), partition_cols=['hash_key'])
except:
    print("The dataframe write operation to create hash partitions has been failed")
else:
    print("The dataframe write operation to create hash partitions is successful")

The dataframe write operation to create hash partitions is successful


#### Assignment 7.1.c

Finally, we will simulate multiple geographically distributed data centers. For this example, we will assume we have three data centers located in the western, central, and eastern United States. Google lists the locations of their data centers and we will use the following locations for our three data centers.

West
- The Dalles, Oregon
- Latitude: 45.5945645
- Longitude: -121.1786823

Central
- Papillion, NE
- Latitude: 41.1544433
- Longitude: -96.0422378

East
- Loudoun County, Virginia
- Latitude: 39.08344
- Longitude: -77.6497145

Assume that you have an application that provides routes for each of the source airports and you want to store routes in the data center closest to the source airport. The output folders should look as follows.

In [32]:
# geo
# ├── location=central
# ├── location=east
# └── location=west

In [35]:
df.columns

Index(['airline_airline_id', 'airline_name', 'airline_alias', 'airline_iata',
       'airline_icao', 'airline_callsign', 'airline_country', 'airline_active',
       'src_airport_airport_id', 'src_airport_name', 'src_airport_city',
       'src_airport_country', 'src_airport_iata', 'src_airport_icao',
       'src_airport_latitude', 'src_airport_longitude', 'src_airport_altitude',
       'src_airport_timezone', 'src_airport_dst', 'src_airport_tz_id',
       'src_airport_type', 'src_airport_source', 'dst_airport_airport_id',
       'dst_airport_name', 'dst_airport_city', 'dst_airport_country',
       'dst_airport_iata', 'dst_airport_icao', 'dst_airport_latitude',
       'dst_airport_longitude', 'dst_airport_altitude', 'dst_airport_timezone',
       'dst_airport_dst', 'dst_airport_tz_id', 'dst_airport_type',
       'dst_airport_source', 'codeshare', 'equipment', 'key', 'kv_key',
       'hashed', 'hash_key'],
      dtype='object')

In [36]:
## Create a new column to calculare sourrce airport geo value
func = lambda x: pygeohash.encode(x.src_airport_latitude, x.src_airport_longitude)
df['geohash'] = df.apply(func, axis=1)

In [37]:
## Displaying few records from dataframe
df.head()

Unnamed: 0,airline_airline_id,airline_name,airline_alias,airline_iata,airline_icao,airline_callsign,airline_country,airline_active,src_airport_airport_id,src_airport_name,...,dst_airport_tz_id,dst_airport_type,dst_airport_source,codeshare,equipment,key,kv_key,hashed,hash_key,geohash
0,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2965.0,Sochi International Airport,...,Europe/Moscow,airport,OurAirports,False,[CR2],AERKZN2B,A,652cdec02010381f175efe499e070c8cbaac1522bac59a...,6,szsrjjzd02b3
1,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,Europe/Moscow,airport,OurAirports,False,[CR2],ASFKZN2B,A,9eea5dd88177f8d835b2bb9cb27fb01268122b635b241a...,9,v04pk3t5gbjj
2,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,Europe/Moscow,airport,OurAirports,False,[CR2],ASFMRV2B,A,161143856af25bd4475f62c80c19f68936a139f653c1d3...,1,v04pk3t5gbjj
3,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,Europe/Moscow,airport,OurAirports,False,[CR2],CEKKZN2B,C-D,39aa99e6ae2757341bede9584473906ef1089e30820c90...,3,v3gdxs17du83
4,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,Asia/Krasnoyarsk,airport,OurAirports,False,[CR2],CEKOVB2B,C-D,143b3389bce68eea3a13ac26a9c76c1fa583ec2bd26ea8...,1,v3gdxs17du83


In [38]:
## Displaying new column values for few records
df['geohash'].head()

0    szsrjjzd02b3
1    v04pk3t5gbjj
2    v04pk3t5gbjj
3    v3gdxs17du83
4    v3gdxs17du83
Name: geohash, dtype: object

In [40]:
## Defining the datacenters
# Get geohash location info for data centers
data_centers = dict(
    west = pygeohash.encode(45.5945645, -121.1786823),
    central = pygeohash.encode(41.1544433, -96.0422378),
    east = pygeohash.encode(39.08344, -77.6497145)
)
data_centers

{'west': 'c21g6s0rs4c7', 'central': '9z7dnebnj8kb', 'east': 'dqby34cjw922'}

In [41]:
## Create a function to get closest datacenters from source airport
def get_dc_location(geohash):
    
    distance_dict= {}
    
    for key, val in data_centers.items():
        distance_dict[key] = pygeohash.geohash_haversine_distance(val, geohash)
    closest = sorted(distance_dict.items(), key=lambda x: x[1])[0][0]
    return closest

In [42]:
# Add column for closest data center
df['location'] = df['geohash'].apply(get_dc_location)

In [43]:
## Printing few records from dataframe
df.head()

Unnamed: 0,airline_airline_id,airline_name,airline_alias,airline_iata,airline_icao,airline_callsign,airline_country,airline_active,src_airport_airport_id,src_airport_name,...,dst_airport_type,dst_airport_source,codeshare,equipment,key,kv_key,hashed,hash_key,geohash,location
0,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2965.0,Sochi International Airport,...,airport,OurAirports,False,[CR2],AERKZN2B,A,652cdec02010381f175efe499e070c8cbaac1522bac59a...,6,szsrjjzd02b3,east
1,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,airport,OurAirports,False,[CR2],ASFKZN2B,A,9eea5dd88177f8d835b2bb9cb27fb01268122b635b241a...,9,v04pk3t5gbjj,east
2,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,airport,OurAirports,False,[CR2],ASFMRV2B,A,161143856af25bd4475f62c80c19f68936a139f653c1d3...,1,v04pk3t5gbjj,east
3,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,airport,OurAirports,False,[CR2],CEKKZN2B,C-D,39aa99e6ae2757341bede9584473906ef1089e30820c90...,3,v3gdxs17du83,west
4,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,airport,OurAirports,False,[CR2],CEKOVB2B,C-D,143b3389bce68eea3a13ac26a9c76c1fa583ec2bd26ea8...,1,v3gdxs17du83,west


In [44]:
## Printing unique locations
df['location'].unique()

array(['east', 'west', 'central'], dtype=object)

In [46]:
# Partition dataset using closest data center location
try: 
    df.to_parquet(results_dir.joinpath('geo'), partition_cols=['location'])
except:
    print("The dataframe write operation to create geo partition has been failed")
else:
    print("The dataframe write operation to create geo partition is successful")

The dataframe write operation to create geo partition is successful


#### Assignment 7.1.d

Create a Python function that takes as input a list of keys and the number of partitions and returns a list of keys sorted into the specified number of partitions. The partitions should be roughly equal in size. Furthermore, the partitions should have the property that each partition contains all the keys between the least key in the partition and the greatest key in the partition. In other words, the partitions should be ordered.

In [53]:
## We will use itertools library to divide the given list into equal number of sub lists
from itertools import islice

In [59]:
## Function to create balance partitions
## Reference: https://www.geeksforgeeks.org/break-list-chunks-size-n-python/

def balance_partitions(keys, num_partitions):
    
    arr_size = round(len(keys)/num_partitions)
    
    arr_range = iter(keys)
    partitions_iters = iter(lambda: tuple(islice(arr_range, arr_size)), ())
    partitions = [sorted(part) for part in partitions_iters] 
    
    return partitions

In [60]:
## Showing few sample records from dataframe
df.head()

Unnamed: 0,airline_airline_id,airline_name,airline_alias,airline_iata,airline_icao,airline_callsign,airline_country,airline_active,src_airport_airport_id,src_airport_name,...,dst_airport_type,dst_airport_source,codeshare,equipment,key,kv_key,hashed,hash_key,geohash,location
0,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2965.0,Sochi International Airport,...,airport,OurAirports,False,[CR2],AERKZN2B,A,652cdec02010381f175efe499e070c8cbaac1522bac59a...,6,szsrjjzd02b3,east
1,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,airport,OurAirports,False,[CR2],ASFKZN2B,A,9eea5dd88177f8d835b2bb9cb27fb01268122b635b241a...,9,v04pk3t5gbjj,east
2,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2966.0,Astrakhan Airport,...,airport,OurAirports,False,[CR2],ASFMRV2B,A,161143856af25bd4475f62c80c19f68936a139f653c1d3...,1,v04pk3t5gbjj,east
3,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,airport,OurAirports,False,[CR2],CEKKZN2B,C-D,39aa99e6ae2757341bede9584473906ef1089e30820c90...,3,v3gdxs17du83,west
4,410,Aerocondor,ANA All Nippon Airways,2B,ARD,AEROCONDOR,Portugal,True,2968.0,Chelyabinsk Balandino Airport,...,airport,OurAirports,False,[CR2],CEKOVB2B,C-D,143b3389bce68eea3a13ac26a9c76c1fa583ec2bd26ea8...,1,v3gdxs17du83,west


In [64]:
## We will use airline_iata field present in the dataframe to create keys

airlines = df.airline_iata.sample(50).to_list()
airlines

['AD',
 'EY',
 'IB',
 'TO',
 'GA',
 'CZ',
 'AY',
 'KE',
 'MF',
 'KL',
 'TS',
 'G8',
 'WN',
 'EY',
 'U6',
 'TY',
 '5J',
 'W6',
 'AZ',
 'FL',
 'DY',
 'VA',
 'S4',
 'FR',
 'CX',
 'F9',
 'KM',
 'BA',
 'S2',
 'AA',
 'CZ',
 'AF',
 'DY',
 'RO',
 'FR',
 'EY',
 'KL',
 'AD',
 'TG',
 'B6',
 'TP',
 'B6',
 'QF',
 'AF',
 'AR',
 'VY',
 'GS',
 'HX',
 '3U',
 'TK']

In [65]:
## Create 5 partitions by calling balance_partitions
partitions = balance_partitions(airlines, 5)
partitions

[['AD', 'AY', 'CZ', 'EY', 'GA', 'IB', 'KE', 'KL', 'MF', 'TO'],
 ['5J', 'AZ', 'EY', 'FL', 'G8', 'TS', 'TY', 'U6', 'W6', 'WN'],
 ['AA', 'BA', 'CX', 'DY', 'F9', 'FR', 'KM', 'S2', 'S4', 'VA'],
 ['AD', 'AF', 'B6', 'CZ', 'DY', 'EY', 'FR', 'KL', 'RO', 'TG'],
 ['3U', 'AF', 'AR', 'B6', 'GS', 'HX', 'QF', 'TK', 'TP', 'VY']]

In [66]:
## Create 3 partitions by calling balance_partitions
partitions = balance_partitions(airlines, 10)
partitions

[['AD', 'EY', 'GA', 'IB', 'TO'],
 ['AY', 'CZ', 'KE', 'KL', 'MF'],
 ['EY', 'G8', 'TS', 'U6', 'WN'],
 ['5J', 'AZ', 'FL', 'TY', 'W6'],
 ['CX', 'DY', 'FR', 'S4', 'VA'],
 ['AA', 'BA', 'F9', 'KM', 'S2'],
 ['AF', 'CZ', 'DY', 'FR', 'RO'],
 ['AD', 'B6', 'EY', 'KL', 'TG'],
 ['AF', 'AR', 'B6', 'QF', 'TP'],
 ['3U', 'GS', 'HX', 'TK', 'VY']]

In [68]:
## Create 12 partitions by calling balance_partitions
partitions = balance_partitions(airlines, 12)
partitions

[['AD', 'EY', 'IB', 'TO'],
 ['AY', 'CZ', 'GA', 'KE'],
 ['G8', 'KL', 'MF', 'TS'],
 ['EY', 'TY', 'U6', 'WN'],
 ['5J', 'AZ', 'FL', 'W6'],
 ['DY', 'FR', 'S4', 'VA'],
 ['BA', 'CX', 'F9', 'KM'],
 ['AA', 'AF', 'CZ', 'S2'],
 ['DY', 'EY', 'FR', 'RO'],
 ['AD', 'B6', 'KL', 'TG'],
 ['AF', 'B6', 'QF', 'TP'],
 ['AR', 'GS', 'HX', 'VY'],
 ['3U', 'TK']]