In [1]:
# import packages
import os
import pandas as pd
import numpy as np
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import re
from dvc.api import make_checkpoint

In [2]:
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/project/spark-3.2.1-bin-hadoop3.2"

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("PySpark App") \
    .config("spark.jars", "postgresql-42.3.2.jar") \
    .getOrCreate()

In [4]:
# read the parquet files
w_m_game_df = spark.read.parquet("/project/DataEngineering/parquet_files/w_m_game.parquet").toPandas()
relay_df =  spark.read.parquet("/project/DataEngineering/parquet_files/relay.parquet").toPandas()
athlete_info_df = spark.read.parquet("/project/DataEngineering/parquet_files/athlete_info.parquet").toPandas()

In [5]:
# replace all the ROC with RUS
w_m_game_df.country = w_m_game_df.country.replace(to_replace = 'ROC', value = 'RUS')
relay_df.country = relay_df.country.replace(to_replace = 'ROC', value = 'RUS')
relay_df.name = relay_df.name.replace(to_replace = 'ROC', value = 'RUS')
make_checkpoint()

In [6]:
# set the athlete name into lower cases
w_m_game_df ['name'] = w_m_game_df['name'].apply(lambda x: x.lower())
make_checkpoint()

In [7]:
# replace the special cases
w_m_game_df.name = w_m_game_df.name.replace('blais danae', 'blais danaé')
w_m_game_df.name = w_m_game_df.name.replace('han yutong', 'han yu tong')
w_m_game_df.name = w_m_game_df.name.replace('lepape sebastien', 'lepape sébastien')
w_m_game_df.name = w_m_game_df.name.replace('lee juneseo', 'lee june seo')
w_m_game_df.name = w_m_game_df.name.replace('park janghyuk', 'park jang hyuk')
w_m_game_df.name = w_m_game_df.name.replace('airapetian denis', 'ayrapetyan denis')
make_checkpoint()

In [8]:
# check the special cases in qualified column
w_m_game_df[w_m_game_df['qualified'].apply(lambda x: len(x)>1)]['qualified']

32      [OR, Q]
70      [OR, Q]
141     [WR, Q]
191     [OR, Q]
227    [OR, QA]
348     [OR, Q]
Name: qualified, dtype: object

In [10]:
# merge the athlete information with the women and men game information
w_m_game_df = w_m_game_df.merge(athlete_info_df[['name', 'id']], how = 'left', on = 'name')
make_checkpoint()

In [11]:
def get_qualified(x):
    '''
    only retrun the qualified details
    '''
    if len(x) == 1 and x[0] != 'OR':
        return x[0]
    elif len(x) == 2:
        return x[1]
    else:
        return ''

In [12]:
# apply the function on both women & men game information and relay game information
w_m_game_df.qualified = w_m_game_df['qualified'].apply(get_qualified)
make_checkpoint()
relay_df.qualified = relay_df['qualified'].apply(get_qualified)
make_checkpoint()

In [13]:
# check the information returned
set(w_m_game_df.qualified)

{'', 'ADV', 'ADVA', 'ADVB', 'Q', 'QA', 'QB', 'q'}

In [14]:
# check the information returned
set(relay_df.qualified)

{'', 'ADVA', 'Q', 'QA', 'QB', 'q'}

In [15]:
def get_special_cases(df):
    '''
    convert the special cases in time column into the qualified column
    '''
    index = df[df['time'].isin([ 'PEN', 'No Time', 'DNS', 'YC','DNF']) & (df['qualified']=='')].index
    df['qualified'].loc[index] = df['time'].loc[index]
    return df

In [16]:
# apply to both data frame
w_m_game_df = get_special_cases(w_m_game_df)
make_checkpoint()
relay_df = get_special_cases(relay_df)
make_checkpoint()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [17]:
w_m_game_df

Unnamed: 0,country,helmet_number,name,group,game,rank,time,qualified,id
0,CAN,50,charles alyson,000100-,W500M QFNL,4,1:07.206,ADV,STCAN23010199801
1,CAN,14,brunelle florence,000100-,W500M QFNL,5,PEN,PEN,STCAN22012200301
2,HUN,10,jaszapati petra,000200-,W500M QFNL,1,43.476,Q,STHUN23112199801
3,RUS,141,seregina elena,000200-,W500M QFNL,2,43.712,Q,STRUS23012200101
4,USA,52,biney maame,000200-,W500M QFNL,3,46.099,,STUSA22801200001
...,...,...,...,...,...,...,...,...,...
424,USA,19,heo andrew,000100-,M1000M QFNL,1,1:24.603,Q,STUSA10705200101
425,CHN,48,wu dajing,000100-,M1000M QFNL,2,1:33.302,Q,STCHN12407199401
426,KOR,195,park jang hyuk,000100-,M1000M QFNL,3,No Time,ADV,STKOR13110199801
427,ITA,7,sighel pietro,000100-,M1000M QFNL,4,PEN,PEN,STITA11507199901


In [18]:
relay_df

Unnamed: 0,country,name,group,game,rank,time,qualified
0,KOR,Republic of Korea,000200,M5000MRY4 SFNL,1,6:37.879,QA
1,RUS,RUS,000200,M5000MRY4 SFNL,2,6:37.925,QA
2,NED,Netherlands,000200,M5000MRY4 SFNL,3,6:37.927,QB
3,HUN,Hungary,000200,M5000MRY4 SFNL,4,6:45.172,QB
4,CHN,People's Republic of China,000100,XRELAY4 QFNL,1,2:37.535,Q
5,ITA,Italy,000100,XRELAY4 QFNL,2,2:38.308,Q
6,KOR,Republic of Korea,000100,XRELAY4 QFNL,3,2:48.308,
7,POL,Poland,000100,XRELAY4 QFNL,4,2:50.513,
8,NED,Netherlands,000200,XRELAY4 QFNL,1,2:36.437,Q
9,CAN,Canada,000200,XRELAY4 QFNL,2,2:36.747,Q


In [19]:
# check there are only two types of time format in time column
w_m_game_df[~w_m_game_df['time'].apply(lambda x: ':' in x or '.' in x)]

Unnamed: 0,country,helmet_number,name,group,game,rank,time,qualified,id
1,CAN,14,brunelle florence,000100-,W500M QFNL,5,PEN,PEN,STCAN22012200301
6,NED,6,velzeboer xandra,000200-,W500M QFNL,5,PEN,PEN,STNED20709200101
11,RUS,7,prosvirnova sofia,000300-,W500M QFNL,5,PEN,PEN,STRUS22012199701
15,USA,8,santos kristen,000400-,W500M QFNL,4,PEN,PEN,STUSA20211199401
16,ITA,13,valcepina martina,000400-,W500M QFNL,5,PEN,PEN,STITA20406199201
20,KOR,52,hwang daeheon,000100-,M1000M SFNL,4,PEN,PEN,STKOR10507199901
21,KOR,195,park jang hyuk,000100-,M1000M SFNL,5,DNS,DNS,STKOR13110199801
27,KOR,46,lee june seo,000200-,M1000M SFNL,6,PEN,PEN,STKOR10306200001
39,HUN,27,konya zsofia,000300-,W1000M HEAT,4,No Time,No Time,STHUN20602199501
43,RUS,7,prosvirnova sofia,000400-,W1000M HEAT,4,PEN,PEN,STRUS22012199701


In [20]:
# check there are only two types of time format in time column
relay_df[~relay_df['time'].apply(lambda x: ':' in x or '.' in x)]

Unnamed: 0,country,name,group,game,rank,time,qualified
44,RUS,RUS,000200,XRELAY4 SFNL,3,PEN,PEN
45,USA,United States of America,000200,XRELAY4 SFNL,4,PEN,PEN
52,RUS,RUS,-B0010,W3000MRY4 FNL,3,PEN,PEN
53,USA,United States of America,-B0010,W3000MRY4 FNL,4,PEN,PEN
57,CAN,Canada,-A0010,XRELAY4 FNL,4,PEN,PEN


In [21]:
def timestamp(x):
    '''
    convert the string of time into timestamp format
    '''
    if ':' in x:
        return datetime.strptime(x,'%M:%S.%f').timestamp()
    elif  '.' in x:
        return datetime.strptime(x, '%S.%f').timestamp()
    else:
        return 0

In [22]:
# apply the function
w_m_game_df['timestamp'] = w_m_game_df['time'].apply(timestamp)
make_checkpoint()
relay_df['timestamp'] = relay_df['time'].apply(timestamp)
make_checkpoint()

In [23]:
# split the game column into two columns game_type and level
w_m_game_df[['game_type', 'level']] = w_m_game_df.game.str.split(expand = True)
make_checkpoint()
relay_df[['game_type', 'level']] = relay_df.game.str.split(expand = True)
make_checkpoint()

In [24]:
# create a new column and filled with 0
w_m_game_df['rank_by_game'] = 0
relay_df['rank_by_game'] = 0

In [25]:
def rank_game(df):
    '''
    function to get the overall rank for each game and event
    '''
    temp_other = df[df['level']!='FNL']
    temp_other['rank_by_game'] = temp_other.groupby(['level', 'game_type'])['timestamp'].rank(method='min', ascending = True).astype(np.int64)
    
    temp_fnl = df[df['level'] == 'FNL']
    list_game = set(temp_fnl['game_type'])
    for g in list_game:
        index_A = temp_fnl[(temp_fnl['game_type'] == g) & (temp_fnl['group'].apply(lambda x: 'A' in x))].index
        index_B = temp_fnl[(temp_fnl['game_type'] == g) & (temp_fnl['group'].apply(lambda x: 'B' in x))].sort_values('rank').index
        max_rank_groupA = int(temp_fnl[(temp_fnl['game_type'] == g) & (temp_fnl['group'].apply(lambda x: 'A' in x))]['rank'].max())
        rank_B = [i+max_rank_groupA for i in range(1,len(index_B)+1)]
        
        temp_fnl['rank_by_game'].loc[index_A] = temp_fnl['rank'].loc[index_A]
        temp_fnl['rank_by_game'].loc[index_B] = rank_B
        
    return temp_other.append(temp_fnl)

In [26]:
# apply the function
w_m_game_df = rank_game(w_m_game_df)
make_checkpoint()
relay_df = rank_game(relay_df)
make_checkpoint()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_other['rank_by_game'] = temp_other.groupby(['level', 'game_type'])['timestamp'].rank(method='min', ascending = True).astype(np.int64)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [27]:
# convert the type into integer
w_m_game_df.rank_by_game = w_m_game_df.rank_by_game.astype(np.int64)
make_checkpoint()
relay_df.rank_by_game = relay_df.rank_by_game.astype(np.int64)
make_checkpoint()

In [28]:
w_m_game_df

Unnamed: 0,country,helmet_number,name,group,game,rank,time,qualified,id,timestamp,game_type,level,rank_by_game
0,CAN,50,charles alyson,000100-,W500M QFNL,4,1:07.206,ADV,STCAN23010199801,-2.208989e+09,W500M,QFNL,14
1,CAN,14,brunelle florence,000100-,W500M QFNL,5,PEN,PEN,STCAN22012200301,0.000000e+00,W500M,QFNL,16
2,HUN,10,jaszapati petra,000200-,W500M QFNL,1,43.476,Q,STHUN23112199801,-2.208989e+09,W500M,QFNL,7
3,RUS,141,seregina elena,000200-,W500M QFNL,2,43.712,Q,STRUS23012200101,-2.208989e+09,W500M,QFNL,8
4,USA,52,biney maame,000200-,W500M QFNL,3,46.099,,STUSA22801200001,-2.208989e+09,W500M,QFNL,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
387,CHN,48,wu dajing,-A00100,M1000M FNL,4,1:42.937,,STCHN12407199401,-2.208989e+09,M1000M,FNL,4
388,HUN,2,liu shaolin sandor,-A00100,M1000M FNL,5,YC,YC,STHUN12011199501,0.000000e+00,M1000M,FNL,5
389,NED,5,de laat itzhak,-B00100,M1000M FNL,1,1:35.925,,STNED11306199401,-2.208989e+09,M1000M,FNL,6
390,TUR,32,akar furkan,-B00100,M1000M FNL,2,1:36.052,,STTUR10603200201,-2.208989e+09,M1000M,FNL,7


In [29]:
relay_df

Unnamed: 0,country,name,group,game,rank,time,qualified,timestamp,game_type,level,rank_by_game
0,KOR,Republic of Korea,000200,M5000MRY4 SFNL,1,6:37.879,QA,-2208988000.0,M5000MRY4,SFNL,1
1,RUS,RUS,000200,M5000MRY4 SFNL,2,6:37.925,QA,-2208988000.0,M5000MRY4,SFNL,2
2,NED,Netherlands,000200,M5000MRY4 SFNL,3,6:37.927,QB,-2208988000.0,M5000MRY4,SFNL,3
3,HUN,Hungary,000200,M5000MRY4 SFNL,4,6:45.172,QB,-2208988000.0,M5000MRY4,SFNL,7
4,CHN,People's Republic of China,000100,XRELAY4 QFNL,1,2:37.535,Q,-2208989000.0,XRELAY4,QFNL,3
5,ITA,Italy,000100,XRELAY4 QFNL,2,2:38.308,Q,-2208989000.0,XRELAY4,QFNL,4
6,KOR,Republic of Korea,000100,XRELAY4 QFNL,3,2:48.308,,-2208989000.0,XRELAY4,QFNL,10
7,POL,Poland,000100,XRELAY4 QFNL,4,2:50.513,,-2208989000.0,XRELAY4,QFNL,11
8,NED,Netherlands,000200,XRELAY4 QFNL,1,2:36.437,Q,-2208989000.0,XRELAY4,QFNL,1
9,CAN,Canada,000200,XRELAY4 QFNL,2,2:36.747,Q,-2208989000.0,XRELAY4,QFNL,2


In [30]:
# example of one of the game
w_m_game_df[(w_m_game_df['game_type'] == 'M1000M') & (w_m_game_df['level'] == 'QFNL')].sort_values('rank_by_game')

Unnamed: 0,country,helmet_number,name,group,game,rank,time,qualified,id,timestamp,game_type,level,rank_by_game
107,KOR,46,lee june seo,000200-,M1000M QFNL,1,1:23.682,Q,STKOR10306200001,-2208989000.0,M1000M,QFNL,1
108,HUN,1,liu shaoang,000200-,M1000M QFNL,2,1:23.940,Q,STHUN11303199801,-2208989000.0,M1000M,QFNL,2
109,FRA,73,fercoq quentin,000200-,M1000M QFNL,3,1:24.411,,STFRA10503199901,-2208989000.0,M1000M,QFNL,3
424,USA,19,heo andrew,000100-,M1000M QFNL,1,1:24.603,Q,STUSA10705200101,-2208989000.0,M1000M,QFNL,4
117,KOR,52,hwang daeheon,000400-,M1000M QFNL,1,1:24.693,Q,STKOR10507199901,-2208989000.0,M1000M,QFNL,5
112,TUR,32,akar furkan,000300-,M1000M QFNL,1,1:25.490,Q,STTUR10603200201,-2208989000.0,M1000M,QFNL,6
118,CHN,94,li wenlong,000400-,M1000M QFNL,2,1:30.550,Q,STCHN10402200101,-2208989000.0,M1000M,QFNL,7
425,CHN,48,wu dajing,000100-,M1000M QFNL,2,1:33.302,Q,STCHN12407199401,-2208989000.0,M1000M,QFNL,8
113,CHN,54,ren ziwei,000300-,M1000M QFNL,2,1:34.211,Q,STCHN10306199701,-2208989000.0,M1000M,QFNL,9
114,NED,5,de laat itzhak,000300-,M1000M QFNL,3,1:42.490,ADV,STNED11306199401,-2208989000.0,M1000M,QFNL,10


In [31]:
# get all the country information
URL =f"http://www.shorttrackonline.info/athletes.php?"
page = requests.get(URL)

soup = BeautifulSoup(page.content, "html.parser")
soup_body = str(soup.body)
dict_country = {i.split('">')[0]:i.split('">')[1] for i in re.findall(r'country=(.*)</a', soup_body)}

In [32]:
# convert the dictionary into data frame
countries = pd.DataFrame({'country_code':dict_country.keys(), 'name': dict_country.values()})
make_checkpoint()

In [33]:
countries

Unnamed: 0,country_code,name
0,ARG,Argentina
1,KAZ,Kazakhstan
2,AUS,Australia
3,LAT,Latvia
4,AUT,Austria
5,LTU,Lithuania
6,BLR,Belarus
7,LUX,Luxembourg
8,BEL,Belgium
9,MAS,Malaysia


In [34]:
# based on the schema get all the tables
heat = pd.DataFrame(w_m_game_df[w_m_game_df['level'] == 'HEAT'][['id', 'game_type', 'group', 'timestamp', 'rank', 'rank_by_game', 'qualified']].reset_index(drop = True))
qfnl = pd.DataFrame(w_m_game_df[w_m_game_df['level'] == 'QFNL'][['id', 'game_type', 'group', 'timestamp', 'rank', 'rank_by_game', 'qualified']].reset_index(drop = True))
sfnl = pd.DataFrame(w_m_game_df[w_m_game_df['level'] == 'SFNL'][['id', 'game_type', 'group', 'timestamp', 'rank', 'rank_by_game', 'qualified']].reset_index(drop = True))
fnl = pd.DataFrame(w_m_game_df[w_m_game_df['level'] == 'FNL'][['id', 'game_type', 'group', 'timestamp', 'rank', 'rank_by_game']].reset_index(drop = True))
make_checkpoint()

In [35]:
# change the columns name
heat.columns = ['id', 'game_type', 'group', 'time', 'inGroup_rank', 'game_rank', 'qualified']
qfnl.columns = ['id', 'game_type', 'group', 'time', 'inGroup_rank', 'game_rank', 'qualified']
sfnl.columns = ['id', 'game_type', 'group', 'time', 'inGroup_rank', 'game_rank', 'qualified']
fnl.columns = ['id', 'game_type', 'group', 'time', 'inGroup_rank', 'game_rank']
make_checkpoint()

In [36]:
heat

Unnamed: 0,id,game_type,group,time,inGroup_rank,game_rank,qualified
0,STKOR20909199801,W1000M,000100-,-2.208989e+09,1,9,Q
1,STNED21405199901,W1000M,000100-,-2.208989e+09,2,10,Q
2,STRUS22408200201,W1000M,000100-,-2.208989e+09,3,12,q
3,STGBR22601199601,W1000M,000100-,-2.208989e+09,4,22,
4,STNED22509199701,W1000M,000200-,-2.208989e+09,1,1,Q
...,...,...,...,...,...,...,...
123,STNED10610200101,M500M,000700-,0.000000e+00,4,28,PEN
124,STCHN12407199401,M500M,000800-,-2.208989e+09,1,1,Q
125,STITA11507199901,M500M,000800-,-2.208989e+09,2,3,Q
126,STHKG12207199901,M500M,000800-,-2.208989e+09,3,27,


In [37]:
qfnl

Unnamed: 0,id,game_type,group,time,inGroup_rank,game_rank,qualified
0,STCAN23010199801,W500M,000100-,-2.208989e+09,4,14,ADV
1,STCAN22012200301,W500M,000100-,0.000000e+00,5,16,PEN
2,STHUN23112199801,W500M,000200-,-2.208989e+09,1,7,Q
3,STRUS23012200101,W500M,000200-,-2.208989e+09,2,8,Q
4,STUSA22801200001,W500M,000200-,-2.208989e+09,3,9,
...,...,...,...,...,...,...,...
147,STUSA10705200101,M1000M,000100-,-2.208989e+09,1,4,Q
148,STCHN12407199401,M1000M,000100-,-2.208989e+09,2,8,Q
149,STKOR13110199801,M1000M,000100-,0.000000e+00,3,13,ADV
150,STITA11507199901,M1000M,000100-,0.000000e+00,4,13,PEN


In [38]:
sfnl

Unnamed: 0,id,game_type,group,time,inGroup_rank,game_rank,qualified
0,STCHN10306199701,M1000M,000100-,-2.208989e+09,1,5,QA
1,STCHN10402200101,M1000M,000100-,-2.208989e+09,2,6,QA
2,STTUR10603200201,M1000M,000100-,-2.208989e+09,3,7,QB
3,STKOR10507199901,M1000M,000100-,0.000000e+00,4,9,PEN
4,STKOR13110199801,M1000M,000100-,0.000000e+00,5,9,DNS
...,...,...,...,...,...,...,...
79,STNED22509199701,W500M,000200-,-2.208989e+09,1,2,QA
80,STCHN20408199901,W500M,000200-,-2.208989e+09,2,6,QA
81,STHUN23112199801,W500M,000200-,-2.208989e+09,3,7,QB
82,STBEL22610199601,W500M,000200-,-2.208989e+09,4,9,ADVA


In [39]:
fnl

Unnamed: 0,id,game_type,group,time,inGroup_rank,game_rank
0,STNED22509199701,W1000M,-A00100,-2.208989e+09,1,1
1,STKOR20909199801,W1000M,-A00100,-2.208989e+09,2,2
2,STBEL22610199601,W1000M,-A00100,-2.208989e+09,3,3
3,STUSA20211199401,W1000M,-A00100,-2.208989e+09,4,4
4,STITA21404199001,W1000M,-A00100,0.000000e+00,5,5
...,...,...,...,...,...,...
60,STCHN12407199401,M1000M,-A00100,-2.208989e+09,4,4
61,STHUN12011199501,M1000M,-A00100,0.000000e+00,5,5
62,STNED11306199401,M1000M,-B00100,-2.208989e+09,1,6
63,STTUR10603200201,M1000M,-B00100,-2.208989e+09,2,7


In [40]:
# based on the schema and get all the tables
relay_qfnl = pd.DataFrame(relay_df[relay_df['level'] == 'QFNL'][['country', 'game_type', 'group', 'timestamp', 'rank', 'rank_by_game', 'qualified']].reset_index(drop = True))
relay_sfnl = pd.DataFrame(relay_df[relay_df['level'] == 'SFNL'][['country', 'game_type', 'group', 'timestamp', 'rank', 'rank_by_game', 'qualified']].reset_index(drop = True))
relay_fnl = pd.DataFrame(relay_df[relay_df['level'] == 'FNL'][['country', 'game_type', 'group', 'timestamp', 'rank', 'rank_by_game']].reset_index(drop = True))
make_checkpoint()

In [41]:
# change the columns name
relay_qfnl.columns = ['country_code', 'game_type', 'group', 'time', 'inGroup_rank', 'game_rank', 'qualified']
relay_sfnl.columns = ['country_code', 'game_type', 'group', 'time', 'inGroup_rank', 'game_rank', 'qualified']
relay_fnl.columns = ['country_code', 'game_type', 'group', 'time', 'inGroup_rank', 'game_rank']
make_checkpoint()

In [42]:
relay_qfnl

Unnamed: 0,country_code,game_type,group,time,inGroup_rank,game_rank,qualified
0,CHN,XRELAY4,100,-2208989000.0,1,3,Q
1,ITA,XRELAY4,100,-2208989000.0,2,4,Q
2,KOR,XRELAY4,100,-2208989000.0,3,10,
3,POL,XRELAY4,100,-2208989000.0,4,11,
4,NED,XRELAY4,200,-2208989000.0,1,1,Q
5,CAN,XRELAY4,200,-2208989000.0,2,2,Q
6,KAZ,XRELAY4,200,-2208989000.0,3,9,q
7,FRA,XRELAY4,200,-2208989000.0,4,12,
8,HUN,XRELAY4,300,-2208989000.0,1,5,Q
9,RUS,XRELAY4,300,-2208989000.0,2,6,Q


In [43]:
relay_sfnl

Unnamed: 0,country_code,game_type,group,time,inGroup_rank,game_rank,qualified
0,KOR,M5000MRY4,200,-2208988000.0,1,1,QA
1,RUS,M5000MRY4,200,-2208988000.0,2,2,QA
2,NED,M5000MRY4,200,-2208988000.0,3,3,QB
3,HUN,M5000MRY4,200,-2208988000.0,4,7,QB
4,NED,W3000MRY4,100,-2208989000.0,1,1,QA
5,CHN,W3000MRY4,100,-2208989000.0,2,2,QA
6,POL,W3000MRY4,100,-2208989000.0,3,7,QB
7,ITA,W3000MRY4,100,-2208989000.0,4,8,QB
8,CAN,W3000MRY4,200,-2208989000.0,1,3,QA
9,KOR,W3000MRY4,200,-2208989000.0,2,4,QA


In [44]:
relay_fnl

Unnamed: 0,country_code,game_type,group,time,inGroup_rank,game_rank
0,NED,XRELAY4,-B0010,-2208989000.0,1,5
1,KAZ,XRELAY4,-B0010,-2208989000.0,2,6
2,CAN,M5000MRY4,-A0010,-2208988000.0,1,1
3,KOR,M5000MRY4,-A0010,-2208988000.0,2,2
4,ITA,M5000MRY4,-A0010,-2208988000.0,3,3
5,RUS,M5000MRY4,-A0010,-2208988000.0,4,4
6,CHN,M5000MRY4,-A0010,-2208988000.0,5,5
7,HUN,M5000MRY4,-B0010,-2208988000.0,1,6
8,NED,M5000MRY4,-B0010,-2208988000.0,2,7
9,JPN,M5000MRY4,-B0010,-2208988000.0,3,8


In [45]:
# get the information needed for athlete information
athlete = pd.DataFrame(athlete_info_df[['id','name', 'country', 'birth_year', 'age_category', 'club']])
make_checkpoint()

In [46]:
# change the columns
athlete.columns = ['id', 'name', 'country_code', 'birth_year', 'age_category', 'club']
make_checkpoint()

In [47]:
# change the data type into int type
athlete.birth_year = athlete.birth_year.astype(np.int64)

In [48]:
# convert all the data frame into spark data frame
heat_df = spark.createDataFrame(heat)
qfnl_df = spark.createDataFrame(qfnl)
sfnl_df = spark.createDataFrame(sfnl)
fnl_df = spark.createDataFrame(fnl)
relay_qfnl_df = spark.createDataFrame(relay_qfnl)
relay_sfnl_df = spark.createDataFrame(relay_sfnl)
relay_fnl_df = spark.createDataFrame(relay_fnl)
countries_df = spark.createDataFrame(countries)
athlete_df = spark.createDataFrame(athlete)
make_checkpoint()

In [49]:
heat_df.printSchema()
qfnl_df.printSchema()
sfnl_df.printSchema()
fnl_df.printSchema()
relay_qfnl_df.printSchema()
relay_sfnl_df.printSchema()
relay_fnl_df.printSchema()
countries_df.printSchema()
athlete_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- game_type: string (nullable = true)
 |-- group: string (nullable = true)
 |-- time: double (nullable = true)
 |-- inGroup_rank: string (nullable = true)
 |-- game_rank: long (nullable = true)
 |-- qualified: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- game_type: string (nullable = true)
 |-- group: string (nullable = true)
 |-- time: double (nullable = true)
 |-- inGroup_rank: string (nullable = true)
 |-- game_rank: long (nullable = true)
 |-- qualified: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- game_type: string (nullable = true)
 |-- group: string (nullable = true)
 |-- time: double (nullable = true)
 |-- inGroup_rank: string (nullable = true)
 |-- game_rank: long (nullable = true)
 |-- qualified: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- game_type: string (nullable = true)
 |-- group: string (nullable = true)
 |-- time: double (nullable = true)
 |-- inGroup_

In [50]:
def change_data_type(df, col, type_):
    '''
    function to change the data type for spark data frame
    '''
    return df.withColumn(col,  df[col].cast(type_))

In [51]:
athlete_df = change_data_type(athlete_df, 'birth_year', 'int')
heat_df = change_data_type(heat_df, 'time', 'timestamp')
heat_df = change_data_type(heat_df, 'inGroup_rank', 'int')
heat_df = change_data_type(heat_df, 'game_rank', 'int')
make_checkpoint()

qfnl_df = change_data_type(qfnl_df, 'time', 'timestamp')
qfnl_df = change_data_type(qfnl_df, 'inGroup_rank', 'int')
qfnl_df = change_data_type(qfnl_df, 'game_rank', 'int')
make_checkpoint()

sfnl_df = change_data_type(sfnl_df, 'time', 'timestamp')
sfnl_df = change_data_type(sfnl_df, 'inGroup_rank', 'int')
sfnl_df = change_data_type(sfnl_df, 'game_rank', 'int')
make_checkpoint()

fnl_df = change_data_type(fnl_df, 'time', 'timestamp')
fnl_df = change_data_type(fnl_df, 'inGroup_rank', 'int')
fnl_df = change_data_type(fnl_df, 'game_rank', 'int')
make_checkpoint()

relay_qfnl_df = change_data_type(relay_qfnl_df, 'time', 'timestamp')
relay_qfnl_df = change_data_type(relay_qfnl_df, 'inGroup_rank', 'int')
relay_qfnl_df = change_data_type(relay_qfnl_df, 'game_rank', 'int')
make_checkpoint()

relay_sfnl_df = change_data_type(relay_sfnl_df, 'time', 'timestamp')
relay_sfnl_df = change_data_type(relay_sfnl_df, 'inGroup_rank', 'int')
relay_sfnl_df = change_data_type(relay_sfnl_df, 'game_rank', 'int')
make_checkpoint()

relay_fnl_df = change_data_type(relay_fnl_df, 'time', 'timestamp')
relay_fnl_df = change_data_type(relay_fnl_df, 'inGroup_rank', 'int')
relay_fnl_df = change_data_type(relay_fnl_df, 'game_rank', 'int')
make_checkpoint()

In [52]:
heat_df.printSchema()
qfnl_df.printSchema()
sfnl_df.printSchema()
fnl_df.printSchema()
relay_qfnl_df.printSchema()
relay_sfnl_df.printSchema()
relay_fnl_df.printSchema()
countries_df.printSchema()
athlete_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- game_type: string (nullable = true)
 |-- group: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- inGroup_rank: integer (nullable = true)
 |-- game_rank: integer (nullable = true)
 |-- qualified: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- game_type: string (nullable = true)
 |-- group: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- inGroup_rank: integer (nullable = true)
 |-- game_rank: integer (nullable = true)
 |-- qualified: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- game_type: string (nullable = true)
 |-- group: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- inGroup_rank: integer (nullable = true)
 |-- game_rank: integer (nullable = true)
 |-- qualified: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- game_type: string (nullable = true)
 |-- group: string (nullable = true)
 |-- time: timestamp (nullab

In [54]:
# convert all data frames into parquet files
heat_df.write.parquet("/project/DataEngineering/parquet_files/heat.parquet", mode = 'overwrite')
qfnl_df.write.parquet("/project/DataEngineering/parquet_files/qfnl.parquet", mode = 'overwrite')
sfnl_df.write.parquet("/project/DataEngineering/parquet_files/sfnl.parquet", mode = 'overwrite')
fnl_df.write.parquet("/project/DataEngineering/parquet_files/fnl.parquet", mode = 'overwrite')
relay_qfnl_df.write.parquet("/project/DataEngineering/parquet_files/relay_qfnl.parquet", mode = 'overwrite')
relay_sfnl_df.write.parquet("/project/DataEngineering/parquet_files/relay_sfnl.parquet", mode = 'overwrite')
relay_fnl_df.write.parquet("/project/DataEngineering/parquet_files/relay_fnl.parquet", mode = 'overwrite')
countries_df.write.parquet("/project/DataEngineering/parquet_files/countries.parquet", mode = 'overwrite')
athlete_df.write.parquet("/project/DataEngineering/parquet_files/athlete.parquet", mode = 'overwrite')
make_checkpoint()