In [9]:
import sqlite3
import os
from deltalake import DeltaTable
from deltalake.writer import write_deltalake
import numpy as np
import pandas as pd
conn = sqlite3.connect("database.sqlite")


In [22]:
user_df = pd.read_sql_query("select * from user;", conn)
grade_df = pd.read_sql_query("select * from grade;", conn)
method_df = pd.read_sql_query("select * from method;", conn)
ascent_df = pd.read_sql_query("select * from ascent;", conn)

In [23]:
# Write bronze data
write_deltalake("climbing_project/data/bronze/user", user_df, mode="overwrite")
write_deltalake("climbing_project/data/bronze/grade", grade_df, mode="overwrite")
write_deltalake("climbing_project/data/bronze/method", method_df, mode="overwrite")
write_deltalake("climbing_project/data/bronze/ascent", ascent_df, mode="overwrite")

In [29]:
# Read bronze data
fact_user_bronze_df = DeltaTable("climbing_project/data/bronze/user").to_pandas()
fact_ascent_bronze_df = DeltaTable("climbing_project/data/bronze/ascent").to_pandas()
dim_grade_bronze_df = DeltaTable("climbing_project/data/bronze/grade").to_pandas()
dim_method_bronze_df = DeltaTable("climbing_project/data/bronze/method").to_pandas()


In [None]:
# Apply transformations for silver table

In [69]:
# METHOD rename column name to climb_type
dim_method_silver_df = dim_method_bronze_df.rename(columns={'shorthand':'climb_type'})

In [70]:
# GRADE
dim_grade_columns = ['id', 'score', 'fra_routes', 'fra_boulders']
dim_grade_silver_df = dim_grade_bronze_df[dim_grade_columns]

In [71]:
# USER
fact_user_silver_df = fact_user_bronze_df[fact_user_bronze_df['deactivated'] == 0]
user_silver_columns = ['id', 'country', 'sex', 'height', 'weight', 'started']
fact_user_silver_df = fact_user_silver_df[user_silver_columns]

In [90]:
# ASCENT Add column to convert unix time to timestamp, select relevant columns and filter to year between 1980 and 2017
fact_ascent_bronze_df['datetime'] = pd.to_datetime(fact_ascent_bronze_df['date'], unit='s')
ascent_silver_columns = ['id','user_id', 'grade_id', 'method_id', 'climb_type', 'year', 'chipped']
fact_ascent_silver_df = fact_ascent_bronze_df[ascent_silver_columns]
fact_ascent_silver_df = fact_ascent_silver_df[(fact_ascent_silver_df['year'] >= 1980) & (fact_ascent_silver_df['year'] <= 2017)]

# Join to grade and method tables
fact_ascent_silver_grade_df = pd.merge(fact_ascent_silver_df, dim_grade_silver_df, left_on='grade_id', right_on='id',suffixes=('_ascent', '_grade'))
fact_ascent_silver_final_df = pd.merge(fact_ascent_silver_grade_df, dim_method_silver_df, left_on='method_id', right_on='id', suffixes=('_ascent', '_method'))
fact_ascent_silver_final_columns = ['id_ascent', 'user_id', 'year', 'chipped', 'score_ascent', 'fra_routes', 'fra_boulders', 'climb_type_method']
fact_ascent_silver_final_df = fact_ascent_silver_final_df.dropna(subset=['fra_routes'])
fact_ascent_silver_final_df = fact_ascent_silver_final_df[fact_ascent_silver_final_columns]

In [91]:
# Write to delta lake
write_deltalake("climbing_project/data/silver/user", fact_user_silver_df, mode="overwrite")
write_deltalake("climbing_project/data/silver/grade", dim_grade_silver_df, mode="overwrite")
write_deltalake("climbing_project/data/silver/method", dim_method_silver_df, mode="overwrite")
write_deltalake("climbing_project/data/silver/ascent", fact_ascent_silver_final_df, mode="overwrite")

In [92]:
# Read silver tables
fact_user_silver_df_2 = DeltaTable("climbing_project/data/silver/user").to_pandas()
fact_ascent_silver_df_2 = DeltaTable("climbing_project/data/silver/ascent").to_pandas()


In [83]:
fact_user_silver_df_2

Unnamed: 0,id,country,sex,height,weight,started,__index_level_0__
0,1,SWE,0,177,73,1996,0
1,2,SWE,0,0,0,2000,1
2,3,SWE,0,180,78,1995,2
3,4,SWE,1,165,58,2001,3
4,5,USA,0,0,0,1991,4
...,...,...,...,...,...,...,...
60028,67021,ESP,0,180,78,0,62588
60029,67022,FRA,0,185,68,2016,62589
60030,67023,USA,0,190,88,2001,62590
60031,67024,POL,1,0,0,0,62591


In [102]:
# Create gold tables

# Average height by grade
merged_df = fact_user_silver_df_2.merge(fact_ascent_silver_df_2, left_on='id', right_on='user_id')
avg_height_by_grade_year = merged_df.groupby(['fra_routes'])['height'].mean().reset_index()

# Users grouped y countries
users_grouped_by_country_df = fact_user_silver_df_2['country'].value_counts()

In [103]:
print(users_grouped_by_country_df.to_string())

country
USA      11471
ESP       6590
DEU       3505
SWE       3434
ITA       3380
FRA       3185
POL       3183
GBR       2596
BRA       1976
CAN       1834
AUT       1811
NOR       1727
CHE       1334
AUS       1141
ZAF       1121
RUS       1017
NLD        880
BEL        617
PRT        596
MEX        596
CZE        570
SVN        525
HRV        483
DNK        473
NZL        414
FIN        395
ROM        358
BGR        295
UKR        291
GRC        253
HUN        244
TUR        199
ARG        199
JPN        174
IRN        173
SVK        159
none       157
CHL        151
VEN        145
CHN        141
ISR        140
COL        105
SGP        101
IRL         96
            94
IND         89
ISL         82
SRB         74
KOR         74
AND         68
LTU         67
MSR         65
MYS         59
HKG         57
PER         52
LUX         51
ECU         46
TWN         43
MKD         43
PHL         38
FXX         37
THA         31
IDN         31
YUG         30
ZWE         30
REU         29
LV

In [104]:
fact_ascent_silver_df_2

Unnamed: 0,id_ascent,user_id,year,chipped,score_ascent,fra_routes,fra_boulders,climb_type_method
0,2,1,1999,0,400,6a,6A,onsight
1,3,1,1999,0,400,6a,6A,onsight
2,4,1,1999,0,400,6a,6A,onsight
3,5,1,1999,0,400,6a,6A,onsight
4,6,1,1999,0,400,6a,6A,onsight
...,...,...,...,...,...,...,...,...
4111773,4910370,59627,2017,0,650,6c+,6C+,redpoint
4111774,4910371,44075,2017,0,700,7a,7A,redpoint
4111775,4910376,19098,2017,0,750,7a+,7A+,onsight
4111776,4910379,19098,2017,0,900,7c,7C,onsight
