In [78]:
from config import views
from config import views as load_views
from spark import createSession

from typing import List

from pyspark.sql.dataframe import DataFrame

import pyspark.sql.functions as F
import pyspark.sql.types as T

from IPython.display import display

In [79]:
def get_columns_of_type(data_frame: DataFrame, type: str) -> List[str]:
    """Return the names of the columns whose dtype string equals `type`.

    Reads `data_frame.dtypes` as a sequence of (name, dtype) entries and keeps,
    in their original order, the names whose dtype compares equal to `type`.

    Args:
        data_frame: Frame whose `dtypes` attribute is inspected.
        type: Dtype string to match exactly (e.g. 'boolean').

    Returns:
        The matching column names; empty when nothing matches.
    """
    matching: List[str] = []
    for entry in data_frame.dtypes:
        # entry[0] is the column name, entry[1] its dtype string.
        if entry[1] == type:
            matching.append(entry[0])
    return matching

In [80]:
LENGTH = 80
def show_table_name(table: str) -> None:
    """Print `table` upper-cased and centred between two rules of '=' characters.

    Both rules are LENGTH characters wide. The title is left-padded with
    (LENGTH - len(title)) // 2 spaces; a title wider than LENGTH gets no padding.

    Args:
        table: Name to show in the banner.
    """
    rule = '=' * LENGTH
    title = table.upper()
    # Build the padded line as ONE string. Passing the padding and the title as
    # separate print() arguments inserts print's separator space between them,
    # which pushed the title one column to the right of centre.
    padding = ' ' * ((LENGTH - len(title)) // 2)
    print(rule)
    print(padding + title)
    print(rule)

def show_column_name(column: str) -> None:
    """Print `column` in upper case on a line of its own.

    Args:
        column: Column name to show as a heading.
    """
    heading = column.upper()
    print(heading)

In [81]:
VERSION = 'v1'

# Call the loader through its `load_views` alias. The original
# `views = views(VERSION)` replaced the imported callable with its own result,
# so re-running this cell in a live kernel called that result instead of the
# loader. `views` is still bound to the result because later cells read it.
views = load_views(VERSION)
spark = createSession()

# Register every source as a temp view under its key in `views`.
# presumably `views` maps view name -> JSON path; verify against config.views.
for view, file in views.items():
    df = spark.read.json(file)
    # Cast each boolean column to an integer column before registering the view.
    for column in get_columns_of_type(df, 'boolean'):
        df = df.withColumn(column, F.col(column).cast(T.IntegerType()))
    df.createOrReplaceTempView(view)

                                                                                

In [82]:
# (view name, DataFrame selecting every row of that view), one pair per key of `views`.
DATA_FRAMES = [
    (view, spark.sql(f"SELECT * FROM {view}"))
    for view in views.keys()
]

In [83]:
# Show every view under its banner. toPandas() collects the view's rows into a
# pandas frame, which display() then renders.
for view, df in DATA_FRAMES:
    show_table_name(view)
    pandas_frame = df.toPandas()
    display(pandas_frame)

                                     ARTISTS


Unnamed: 0,genres,id,name
0,"[filmi, indian folk, indian rock, kannada pop]",72578usTM6Cj5qWsi471Nc,Raghu Dixit
1,"[desi pop, hindi indie, indian indie, indian r...",7b6Ui7JVaBDEfZB9k6nHL0,The Local Train
2,[indian folk],4bvGDTEPFnllKiJaEZGuXk,Achint
3,"[opm, pinoy hip hop, pinoy r&b, pinoy trap, ta...",0n4a5imdLBN24fIrBWoqrv,Because
4,"[hindi indie, indian indie, indian singer-song...",4gdMJYnopf2nEUcanAwstx,Anuv Jain
...,...,...,...
27519,[italian hip hop],2My6j5BEgOi8VHi5WGVyfw,Apocalypshit Army
27520,[belgian pop],0bzW9kGcTyMxXuG9dUdj7E,GRANDGEORGE
27521,[thai indie],4iS19hLpsgRd8jLPKI4Ni3,Blissonic
27522,[thai indie],3JGC3LkYrwlrTscixVwY72,พราว


                                     SESSIONS


Unnamed: 0,event_type,session_id,timestamp,track_id,user_id
0,PLAY,124,2022-04-19T10:14:08,2FPjk7EjEHD4qgLSSnsWEL,101.0
1,PLAY,124,2022-04-19T10:18:17.973000,6yUmkCTAkHECG4btrYw3cM,101.0
2,SKIP,124,2022-04-19T10:19:03.410000,6yUmkCTAkHECG4btrYw3cM,101.0
3,ADVERTISEMENT,124,2022-04-19T10:19:03.410000,,101.0
4,BUY_PREMIUM,124,2022-04-19T10:19:20.410000,,
...,...,...,...,...,...
3800,PLAY,794,2023-02-19T04:44:21.306000,1A9dNiCCSsiDkIj1RsqMvL,150.0
3801,PLAY,794,2023-02-19T04:47:51.453000,11lkONbH7vsMZEVy012slM,150.0
3802,PLAY,794,2023-02-19T04:51:02.119000,2aMsfiqLC8bMHT6FrrGWY4,150.0
3803,SKIP,794,2023-02-19T04:52:58.495000,2aMsfiqLC8bMHT6FrrGWY4,


                                  TRACK_STORAGE


Unnamed: 0,daily_cost,storage_class,track_id
0,0.003752,SLOW,708ZiYL3ydBWHS2a7gvJB3
1,0.014561,SLOW,48SFtLr5URCI97X2Ynfdnc
2,0.008304,SLOW,1y0U0HAe5QfTRzOsz74bOt
3,0.012207,SLOW,2TlbZ8JhF9ORa7lJylxABw
4,0.011799,SLOW,7ij5kN8jwXr8fZD54M0xb6
...,...,...,...
129643,0.010739,SLOW,59nszNIEDpnOS0prsKudPb
129644,0.010683,SLOW,0xiHNGGiSfrFfOJZGpxpJY
129645,0.010423,SLOW,4peXvhLT61oP9leXdPQ36B
129646,0.010741,SLOW,2pS2IdtMXpvaEONreUlSAo


                                      TRACKS


                                                                                

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,id,id_artist,instrumentalness,key,liveness,loudness,name,popularity,release_date,speechiness,tempo,valence
0,0.8390,0.740,75040,0.891,0,,,0.000000,7,0.8690,-7.480,031 - Der Schatz im Silbersee I - Teil 39,13.0,1968-09-11,0.8920,51.496,0.557
1,0.6950,0.603,291227,0.517,0,48SFtLr5URCI97X2Ynfdnc,2yTUYhIf8fxptTIy3KLuJD,0.000003,6,0.7440,-8.504,Par Avion (Live) ( 2014 - Remaster) - Live; 20...,0.0,2014,0.0235,96.181,0.327
2,0.9530,0.313,166080,0.116,0,1y0U0HAe5QfTRzOsz74bOt,338mC0yGyX0C9of8QMJ5hK,0.331000,0,0.1610,-12.645,My Foolish Heart,25.0,1950-01-01,0.0319,74.071,0.255
3,0.1670,0.958,244133,0.635,0,2TlbZ8JhF9ORa7lJylxABw,5A4ExW2nMBFRy2JDoYUcUE,0.000000,11,0.3620,-7.853,Kathysterisi,14.0,1998,0.2590,108.024,0.866
4,0.1200,0.684,235974,0.839,0,7ij5kN8jwXr8fZD54M0xb6,48CUA59SDed3IdCctKndud,0.000000,4,0.3540,-6.457,Aleni Aleni,51.0,2015,0.0658,128.051,0.580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129643,0.4110,0.633,214773,0.345,0,59nszNIEDpnOS0prsKudPb,6wcIBaOvA9XNGgPujYZZ7L,0.000028,4,0.3610,-15.231,最真的夢,16.0,1990-02-05,0.0291,132.691,0.368
129644,0.2220,0.295,213667,0.417,0,0xiHNGGiSfrFfOJZGpxpJY,04u3fc37nHFKN7GJTSIwI8,0.000006,6,0.1480,-8.002,By My Side,61.0,2017-08-11,0.0307,64.687,0.135
129645,0.6720,0.347,208467,0.216,0,4peXvhLT61oP9leXdPQ36B,4etuCZVdP8yiNPn4xf0ie5,0.000118,8,0.0738,-15.215,Cu Cu Rru Cu Cu Paloma,49.0,1978,0.0315,108.566,0.478
129646,0.0229,0.784,214827,0.821,0,2pS2IdtMXpvaEONreUlSAo,6IE6z7DcZIT4Ml3Fh5Ivch,0.000007,0,0.1760,-7.621,No Quiero Saber - 2000 Mix,26.0,1990,0.0423,119.609,0.885


                                      USERS


Unnamed: 0,city,favourite_genres,id,name,premium_user,street,user_id
0,Warszawa,"[motown, soul, regional mexican]",,Marika Pilipczuk,1.0,ul. Księżycowa 31,101
1,Gdynia,"[regional mexican, psychedelic rock, new roman...",,Anita Pioch,1.0,plac Sadowa 527,102
2,Kraków,"[soul, mellow gold, blues rock]",,Jan Gryga,1.0,plac Wyspiańskiego 73/43,103
3,Wrocław,"[permanent wave, post-teen pop, mandopop]",,Ksawery Klus,1.0,ulica Długosza 71/06,104
4,Gdynia,"[metal, new wave, argentine rock]",,Maciej Bandyk,1.0,ul. Rybacka 07,105
5,Kraków,,,Nikodem Kopciuch,1.0,pl. Promienna 59/43,106
6,Poznań,"[europop, folk, tropical]",,Kacper Osojca,1.0,pl. Staszica 343,107
7,Warszawa,"[new wave, psychedelic rock, soft rock]",,Maurycy Szoka,1.0,al. Tęczowa 332,108
8,Szczecin,"[roots rock, latin pop, alternative metal]",,Sebastian Molka,1.0,al. Armii Krajowej 564,109
9,Kraków,"[lounge, hoerspiel, album rock]",,Filip Kalinka,1.0,aleja Bema 889,110


In [84]:
# Profile every column of every view: value frequencies, then the distinct count.
for view, data_frame in DATA_FRAMES:
    show_table_name(view)
    # Iterate the column names only. Unpacking `dtypes` into `column, type` at
    # cell level rebound the builtin `type` to a string for the rest of the
    # kernel session, and the dtype was never used in this loop.
    for column in data_frame.columns:
        show_column_name(column)
        # One row per value with its row count: the NULL group first, then by
        # descending count, ties broken by the value itself.
        group_by_column = f"""--sql
            SELECT
                {column},
                COUNT(*) AS length
            FROM {view}
            GROUP BY {column}
            ORDER BY {column} IS NULL DESC, length DESC, {column} NULLS FIRST
        """
        df = spark.sql(group_by_column)
        display(df.toPandas())

        # Number of distinct values in the column.
        count_distinct = f"""--sql
            SELECT
                COUNT(DISTINCT {column})
            FROM {view}
        """
        df = spark.sql(count_distinct)
        display(df.toPandas())

                                     ARTISTS
GENRES


Unnamed: 0,genres,length
0,,1352
1,[indonesian pop],74
2,[classic thai pop],68
3,[thai pop],60
4,[classic turkish pop],57
...,...,...
13066,[yiddish folk],1
13067,[yoga],1
13068,[yugoslav new wave],1
13069,[zhongguo feng],1


Unnamed: 0,count(DISTINCT genres)
0,13070


ID


Unnamed: 0,id,length
0,-1,1371
1,0001wHqxbF2YYRQxGdbyER,1
2,000p4jMMhpEHq1h6PFCyO1,1
3,001aJOc7CSQVo3XzoLG4DK,1
4,0027wHZDQXpRll4ckwDGad,1
...,...,...
26149,7zup4xIPjtv50lM7x3n4qW,1
26150,7zw8gWmNncuk2QZHIc70So,1
26151,7zwF847GE2hY5ApGSOLmBG,1
26152,7zwiFdY90oXzLh1Wz22oEq,1


Unnamed: 0,count(DISTINCT id)
0,26154


NAME


Unnamed: 0,name,length
0,TNT,4
1,Kali,3
2,Sebastian,3
3,Akcent,2
4,Alice,2
...,...,...
27411,黃韻玲,1
27412,黑豹,1
27413,龍飄飄,1
27414,龔秋霞,1


Unnamed: 0,count(DISTINCT name)
0,27416


                                     SESSIONS
EVENT_TYPE


Unnamed: 0,event_type,length
0,,181
1,PLAY,2157
2,SKIP,795
3,LIKE,595
4,BUY_PREMIUM,49
5,ADVERTISEMENT,28


Unnamed: 0,count(DISTINCT event_type)
0,5


SESSION_ID


Unnamed: 0,session_id,length
0,304,37
1,302,31
2,326,29
3,679,29
4,721,29
...,...,...
617,773,1
618,782,1
619,786,1
620,791,1


Unnamed: 0,count(DISTINCT session_id)
0,622


TIMESTAMP


Unnamed: 0,timestamp,length
0,2022-03-28T18:05:22.260000,2
1,2022-03-28T18:06:58.255000,2
2,2022-03-28T18:26:22.362000,2
3,2022-03-29T15:45:55.903000,2
4,2022-03-31T03:28:25.326000,2
...,...,...
2972,2023-03-28T07:17:20.214000,1
2973,2023-03-28T07:18:17.652000,1
2974,2023-03-28T07:19:51.547000,1
2975,2023-03-28T07:27:49.972000,1


Unnamed: 0,count(DISTINCT timestamp)
0,2977


TRACK_ID


Unnamed: 0,track_id,length
0,,200
1,,76
2,18mSX3KXGDCkrHDT5gmZTY,5
3,25iHbm3dv9BYhW7sQWbMg9,5
4,4qCYYhzI5bCz7JxV7VD4HH,5
...,...,...
2185,7y5x64GInqHxe7e2LXOfay,1
2186,7yVrrN3wUi6xKOsMjddCic,1
2187,7ycMIXNZZoMgtsZAGK5QEw,1
2188,7z9Ez0NdQESqdAjpvdpIcW,1


Unnamed: 0,count(DISTINCT track_id)
0,2189


USER_ID


Unnamed: 0,user_id,length
0,,183
1,147.0,151
2,106.0,141
3,114.0,138
4,120.0,125
5,149.0,124
6,141.0,118
7,143.0,113
8,119.0,112
9,130.0,103


Unnamed: 0,count(DISTINCT user_id)
0,50


                                  TRACK_STORAGE
DAILY_COST


Unnamed: 0,daily_cost,length
0,0.009600,44
1,0.011700,41
2,0.008000,39
3,0.010000,39
4,0.010800,38
...,...,...
47433,0.229282,1
47434,0.236263,1
47435,0.239629,1
47436,0.239863,1


Unnamed: 0,count(DISTINCT daily_cost)
0,47438


STORAGE_CLASS


Unnamed: 0,storage_class,length
0,SLOW,128369
1,MEDIUM,1275
2,FAST,4


Unnamed: 0,count(DISTINCT storage_class)
0,3


TRACK_ID


Unnamed: 0,track_id,length
0,000jBcNljWTnyjB4YO7ojf,1
1,000u1dTg7y1XCDXi80hbBX,1
2,0017A6SJgTbfQVU2EtsPNo,1
3,001UI3J6PKAEnBgqrwGGQC,1
4,001gx41rQo0bKh063TrC1I,1
...,...,...
129643,7zyVHUVIXfHbsYdJeKzuOn,1
129644,7zyfs9tucIQUDqU9XFyePO,1
129645,7zysi0YkDNAzlKWpBgTowz,1
129646,7zzZmpw8L66ZPjH1M6qmOs,1


Unnamed: 0,count(DISTINCT track_id)
0,129648


                                      TRACKS
ACOUSTICNESS


Unnamed: 0,acousticness,length
0,0.99500,525
1,0.99400,426
2,0.99300,355
3,0.99200,317
4,0.99100,312
...,...,...
4535,0.00853,1
4536,0.00868,1
4537,0.00926,1
4538,0.00960,1


Unnamed: 0,count(DISTINCT acousticness)
0,4540


DANCEABILITY


Unnamed: 0,danceability,length
0,0.629,359
1,0.565,350
2,0.549,348
3,0.652,348
4,0.611,345
...,...,...
1023,0.980,1
1024,0.982,1
1025,0.984,1
1026,0.985,1


Unnamed: 0,count(DISTINCT danceability)
0,1028


DURATION_MS


Unnamed: 0,duration_ms,length
0,192000,44
1,234000,41
2,160000,39
3,200000,39
4,224000,39
...,...,...
46735,4585640,1
46736,4725264,1
46737,4792587,1
46738,4797258,1


Unnamed: 0,count(DISTINCT duration_ms)
0,46740


ENERGY


Unnamed: 0,energy,length
0,0.5380,230
1,0.4990,227
2,0.6340,217
3,0.4840,212
4,0.7160,211
...,...,...
1873,0.0920,1
1874,0.0957,1
1875,0.0960,1
1876,0.0987,1


Unnamed: 0,count(DISTINCT energy)
0,1878


EXPLICIT


Unnamed: 0,explicit,length
0,0,124929
1,1,4719


Unnamed: 0,count(DISTINCT explicit)
0,2


ID


Unnamed: 0,id,length
0,,6530
1,000jBcNljWTnyjB4YO7ojf,1
2,000u1dTg7y1XCDXi80hbBX,1
3,0017A6SJgTbfQVU2EtsPNo,1
4,001UI3J6PKAEnBgqrwGGQC,1
...,...,...
123114,7zyVHUVIXfHbsYdJeKzuOn,1
123115,7zyfs9tucIQUDqU9XFyePO,1
123116,7zysi0YkDNAzlKWpBgTowz,1
123117,7zzZmpw8L66ZPjH1M6qmOs,1


Unnamed: 0,count(DISTINCT id)
0,123118


ID_ARTIST


Unnamed: 0,id_artist,length
0,,6504
1,3meJIgRw7YleJrmbpbJK6S,1057
2,0i38tQX5j4gZ0KS3eCMoIl,549
3,1l6d0RIxTL3JytlLGvWzYe,446
4,3t2iKODSDyzoDJw7AsD99u,437
...,...,...
26861,7zjX652bWyemXyFFVhBnch,1
26862,7zlWN2A8mV2thjdvAyMrEJ,1
26863,7zmk5lkmCMVvfvwF3H8FWC,1
26864,7zpw4vmlZNCUlwbdnFwxwO,1


Unnamed: 0,count(DISTINCT id_artist)
0,26865


INSTRUMENTALNESS


Unnamed: 0,instrumentalness,length
0,0.000000,46190
1,0.000010,83
2,0.897000,74
3,0.000012,73
4,0.000104,72
...,...,...
5392,0.099100,1
5393,0.099900,1
5394,0.993000,1
5395,0.994000,1


Unnamed: 0,count(DISTINCT instrumentalness)
0,5397


KEY


Unnamed: 0,key,length
0,0,16686
1,7,16466
2,9,15219
3,2,15118
4,5,11655
5,4,11090
6,11,8781
7,1,8522
8,10,7921
9,8,7182


Unnamed: 0,count(DISTINCT key)
0,12


LIVENESS


Unnamed: 0,liveness,length
0,0.1110,1209
1,0.1080,1178
2,0.1100,1164
3,0.1070,1116
4,0.1090,1113
...,...,...
1735,0.0239,1
1736,0.0250,1
1737,0.0262,1
1738,0.0284,1


Unnamed: 0,count(DISTINCT liveness)
0,1740


LOUDNESS


Unnamed: 0,loudness,length
0,-8.026,36
1,-5.797,32
2,-7.679,28
3,-7.338,26
4,-12.502,25
...,...,...
20356,2.534,1
20357,2.639,1
20358,2.695,1
20359,3.273,1


Unnamed: 0,count(DISTINCT loudness)
0,20361


NAME


Unnamed: 0,name,length
0,,6547
1,Hold On,40
2,Home,21
3,Summertime,21
4,99 Year Blues,19
...,...,...
108888,중독,1
108889,천일동안 For Thousand Days,1
108890,"텐데...Timeless (Sung by JAEHYUN, DOYOUNG, TAEIL)",1
108891,한 남자,1


Unnamed: 0,count(DISTINCT name)
0,108892


POPULARITY


Unnamed: 0,popularity,length
0,,6469
1,0.0,4255
2,35.0,2919
3,36.0,2859
4,23.0,2839
...,...,...
91,89.0,2
92,91.0,1
93,92.0,1
94,97.0,1


Unnamed: 0,count(DISTINCT popularity)
0,95


RELEASE_DATE


Unnamed: 0,release_date,length
0,1998-01-01,750
1,1997-01-01,738
2,1998,720
3,1995,718
4,1996,692
...,...,...
14936,2021-03-23,1
14937,2021-03-27,1
14938,2021-03-28,1
14939,2021-04-03,1


Unnamed: 0,count(DISTINCT release_date)
0,14941


SPEECHINESS


Unnamed: 0,speechiness,length
0,0.0315,531
1,0.0312,514
2,0.0310,510
3,0.0308,502
4,0.0309,501
...,...,...
1632,0.8040,1
1633,0.8240,1
1634,0.8470,1
1635,0.9680,1


Unnamed: 0,count(DISTINCT speechiness)
0,1637


TEMPO


Unnamed: 0,tempo,length
0,0.000,48
1,139.980,29
2,119.996,22
3,127.997,22
4,130.022,22
...,...,...
70580,233.013,1
70581,236.134,1
70582,238.895,1
70583,239.906,1


Unnamed: 0,count(DISTINCT tempo)
0,70585


VALENCE


Unnamed: 0,valence,length
0,0.9610,614
1,0.9620,536
2,0.9630,469
3,0.9640,445
4,0.9600,387
...,...,...
1623,0.0888,1
1624,0.0891,1
1625,0.0919,1
1626,0.0939,1


Unnamed: 0,count(DISTINCT valence)
0,1628


                                      USERS
CITY


Unnamed: 0,city,length
0,Warszawa,13
1,Kraków,8
2,Radom,8
3,Gdynia,7
4,Poznań,6
5,Wrocław,6
6,Szczecin,2


Unnamed: 0,count(DISTINCT city)
0,7


FAVOURITE_GENRES


Unnamed: 0,favourite_genres,length
0,,1
1,"[adult standards, alternative metal, album rock]",1
2,"[adult standards, mpb, funk]",1
3,"[alternative rock, adult standards, pop]",1
4,"[alternative rock, alternative metal, vocal jazz]",1
5,"[alternative rock, permanent wave, latin pop]",1
6,"[blues rock, lounge, post-teen pop]",1
7,"[c-pop, motown, tropical]",1
8,"[classic rock, new romantic, latin alternative]",1
9,"[classic rock, pop rock, soft rock]",1


Unnamed: 0,count(DISTINCT favourite_genres)
0,49


ID


Unnamed: 0,id,length
0,,45
1,-1.0,5


Unnamed: 0,count(DISTINCT id)
0,1


NAME


Unnamed: 0,name,length
0,Adrianna Golak,1
1,Albert Brzeźniak,1
2,Andrzej Doktor,1
3,Anita Pioch,1
4,Anna Maria Ignatiuk,1
5,Arkadiusz Krzywoń,1
6,Bartek Garczyk,1
7,Blanka Szklarek,1
8,Borys Matula,1
9,Cezary Getka,1


Unnamed: 0,count(DISTINCT name)
0,50


PREMIUM_USER


Unnamed: 0,premium_user,length
0,,2
1,1.0,48


Unnamed: 0,count(DISTINCT premium_user)
0,1


STREET


Unnamed: 0,street,length
0,al. Armii Krajowej 564,1
1,al. Jesionowa 47,1
2,al. Konwaliowa 33,1
3,al. Okrzei 69,1
4,al. Podleśna 00,1
5,al. Szeroka 27/38,1
6,al. Tęczowa 332,1
7,aleja Bema 889,1
8,aleja Prusa 830,1
9,aleja Stolarska 554,1


Unnamed: 0,count(DISTINCT street)
0,50


USER_ID


Unnamed: 0,user_id,length
0,101,1
1,102,1
2,103,1
3,104,1
4,105,1
5,106,1
6,107,1
7,108,1
8,109,1
9,110,1


Unnamed: 0,count(DISTINCT user_id)
0,50


In [88]:
def aggregate_numeric_column(view: str, column: str) -> str:
    """Build a Spark SQL query that summarises one column of a view.

    The query yields a single row: the column's name as a string literal,
    followed by COUNT, MIN, MAX, AVG, SUM, SUM(DISTINCT), KURTOSIS, SKEWNESS,
    STDDEV, STDDEV_POP, VARIANCE and VAR_POP of the column, computed over the
    rows where the column is not NULL.

    Args:
        view: Name of the view to query; interpolated into the SQL as-is.
        column: Name of the column to aggregate; interpolated as-is.

    Returns:
        The SQL text, beginning with a `--sql` comment line.
    """
    # The name is emitted as a single-quoted literal. A double-quoted token is
    # parsed as an identifier instead of a string when
    # spark.sql.ansi.doubleQuotedIdentifiers is enabled, which would turn this
    # label into a column reference.
    return f"""--sql
            SELECT
                '{column}' AS name,
                COUNT({column}) AS count,
                MIN({column}) AS min,
                MAX({column}) AS max,
                AVG({column}) AS average,
                SUM({column}) AS sum,
                SUM(DISTINCT {column}) AS sumDistinct,
                KURTOSIS({column}) AS kurtosis,
                SKEWNESS({column}) AS skewness,
                STDDEV({column}) AS standard_deviation,
                STDDEV_POP({column}) AS population_standard_deviation,
                VARIANCE({column}) AS variance,
                VAR_POP({column}) AS population_variance
            FROM {view}
            WHERE {column} IS NOT NULL
        """

# Show the aggregate summary for every 'double' or 'bigint' column of every view.
for view, data_frame in DATA_FRAMES:
    show_table_name(view)
    # `dtype`, not `type`: a cell-level loop target named `type` rebinds the
    # builtin for the rest of the kernel session.
    for column, dtype in data_frame.dtypes:
        # NOTE(review): 'int' is not listed, so columns cast from boolean to
        # IntegerType when the views were loaded are skipped here - confirm
        # that is intended.
        if dtype in ('double', 'bigint'):
            show_column_name(column)
            df = spark.sql(aggregate_numeric_column(view, column))
            display(df.toPandas())

                                     ARTISTS
                                     SESSIONS
SESSION_ID


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,session_id,3805,124,794,461.138765,1754633,285854,-1.280354,0.018784,197.815899,197.789903,39131.130056,39120.845922


USER_ID


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,user_id,3622,101,150,126.139978,456879,6275,-1.331799,-0.042814,14.900369,14.898312,222.020997,221.959699


                                  TRACK_STORAGE
DAILY_COST


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,daily_cost,129648,0.000167,0.249754,0.011535,1495.508148,591.933795,259.234276,10.35695,0.005815,0.005815,3.4e-05,3.4e-05


                                      TRACKS
ACOUSTICNESS


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,acousticness,129648,0.0,0.996,0.41755,54134.576468,546.440307,-1.383039,0.250805,0.335652,0.335651,0.112662,0.112661


DANCEABILITY


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,danceability,129648,0.0,0.988,0.564894,73237.4093,491.2168,-0.258259,-0.28432,0.159114,0.159113,0.025317,0.025317


DURATION_MS


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,duration_ms,129648,3344,4995083,228526.632274,29628020821,11430854470,281.491889,10.884919,113801.507474,113801.068587,12950780000.0,12950680000.0


ENERGY


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,energy,129648,0.0,1.0,0.562776,72962.72439,543.752618,-0.899073,-0.168391,0.241957,0.241956,0.058543,0.058543


EXPLICIT


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,explicit,129648,0,1,0.036399,4719,1,22.511391,4.950898,0.18728,0.18728,0.035074,0.035074


INSTRUMENTALNESS


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,instrumentalness,129648,0.0,1.0,0.086754,11247.463381,549.236231,6.200105,2.759591,0.232285,0.232284,0.053956,0.053956


KEY


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,key,129648,0,11,5.242873,679728,66,-1.265013,-0.011349,3.518889,3.518876,12.382581,12.382485


LIVENESS


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,liveness,129648,0.0,0.999,0.21406,27752.50933,543.09323,4.380976,2.072202,0.186901,0.1869,0.034932,0.034932


LOUDNESS


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,loudness,129648,-60.0,4.362,-9.734177,-1262016.64,-252312.279,2.778514,-1.104693,4.5213,4.521283,20.442158,20.442


POPULARITY


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,popularity,123179,0,99,29.677981,3655704,4474,-0.481352,0.22448,17.129474,17.129405,293.418896,293.416514


SPEECHINESS


                                                                                

Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,speechiness,129648,0.0,0.969,0.095068,12325.3914,503.1898,16.456687,4.045176,0.166167,0.166166,0.027611,0.027611


TEMPO


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,tempo,129648,0.0,243.507,119.53864,15497950.0,8607442.191,-0.106043,0.402869,29.653393,29.653278,879.323707,879.316925


VALENCE


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,valence,129648,0.0,1.0,0.563443,73049.2694,537.05768,-1.035815,-0.154964,0.252581,0.25258,0.063797,0.063796


                                      USERS
ID


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,id,5,-1,-1,-1.0,-5,-1,,,0.0,0.0,0.0,0.0


USER_ID


Unnamed: 0,column,count,min,max,average,sum,sumDistinct,kurtosis,skewness,standard_deviation,population_standard_deviation,variance,population_variance
0,user_id,50,101,150,125.5,6275,6275,-1.20096,2.542155e-16,14.57738,14.43087,212.5,208.25


In [92]:
def explode_column(view: str, column: str) -> str:
    """Build a Spark SQL query listing the distinct elements of an array column.

    The query explodes `column`, keeps distinct elements under the alias
    `distinct_<column>`, and orders them with NULLs first.

    Args:
        view: Name of the view to query; interpolated into the SQL as-is.
        column: Name of the column to explode; interpolated as-is.

    Returns:
        The SQL text, beginning with a `--sql` comment line.
    """
    tab = " " * 4
    alias = f"distinct_{column}"
    # Assembled line by line; the indentation matches the original text exactly.
    return "\n".join((
        "--sql",
        f"{tab * 3}SELECT",
        f"{tab * 4}DISTINCT EXPLODE({column}) AS {alias}",
        f"{tab * 3}FROM {view}",
        f"{tab * 3}ORDER BY {alias} NULLS FIRST",
        tab * 2,
    ))


def count_exploded_column(view: str, column: str) -> str:
    """Build a Spark SQL query counting the distinct elements of an array column.

    An inner query explodes `column` and keeps distinct elements; the outer
    query counts its rows and returns the count as `length`.

    Args:
        view: Name of the view to query; interpolated into the SQL as-is.
        column: Name of the column to explode; interpolated as-is.

    Returns:
        The SQL text, beginning with a `--sql` comment line.
    """
    tab = " " * 4
    # Inner query: one row per distinct exploded element.
    distinct_elements = "\n".join((
        "--sql",
        f"{tab * 2}SELECT",
        f"{tab * 3}DISTINCT EXPLODE({column}) AS {column}",
        f"{tab * 2}FROM {view}",
        tab,
    ))
    # Outer query: count the rows of the inner query.
    return "\n".join((
        "--sql",
        f"{tab * 3}SELECT",
        f"{tab * 4}COUNT(*) AS length",
        f"{tab * 3}FROM ({distinct_elements})",
        tab * 2,
    ))

# For every array-typed column of every view, list its distinct elements and
# then show how many there are.
for view, data_frame in DATA_FRAMES:
    show_table_name(view)
    # `dtype`, not `type`: a cell-level loop target named `type` rebinds the
    # builtin for the rest of the kernel session.
    for column, dtype in data_frame.dtypes:
        if dtype.startswith('array'):
            show_column_name(column)
            df = spark.sql(explode_column(view, column))
            display(df.toPandas())
            df = spark.sql(count_exploded_column(view, column))
            display(df.toPandas())

                                     ARTISTS
GENRES


Unnamed: 0,distinct_genres
0,48g
1,a cappella
2,abstract
3,abstract hip hop
4,accordeon
...,...
3867,zolo
3868,zouglou
3869,zouk
3870,zouk riddim


Unnamed: 0,length
0,3872


                                     SESSIONS
                                  TRACK_STORAGE
                                      TRACKS
                                      USERS
FAVOURITE_GENRES


Unnamed: 0,distinct_favourite_genres
0,adult standards
1,album rock
2,alternative metal
3,alternative rock
4,argentine rock
5,art rock
6,blues rock
7,brill building pop
8,c-pop
9,classic rock


Unnamed: 0,length
0,46
