# Exploratory Data Analysis

## Setup

In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Show the kernel's current working directory.
# NOTE(review): the saved output exposes an absolute local path; consider clearing it before sharing.
os.getcwd()

'/Users/brianrice/dev/2021-msia423-rice-brian-project/notebooks/develop'

In [3]:
# S3 bucket URI and the object keys of the raw input and the cleaned output.
S3_BUCKET = "s3://2021-msia423-rice-brian"
S3_RAW_PATH = "raw/P4KxSpotify.csv"
S3_CLEANED_PATH = "cleaned/P4KxSpotify.csv"

# S3 URIs always use "/" as the separator, so join with it explicitly;
# os.path.join would insert the OS-specific separator (a backslash on Windows).
IN_PATH = "/".join([S3_BUCKET, S3_RAW_PATH])
OUT_PATH = "/".join([S3_BUCKET, S3_CLEANED_PATH])

## Data exploration

In [4]:
# Load the raw dataset from IN_PATH.
# NOTE(review): the artist literally named "NA" shows up as missing after loading
# (see the Artist section) — presumably due to pandas' default NA parsing; confirm.
data = pd.read_csv(IN_PATH)

In [5]:
# Preview the first 10 rows of the raw data
data.head(10)

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Studio 1,Studio 1,Andy Battaglia,8.5,2009.0,February 18 2009,Studio,Electronic,0.511917,0.499667,5.25,-5.626583,0.031983,0.724917,0.024493,0.165367,0.555083,101.395167
1,John Fahey,The Great Santa Barbara Oil Slick,Mark Richardson,8.2,2005.0,February 13 2005,Water,Folk/Country,0.369765,0.325412,4.470588,-19.153824,0.148624,0.647053,0.559133,0.527782,0.179465,107.622647
2,Reigning Sound,Too Much Guitar,Stephen M. Deusner,8.3,2004.0,August 19 2004,In the Red,Electronic,0.253943,0.912857,4.428571,-1.0895,0.0555,0.000253,0.751214,0.199071,0.552786,133.8955
3,The Red Thread,After the Last,Chris Dahlen,7.3,2003.0,July 17 2003,Badman,Rock,0.4254,0.433474,5.7,-12.871,0.02826,0.310325,0.224137,0.12515,0.4514,104.3542
4,Mac Miller,Swimming,Evan Rytlewski,7.5,2018.0,August 3 2018,Warner Bros.,Rap,0.624846,0.438154,4.153846,-9.456077,0.170246,0.652462,0.012819,0.121131,0.281138,122.121308
5,French Kicks,Swimming,Roque Strew,7.6,2008.0,August 27 2008,Vagrant,Electronic,0.624846,0.438154,4.153846,-9.456077,0.170246,0.652462,0.012819,0.121131,0.281138,122.121308
6,Jessica Lea Mayfield,With Blasphemy So Heartfelt,Stephen M. Deusner,8.2,2008.0,November 11 2008,Polymer,Rock,0.447583,0.399083,6.5,-8.402,0.032267,0.69475,0.004033,0.123142,0.237933,111.549917
7,The LK,The LK vs. the Snow,Joe Tangari,7.9,,March 14 2008,Kora,Rock,0.643273,0.714636,6.363636,-7.303182,0.048245,0.200055,0.008306,0.211927,0.704545,121.010546
8,The Essex Green,Essex Green EP,Matt LeMay,8.1,2000.0,May 31 2000,Parasol,Rock,0.729,0.932,5.0,-3.038,0.0623,0.00227,0.197,0.139,0.283,127.988
9,Grateful Dead,The Grateful Dead: 50th Anniversary Deluxe Edi...,Sam Sodomsky,6.9,2017.0,January 19 2017,Rhino,Rock,0.4294,0.429,4.75,-12.00925,0.048135,0.592355,0.300195,0.180605,0.503805,94.168


In [6]:
# Data type of each column
data.dtypes

artist               object
album                object
reviewauthor         object
score               float64
releaseyear         float64
reviewdate           object
recordlabel          object
genre                object
danceability        float64
energy              float64
key                 float64
loudness            float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
dtype: object

### Audio feature interpretation

[Source](https://developer.spotify.com/documentation/web-api/reference/)

- `danceability`
Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable. (Float)

- `energy`
Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy. (Float)

- `key`
The key the track is in. Integers map to pitches using standard Pitch Class notation. E.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on. (Integer)

- `loudness`
The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typically range between -60 and 0 dB. (Float)

- `acousticness`
A confidence measure from 0.0 to 1.0 of whether the track is acoustic. 1.0 represents high confidence the track is acoustic. (Float)

- `speechiness`
Speechiness detects the presence of spoken words in a track. The more exclusively speech-like the recording (e.g. talk show, audio book, poetry), the closer to 1.0 the attribute value. Values above 0.66 describe tracks that are probably made entirely of spoken words. Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. Values below 0.33 most likely represent music and other non-speech-like tracks. (Float)

- `instrumentalness`
Predicts whether a track contains no vocals. “Ooh” and “aah” sounds are treated as instrumental in this context. Rap or spoken word tracks are clearly “vocal”. The closer the instrumentalness value is to 1.0, the greater likelihood the track contains no vocal content. Values above 0.5 are intended to represent instrumental tracks, but confidence is higher as the value approaches 1.0. (Float)

- `liveness`
Detects the presence of an audience in the recording. Higher liveness values represent an increased probability that the track was performed live. A value above 0.8 provides strong likelihood that the track is live. (Float)

- `valence`
A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry). (Float)

- `tempo`
The overall estimated tempo of a track in beats per minute (BPM). In musical terminology, tempo is the speed or pace of a given piece and derives directly from the average beat duration. (Float)

In [7]:
# Summary statistics for the numeric columns.
# NOTE(review): the minimum of -1.0 in the audio-feature columns falls outside the
# 0.0-1.0 range described above — presumably a placeholder for missing values;
# confirm and handle before modeling.
data.describe()

Unnamed: 0,score,releaseyear,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,18403.0,18078.0,18403.0,18403.0,18403.0,18403.0,18403.0,18403.0,18403.0,18403.0,18403.0,18403.0
mean,7.033255,2009.315411,0.510635,0.600806,5.215836,-9.31839,0.091125,0.302895,0.278616,0.1965,0.404326,120.289697
std,1.25185,7.70871,0.159833,0.204257,1.619708,4.264038,0.092428,0.26373,0.30074,0.105458,0.183464,15.613327
min,0.0,1957.0,-1.0,-1.0,-1.0,-51.72875,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,6.5,2005.0,0.405784,0.471787,4.333333,-11.24835,0.040704,0.089842,0.01006,0.134153,0.2718,111.193425
50%,7.3,2010.0,0.510417,0.624364,5.230769,-8.492333,0.056808,0.229806,0.153433,0.17435,0.405313,120.367385
75%,7.8,2015.0,0.622,0.750586,6.1,-6.416397,0.100029,0.463416,0.506816,0.22889,0.532203,129.098775
max,10.0,2019.0,0.974,0.999,11.0,4.078,0.958,0.996,0.982,0.978,0.971,215.972


### Missing values

In [8]:
# Count missing values per column. The vectorized form avoids summing each
# column element by element with Python's builtin sum.
data.isna().sum()

artist                 1
album                  0
reviewauthor           0
score                  0
releaseyear          325
reviewdate             0
recordlabel           26
genre               1289
danceability           0
energy                 0
key                    0
loudness               0
speechiness            0
acousticness           0
instrumentalness       0
liveness               0
valence                0
tempo                  0
dtype: int64

#### Artist

In [9]:
# Show the row whose artist is missing
data.loc[data["artist"].isna()]

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
4720,,Cellar EP,Kevin Lozano,7.5,2016.0,February 29 2016,Fade to Mind,Electronic,0.769375,0.7285,8.75,-3.59075,0.184388,0.151625,0.2105,0.198875,0.686125,93.503375


The artist really is named "NA" here (ha).

In [10]:
# The missing artist is really the literal name "NA", so put that string back
data["artist"] = data["artist"].fillna("NA")

In [11]:
# Confirm no artist values are still missing (expect an empty frame)
data.loc[data["artist"].isna()]

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo


#### Release year

In [12]:
# Show the rows with no release year recorded
data.loc[data["releaseyear"].isna()]

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
7,The LK,The LK vs. the Snow,Joe Tangari,7.9,,March 14 2008,Kora,Rock,0.643273,0.714636,6.363636,-7.303182,0.048245,0.200055,0.008306,0.211927,0.704545,121.010546
10,The Anomoanon,The Anomoanon,Matt LeMay,7.7,,February 20 2001,Palace,Folk/Country,0.441600,0.391500,5.500000,-10.995200,0.033010,0.632400,0.255044,0.142400,0.375560,126.895400
21,Miles Benjamin Anthony Robinson,Miles Benjamin Anthony Robinson,Joshua Love,7.5,,July 16 2008,Say Hey,Rock,0.430500,0.467000,5.200000,-7.209900,0.033330,0.500300,0.002007,0.178710,0.295720,105.568200
109,Wagon Christ,Musipal,Spencer Owen,8.0,,March 6 2001,Ninja Tune,Electronic,0.677750,0.761500,6.083333,-6.392417,0.096008,0.118804,0.496151,0.275542,0.541417,130.025167
154,Nightmares on Wax,DJ Kicks,Paul Cooper,5.5,,October 3 2000,!K7,Electronic,0.661000,0.587471,7.058824,-15.248647,0.059094,0.100989,0.754471,0.201106,0.575941,136.011294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17843,Anne M√ºller,Heliopause,Andy Beta,6.9,,December 4 2019,none,Experimental,0.328267,0.185633,5.000000,-19.779333,0.050833,0.840667,0.901167,0.091167,0.095483,97.856333
17844,Beck,Hyperspace,Sam Sodomsky,6.5,,December 3 2019,none,Rock,0.541545,0.558818,6.000000,-8.057182,0.064418,0.349591,0.038845,0.126373,0.305391,109.050727
17846,Galcher Lustwerk,Information,Nathan Taylor Pemberton,7.9,,December 3 2019,none,Electronic,0.679667,0.712867,5.866667,-7.372800,0.057353,0.135023,0.393349,0.181960,0.554667,114.068133
17961,Floating Points,Crush,Shawn Reynaldo,8.3,,October 21 2019,none,Electronic,0.910000,0.401000,1.000000,-9.552000,0.085500,0.030200,0.000159,0.080100,0.824000,135.994000


In general, we can assume that the album was released in the same year as the Pitchfork review.

In [13]:
# Convert review date to datetime format; the raw values are written like
# "February 18 2009" (full month name, day, four-digit year)
data["reviewdate"] = pd.to_datetime(data["reviewdate"], format="%B %d %Y")

In [14]:
# Sanity-check the proposed imputation: among rows that do have a release
# year, how often does it equal the year of the review?
has_year = data["releaseyear"].notna()
same_year = data.loc[has_year, "releaseyear"] == data.loc[has_year, "reviewdate"].dt.year
print("Percentage of reviews where year of album release matches year of review: {:.2f}%".format(
    100 * np.mean(same_year)
))

Percentage of reviews where year of album release matches year of review: 88.57%


In [15]:
# Approximate each missing release year with the year of the review
# (fillna aligns on the index, so only the missing rows are touched)
data["releaseyear"] = data["releaseyear"].fillna(data["reviewdate"].dt.year)

In [16]:
# Confirm no release years are still missing (expect an empty frame)
data.loc[data["releaseyear"].isna()]

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo


In [17]:
# Keep only the calendar date (drop the time component).
# NOTE(review): .dt.date presumably leaves Python date objects (object dtype), so
# the .dt-based cells above would not re-run cleanly after this one — confirm.
data["reviewdate"] = data["reviewdate"].dt.date

#### Record label

In [18]:
# Show the rows with no record label recorded
data.loc[data["recordlabel"].isna()]

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
1359,Run the Jewels,Run the Jewels,Nate Patrin,8.5,2013.0,2013-07-01,,Rap,0.686643,0.664929,4.714286,-7.670929,0.234957,0.105507,0.000112,0.253371,0.423071,116.928714
1620,Main Attrakionz,808s and Dark Grapes III,Meaghan Garvey,7.4,2015.0,2015-06-26,,Rap,0.535417,0.700333,4.583333,-8.35025,0.1314,0.164275,0.025315,0.196775,0.450275,137.005833
1757,Lil Wayne,Dedication 2,Peter Macia,8.1,2006.0,2006-06-22,,Rap,0.775,0.554,1.0,-7.737,0.162,0.0508,0.0,0.0925,0.377,143.102
1946,Curren$y,The Drive In Theatre,Jonah Bromwich,7.6,2014.0,2014-03-14,,Rap,0.597267,0.7308,3.133333,-8.857733,0.26432,0.1485,0.000499,0.270127,0.504,109.3584
2382,Lot Six,Animals,Joe Tangari,7.4,2002.0,2002-12-02,,,0.526846,0.700923,5.461538,-6.036,0.038385,0.373499,0.085362,0.159431,0.739077,116.369385
2512,Joey Bada$$,1999,Felipe Delerme,8.0,2012.0,2012-06-26,,Rap,0.628,0.762667,5.2,-4.5722,0.277667,0.38426,0.030335,0.221333,0.730933,103.522067
4139,Rick Ross,Rich Forever,Jordan Sargent,8.2,2012.0,2012-01-13,,Rap,0.781583,0.565583,5.083333,-8.194417,0.32465,0.104936,0.0,0.166633,0.481417,127.001333
4359,A Hawk and a Hacksaw,Cervantine,Paul Thompson,7.8,2011.0,2011-03-09,,Rock,0.432488,0.430125,5.75,-10.760375,0.048475,0.582625,0.479797,0.226362,0.539225,127.611625
4636,Chief Keef,Back From the Dead,Jordan Sargent,7.9,2012.0,2012-04-12,,Rap,0.613167,0.7955,4.083333,-2.2715,0.066,0.050125,0.0,0.261267,0.3214,131.3565
4971,King Louie,Drilluminati,Miles Raymer,6.8,2012.0,2013-01-03,,Rap,0.698357,0.613357,6.5,-9.662643,0.193321,0.119746,0.002847,0.176471,0.348143,124.362429


Some of these can be filled in with a quick search (mostly from Wikipedia).

In [19]:
# Record labels looked up by hand, listed in the same order as the rows whose
# label is missing (checked against the raw data to match any existing names).
# Each entry is annotated with the album it belongs to.
manual_labels = [
    "Fool's Gold",          # Run the Jewels
    "Vapor",                # 808s and Dark Grapes III
    "101 Distribution",     # Dedication 2
    "Jet Life",             # The Drive In Theatre
    "Espo",                 # Animals
    "Cinematic",            # 1999
    "Def Jam",              # Rich Forever
    "LM Dupli-Cation",      # Cervantine
    "Glory Boyz",           # Back From the Dead
    "Epic",                 # Drilluminati
    "Self-released",        # Community Service 2!
    "Cash Money",           # Sorry 4 the Wait
    "Grand Hustle",         # Fuck a Mixtape
    "Vice",                 # Blue Chips
    "Free Bandz",           # 56 Nights
    "Six Shooter Records",  # Retribution
    "Self-released",        # Acid Rap
    "Maybach",              # Dreamchasers
    "Self-released",        # White Mystery
    "Top Dawg",             # Cilvia Demo
    "Triple X",             # Winter Hill
    "1017",                 # 1017 Thug
    "Rostrum",              # Kush and Orange Juice
    "BasedWorld",           # God's Father
    "10.Deep",              # The Mixtape About Nothing
    "Self-released",        # Coloring Book
]

# Key the looked-up labels by the index of the rows they belong to so they can
# be aligned back onto the frame.
missing_label_rows = data.index[data["recordlabel"].isna()]
fill_missing = pd.Series(manual_labels, index=missing_label_rows)

In [20]:
# Fill the missing record labels with the manually looked-up values
# (fillna aligns fill_missing on the index)
data["recordlabel"] = data["recordlabel"].fillna(fill_missing)

In [21]:
# Trim leading/trailing whitespace from record labels. The vectorized .str
# accessor passes missing values through instead of raising a TypeError the
# way apply(str.strip) does, so the missing-value check that follows can
# still surface any label that was left unfilled.
data["recordlabel"] = data["recordlabel"].str.strip()

In [22]:
# Confirm no record labels are still missing (expect an empty frame)
data.loc[data["recordlabel"].isna()]

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo


In [23]:
# Collapse the label variants "Maybach Music Group" and "Maybach/Warner Bros."
# into the single name "Maybach"
maybach_variants = ["Maybach Music Group", "Maybach/Warner Bros."]
data.loc[data["recordlabel"].isin(maybach_variants), "recordlabel"] = "Maybach"

In [24]:
# Treat "Self" (only 1 record) and "none" as the existing "Self-released" label
self_released_variants = ["Self", "none"]
data.loc[data["recordlabel"].isin(self_released_variants), "recordlabel"] = "Self-released"

#### Genre

In [25]:
# Show the rows with no genre recorded
data.loc[data["genre"].isna()]

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
16,Livity Sound,Livity Sound Remixes,Andrew Gaerig,7.4,2014.0,2014-11-06,Livity Sound,,0.680917,0.891750,3.500000,-10.333667,0.072600,0.023079,0.830750,0.146967,0.295017,126.158667
25,Khanate,Things Viral,Chris Dahlen,8.2,2003.0,2004-01-07,Southern Lord,,0.541538,0.607846,4.846154,-6.925923,0.065738,0.235023,0.068799,0.209962,0.640846,126.198461
36,Bodyguard,Silica Gel,Nick Neyland,5.2,2012.0,2012-03-01,Self-released,,0.426167,0.719833,5.166667,-4.573333,0.065767,0.151660,0.157598,0.186192,0.312933,122.995917
40,These Arms Are Snakes,Oxeneers or The Lion Sleeps When Its Antelope ...,David Moore,7.6,2004.0,2004-09-21,Jade Tree,,0.287455,0.737636,5.272727,-8.190545,0.099182,0.087924,0.446919,0.203755,0.365364,134.117182
59,Ital Tek,Nebula Dance,Miles Raymer,7.8,2012.0,2012-11-14,Planet Mu,,0.565083,0.791417,6.583333,-7.490583,0.056983,0.087783,0.782833,0.125592,0.432833,132.239250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17726,7 Worlds Collide,The Sun Came Out,Stephen M. Deusner,6.7,2009.0,2009-09-28,Columbia,,0.533450,0.545930,5.550000,-10.211750,0.037580,0.303893,0.089671,0.156695,0.488755,125.262200
17765,Lieutenant,If I Kill This Thing We're All Going to Eat fo...,Stuart Berman,5.8,2015.0,2015-03-12,Dine Alone,,0.488556,0.664111,3.222222,-6.623778,0.030933,0.148089,0.156008,0.101167,0.390111,117.924667
17768,The Focus Group,The Elektrik Karousel,Nick Neyland,7.0,2013.0,2013-05-13,Ghost Box,,0.383740,0.471450,5.650000,-14.653850,0.072725,0.752810,0.632057,0.276365,0.379990,112.409950
17773,Boardwalk,Boardwalk,Harley Brown,4.8,2013.0,2013-10-24,Stones Throw,,0.663923,0.454615,5.923077,-9.499923,0.102477,0.573231,0.515200,0.231731,0.455385,120.301231


In [26]:
# Share of each artist's albums that have no genre recorded, highest first.
# If an artist has at least one album with a genre, that genre could be used
# to fill in the artist's other albums.
genre_missing_rate = data.groupby("artist").agg(
    {"genre": lambda genres: genres.isna().mean()}
)
missing_genres = genre_missing_rate.sort_values(by="genre", ascending=False)

In [27]:
# Number of artists with at least one album missing a genre
any_missing = missing_genres["genre"] > 0
missing_genres.loc[any_missing].shape

(1102, 1)

In [28]:
# Artists with the highest share of albums missing a genre
missing_genres.head()

Unnamed: 0_level_0,genre
artist,Unnamed: 1_level_1
Sonic Liberation Front,1.0
Latyrx,1.0
Perfect Pussy,1.0
Jo Johnson,1.0
Perc,1.0


In [29]:
# Artists we could actually impute for: some, but not all, of their albums
# are missing a genre
missing_rate = missing_genres["genre"]
partially_missing = (missing_rate > 0) & (missing_rate < 1)
missing_genres[partially_missing]

Unnamed: 0_level_0,genre
artist,Unnamed: 1_level_1
The Angels of Light,0.75
Prodigy,0.666667
Richard Dawson,0.666667
Black Twig Pickers,0.5
Have a Nice Life,0.5
Metal Fingers,0.5
Cough,0.5
Scarlett Johansson,0.5
"Slaughter Beach, Dog",0.5
Ghost,0.333333


Okay, that's not very many. Instead, we'll just create a "Missing" category to capture all of these. Some techniques, like decision trees, can still make use of this information.

In [30]:
# Put every missing genre into an explicit "Missing" category
data["genre"] = data["genre"].fillna("Missing")

In [31]:
# Confirm no genres are still missing (expect an empty frame)
data.loc[data["genre"].isna()]

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo


## Preview after changes

In [32]:
# Display the cleaned frame (the recorded output shows only the first and last rows)
data

Unnamed: 0,artist,album,reviewauthor,score,releaseyear,reviewdate,recordlabel,genre,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Studio 1,Studio 1,Andy Battaglia,8.5,2009.0,2009-02-18,Studio,Electronic,0.511917,0.499667,5.250000,-5.626583,0.031983,0.724917,0.024493,0.165367,0.555083,101.395167
1,John Fahey,The Great Santa Barbara Oil Slick,Mark Richardson,8.2,2005.0,2005-02-13,Water,Folk/Country,0.369765,0.325412,4.470588,-19.153824,0.148624,0.647053,0.559133,0.527782,0.179465,107.622647
2,Reigning Sound,Too Much Guitar,Stephen M. Deusner,8.3,2004.0,2004-08-19,In the Red,Electronic,0.253943,0.912857,4.428571,-1.089500,0.055500,0.000253,0.751214,0.199071,0.552786,133.895500
3,The Red Thread,After the Last,Chris Dahlen,7.3,2003.0,2003-07-17,Badman,Rock,0.425400,0.433474,5.700000,-12.871000,0.028260,0.310325,0.224137,0.125150,0.451400,104.354200
4,Mac Miller,Swimming,Evan Rytlewski,7.5,2018.0,2018-08-03,Warner Bros.,Rap,0.624846,0.438154,4.153846,-9.456077,0.170246,0.652462,0.012819,0.121131,0.281138,122.121308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18398,DJ Khaled,Father of Asahd,Rawiya Kameir,5.4,2019.0,2019-05-22,Self-released,Rap,0.530733,0.763000,5.600000,-4.037933,0.260833,0.138597,0.000002,0.286033,0.465000,109.981200
18399,Interpol,A Fine Mess EP,Brian Josephs,5.6,2019.0,2019-05-22,Self-released,Rock,0.481000,0.938800,2.600000,-1.761400,0.054260,0.000563,0.699800,0.202380,0.522800,124.184800
18400,Injury Reserve,Injury Reserve,Sheldon Pearce,6.8,2019.0,2019-05-22,Self-released,Rap,0.559615,0.709538,5.769231,-6.205846,0.346485,0.182031,0.000229,0.259154,0.559154,106.885769
18401,Operators,Radiant Dawn,Stuart Berman,7.7,2019.0,2019-05-22,Self-released,Rock,0.575929,0.693429,5.071429,-9.977714,0.058193,0.171626,0.192718,0.260207,0.417579,124.177214


## Export cleaned dataset

In [33]:
# Write the cleaned data to OUT_PATH as CSV, leaving out the index column
data.to_csv(OUT_PATH, index=False)