## **Data Exploration and Cleaning**

In [1]:
# Importing packages 
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the songs dataset into a DataFrame
df = pd.read_csv('songs_normalize.csv')

# List the column names (18 fields)
print(df.columns)

# Count missing values per column, highest counts first
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
)

Index(['artist', 'song', 'duration_ms', 'explicit', 'year', 'popularity',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'genre'],
      dtype='object')


artist              0
song                0
tempo               0
valence             0
liveness            0
instrumentalness    0
acousticness        0
speechiness         0
mode                0
loudness            0
key                 0
energy              0
danceability        0
popularity          0
year                0
explicit            0
duration_ms         0
genre               0
dtype: int64

No missing values are present in the dataset, so no imputation is needed.

In [3]:
# Row count of the raw frame: the dataset holds 2,000 observations
df.shape[0]

2000

In [4]:
# Drop exact duplicate rows, keeping the raw frame `df` untouched
dataframe_filtered = df.drop_duplicates()

# Number of rows removed = raw row count minus de-duplicated row count
duplicate_rows = df.shape[0] - dataframe_filtered.shape[0]
print(duplicate_rows)

59


The dataset contains 59 duplicated rows. If a machine learning algorithm were applied to the data, these rows could skew the results and introduce bias, so the de-duplicated `dataframe_filtered` (1,941 rows) is used from here on.

In [5]:
# Optional: skim the unique values of every column (verbose, so left disabled)
# for column in dataframe_filtered.columns:
#     print(f"\nColumn: {column}")
#     print(dataframe_filtered[column].unique())

# Data types of the de-duplicated frame. Printed first so that the preview
# below is the cell's final expression; a bare expression is only rendered
# when it is the last statement of the cell.
print(dataframe_filtered.dtypes)

# First 5 rows (DataFrame.head() defaults to n=5)
dataframe_filtered.head()

artist               object
song                 object
duration_ms           int64
explicit               bool
year                  int64
popularity            int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
genre                object
dtype: object


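There appear to be no issues with the data types: each column's dtype matches the kind of data it holds (the text columns `artist`, `song` and `genre` are `object`, `explicit` is `bool`, and the remaining numeric columns are `int64` or `float64`).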

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the de-duplicated frame; the quartiles listed are the
# defaults, written out explicitly.
dataframe_filtered.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,duration_ms,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
count,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0,1941.0
mean,228594.973725,2009.52035,59.633179,0.667814,0.721549,5.369397,-5.514082,0.553323,0.103783,0.128173,0.015372,0.181726,0.552966,120.158442
std,39249.796103,5.875532,21.501053,0.140608,0.152872,3.61527,1.93895,0.497277,0.096148,0.172584,0.088371,0.14091,0.220845,26.990475
min,113000.0,1998.0,0.0,0.129,0.0549,0.0,-20.514,0.0,0.0232,1.9e-05,0.0,0.0215,0.0381,60.019
25%,203506.0,2004.0,56.0,0.581,0.624,2.0,-6.49,0.0,0.0397,0.0135,0.0,0.0884,0.39,98.986
50%,223186.0,2010.0,65.0,0.676,0.739,6.0,-5.285,1.0,0.061,0.0558,0.0,0.124,0.56,120.028
75%,247946.0,2015.0,73.0,0.765,0.84,8.0,-4.168,1.0,0.129,0.176,6.9e-05,0.242,0.731,134.199
max,484146.0,2020.0,89.0,0.975,0.999,11.0,-0.276,1.0,0.576,0.976,0.985,0.853,0.973,210.851
