In [None]:
# Dump the raw CSV to the cell output via the Pygments command-line tool,
# using its plain 'text' lexer (-l text).
# NOTE(review): requires the `pygmentize` executable on PATH — confirm for the target environment.
!pygmentize -l text data/itunes_data.csv

In [None]:
import pandas as pd

In [None]:
from sqlalchemy import create_engine

# --- Source 1: SQLite database -------------------------------------------
# Join tracks with their genre, album and artist so every row carries the
# same descriptive columns as the flat-file exports read below.
engine = create_engine('sqlite:///data/chinook.db')
query = """SELECT tracks.name as Track, tracks.composer, tracks.milliseconds,
tracks.bytes, tracks.unitprice,
genres.name as Genre,
albums.title as Album,
artists.name as Artist
FROM tracks
JOIN genres ON tracks.genreid = genres.genreid
JOIN albums ON tracks.albumid = albums.albumid
JOIN artists ON albums.artistid = artists.artistid;
"""
# The context manager closes the connection once the query result is loaded.
with engine.connect() as connection:
    sql_df = pd.read_sql_query(sql=query, con=connection)

# --- Source 2: Excel workbook --------------------------------------------
excel_df = pd.read_excel('data/itunes_data.xlsx', engine='openpyxl')

# --- Source 3: CSV file --------------------------------------------------
csv_df = pd.read_csv('data/itunes_data.csv')

In [None]:
# Column labels of the three source frames (SQL, CSV, Excel), printed in one call
# so they can be compared before the frames are combined.
print(*(frame.columns for frame in (sql_df, csv_df, excel_df)))

In [None]:
# Stack the three source frames into one DataFrame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source keeps its own row labels, so labels repeat and label-based access
# such as .loc[0, ...] would address several rows at once.
itunes_df = pd.concat([csv_df, excel_df, sql_df], ignore_index=True)
print(itunes_df.shape)  # (number of rows, number of columns)

In [None]:
# Preview the first five rows of the combined data.
itunes_df.head(n=5)

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
itunes_df.info()

In [None]:
# Count of missing values in each column (sum of the boolean NA mask down the rows).
itunes_df.isna().sum(axis=0)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the frame's columns.
itunes_df.describe()

In [None]:
# Number of rows per distinct 'Genre' value.
itunes_df['Genre'].value_counts()

In [None]:
# Array of the distinct values found in the 'Artist' column.
itunes_df['Artist'].unique()

In [None]:
import matplotlib.pyplot as plt

# Derive the track length in minutes: 60000 milliseconds per minute.
itunes_df['Minutes'] = itunes_df['Milliseconds'] / 60000

# Distribution of track lengths. Axis labels let the figure stand on its own.
itunes_df['Minutes'].hist(bins=20)
plt.xlabel('Minutes')
plt.ylabel('Number of rows')
plt.show()

# Summary statistics of the new column. This is the cell's last expression,
# so it is the one that gets displayed (an earlier duplicate call whose
# result was discarded has been removed).
itunes_df['Minutes'].describe()

In [None]:
# Scatter plot of file size ('Bytes') against track length ('Minutes').
itunes_df.plot(kind='scatter', x='Minutes', y='Bytes')
plt.show()

In [None]:
# Bar chart of how many rows fall into each genre.
itunes_df['Genre'].value_counts().plot(kind='bar')
plt.show()

### Cleaning Data

In [None]:
# Genres that contain at least one item longer than 20 minutes.
print(itunes_df[itunes_df['Minutes'] > 20]['Genre'].unique())

# Shortest 'Rock' item among those longer than 26 minutes. It is printed
# explicitly because a notebook only displays a cell's *last* expression;
# as a bare mid-cell expression this value was computed and then lost.
print(itunes_df.loc[(itunes_df['Minutes'] > 26) & (itunes_df['Genre'] == 'Rock'), 'Minutes'].min())

# Keep only rows whose genre is not one of the non-music categories.
# .copy() makes only_music an independent frame, so later modifications of it
# neither warn about (nor depend on) it being a slice of itunes_df.
only_music = itunes_df[
    ~itunes_df['Genre'].isin(['Drama', 'TV Shows', 'Sci Fi & Fantasy', 'Science Fiction', 'Comedy'])
].copy()
only_music.shape

In [None]:
# Set aside the rows that have no composer before removing them from only_music.
unknown_composer = only_music[only_music['Composer'].isna()].copy()
print(unknown_composer.shape)

# Drop the rows without a composer. Rebinding the name (rather than
# dropna(inplace=True)) avoids mutating a frame that was derived by filtering
# another one, and passing subset as a list works across pandas versions.
only_music = only_music.dropna(subset=['Composer'])
print(only_music.shape)

# Label the missing composers explicitly in the set-aside frame.
unknown_composer['Composer'] = unknown_composer['Composer'].fillna('Unknown')
unknown_composer

In [None]:
### KNN (k-nearest neighbors) imputation
import numpy as np
from sklearn.impute import KNNImputer

# Blank out the 'Bytes' value of the first row so there is something to impute.
# Positional indexing (iloc) is used on purpose: the frame was built by
# concatenating several sources, so the row label 0 may occur more than once
# and .loc[0, 'Bytes'] would blank every row carrying that label.
itunes_df.iloc[0, itunes_df.columns.get_loc('Bytes')] = np.nan

# Fill missing values from the nearest rows, measured over these three
# numeric columns, using KNNImputer's default settings.
# NOTE(review): the columns are on very different scales and are not
# standardised before the distance computation — confirm this is intended.
imputer = KNNImputer()
imputed = imputer.fit_transform(itunes_df[['Milliseconds', 'Bytes', 'UnitPrice']])

# Column 1 of the result corresponds to 'Bytes' (second column passed in).
itunes_df['Bytes'] = imputed[:, 1]

In [None]:
def remove_outliers(df, column, whisker=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1/Q3 are the 25th/75th percentiles of ``df[column]`` and IQR = Q3 - Q1.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    column : label
        Name of the numeric column used to detect outliers.
    whisker : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows whose value is inside the fences, fences included.
        Rows whose value is missing are not kept, because a missing value
        never compares as being inside the range.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_boundary = q1 - whisker * iqr
    upper_boundary = q3 + whisker * iqr
    # Inclusive comparison: a value sitting exactly on a fence is not an
    # outlier. This also matters when IQR == 0 (both fences collapse onto the
    # same value) — a strict comparison would then discard every row.
    within_fences = df[column].between(lower_boundary, upper_boundary)
    # .copy() hands the caller an independent frame rather than a slice of df.
    return df.loc[within_fences].copy()
# Drop the rows whose track length ('Milliseconds') is flagged as an outlier.
itunes_df_clean = remove_outliers(df=itunes_df, column='Milliseconds')