### Exploratory Data Analysis (EDA)

### Loading Data

In [None]:
# Shell escape: dump the raw CSV to the cell output using Pygments' plain-text lexer.
!pygmentize -l text data/itunes_data.csv

In [None]:
import pandas as pd

In [None]:
from sqlalchemy import create_engine

# Flat-file sources: one CSV file and one Excel workbook.
csv_df = pd.read_csv('data/itunes_data.csv')
excel_df = pd.read_excel('data/itunes_data.xlsx', engine='openpyxl')

# SQLite source: join every track to its genre, album and artist.
query = """SELECT tracks.name as Track, tracks.composer, tracks.milliseconds,
tracks.bytes, tracks.unitprice,
genres.name as Genre,
albums.title as Album,
artists.name as Artist
FROM tracks
JOIN genres ON tracks.genreid = genres.genreid
JOIN albums ON tracks.albumid = albums.albumid
JOIN artists ON albums.artistid = artists.artistid;
"""
engine = create_engine('sqlite:///data/chinook.db')
# The connection is closed automatically when the with-block exits.
with engine.connect() as connection:
    sql_df = pd.read_sql_query(sql=query, con=connection)

In [None]:
# Compare the column labels of the three sources before combining them.
print(sql_df.columns, csv_df.columns, excel_df.columns)

In [None]:
# Stack the three sources into one frame. Each source arrives with its own
# 0..n-1 index, so renumber the rows; otherwise a label such as 0 would occur
# once per source and label-based access like `.loc[0]` would match several
# rows instead of one.
itunes_df = pd.concat([csv_df, excel_df, sql_df], ignore_index=True)
print(itunes_df.shape)  # (rows, columns) of the combined frame

In [None]:
itunes_df.head()  # preview the first rows of the combined data

In [None]:
itunes_df.info()  # column dtypes, non-null counts and memory usage

In [None]:
itunes_df.isna().sum()  # number of missing values in each column

In [None]:
itunes_df.describe()  # summary statistics (count, mean, std, quartiles, ...) per numeric column

In [None]:
# Number of rows for each distinct Genre value, most frequent first.
itunes_df['Genre'].value_counts()

In [None]:
itunes_df['Artist'].unique()  # array of the distinct values in the Artist column

In [None]:
import matplotlib.pyplot as plt

# Distribution of the Milliseconds column, grouped into 20 bins.
itunes_df['Milliseconds'].hist(bins=20)
plt.show()

In [None]:
itunes_df['Milliseconds'].describe()  # summary statistics for the Milliseconds column

In [None]:
# Scatter plot of Bytes (y-axis) against Milliseconds (x-axis).
itunes_df.plot(kind='scatter', x='Milliseconds', y='Bytes')
plt.show()

In [None]:
# Bar chart of how many rows fall under each Genre value.
itunes_df['Genre'].value_counts().plot(kind='bar')
plt.show()

### Cleaning & Filtering Data

In [None]:
# Make sure Milliseconds is stored as an integer dtype. The shape is printed
# before and after to confirm the cast changes neither the row nor the column
# count.
print(itunes_df.shape)
itunes_df['Milliseconds'] = itunes_df['Milliseconds'].astype('int')
print(itunes_df.shape)

In [None]:
# Convert Milliseconds to minutes (60,000 ms per minute) and store the result
# as a new Minutes column.
itunes_df['Minutes'] = itunes_df['Milliseconds'].div(60000)
print(itunes_df.shape)

In [None]:
# Restrict the data to music by excluding the non-music genres listed below.

# Genres that contain at least one item longer than 20 minutes.
print(itunes_df[itunes_df['Minutes'] > 20]['Genre'].unique())

# Shortest Rock item among those longer than 26 minutes. Printed explicitly:
# a bare expression in the middle of a cell is evaluated but never displayed.
long_rock = (itunes_df['Minutes'] > 26) & (itunes_df['Genre'] == 'Rock')
print(itunes_df.loc[long_rock, 'Minutes'].min())

non_music_genres = ['Drama', 'TV Shows', 'Sci Fi & Fantasy', 'Science Fiction', 'Comedy']
only_music = itunes_df[~itunes_df['Genre'].isin(non_music_genres)]
only_music.shape

In [None]:
# Split off the rows with no composer and label them 'Unknown'; only_music
# keeps just the rows whose composer is known.
unknown_composer = only_music[only_music['Composer'].isna()].copy()
print(unknown_composer.shape)
# Rebind rather than dropna(inplace=True): only_music is itself a filtered
# selection of another frame, and mutating such a selection in place can
# trigger pandas' SettingWithCopyWarning. `subset` is passed as a list, which
# every pandas version accepts.
only_music = only_music.dropna(subset=['Composer'])
print(only_music.shape)
unknown_composer['Composer'] = unknown_composer['Composer'].fillna('Unknown')
unknown_composer.head()

In [None]:
# Distribution of Minutes for the music-only subset, in 20 bins.
only_music['Minutes'].hist(bins=20)
plt.show()

In [None]:
# Look up by index label 0 (label-based, not positional).
itunes_df.loc[0]

In [None]:
# Deliberately blank out Bytes at index label 0 so that there is a missing
# value to impute.
import numpy as np
# NOTE: .loc selects by label, so every row carrying label 0 is affected if
# that label occurs more than once in the index.
itunes_df.loc[0, 'Bytes'] = np.nan
itunes_df.loc[0]

In [None]:
# KNN (k-nearest neighbors) imputation: missing entries are filled in from the
# most similar rows, judged on the three numeric columns selected below.
from sklearn.impute import KNNImputer
imputer = KNNImputer()
imputed = imputer.fit_transform(itunes_df [['Milliseconds', 'Bytes', 'UnitPrice']])
# fit_transform returns a plain array; column 1 is 'Bytes' (the second column
# passed in), so write only that column back.
itunes_df['Bytes'] = imputed[:, 1]
itunes_df.loc[0]

In [None]:
def remove_outliers(df, column, multiplier=1.5):
    """Return the rows of *df* whose *column* value lies strictly inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter; it is not modified.
        column: Label of the numeric column used to detect outliers.
        multiplier: How many IQRs beyond the quartiles a value may lie before
            it counts as an outlier. The default of 1.5 is the usual
            box-plot rule.

    Returns:
        The subset of *df* with outlier rows removed. Both fences are
        exclusive, and rows where *column* is NaN are dropped as well because
        comparisons against NaN are False.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_boundary = q1 - multiplier * iqr
    upper_boundary = q3 + multiplier * iqr
    inside = (df[column] > lower_boundary) & (df[column] < upper_boundary)
    return df.loc[inside]
# Drop the rows whose Milliseconds value is an outlier, then check how many remain.
itunes_df_clean = remove_outliers(itunes_df, 'Milliseconds')
itunes_df_clean.shape

In [None]:
# Distribution of Minutes after outlier removal, in 20 bins.
itunes_df_clean['Minutes'].hist(bins=20)
plt.show()
itunes_df_clean.shape

In [None]:
# Number of rows whose Track value repeats an earlier row's (first occurrences are not counted).
itunes_df_clean.duplicated(subset = 'Track').sum()

In [None]:
# Keep only the first row for each distinct Track value.
itunes_df_unique = itunes_df_clean.drop_duplicates(subset = 'Track')

In [None]:
# Distribution of Minutes after de-duplication, in 20 bins.
itunes_df_unique['Minutes'].hist(bins=20)
plt.show()
itunes_df_unique.shape

In [None]:
# Write the cleaned, de-duplicated frame to disk without the index column.
# The target file is the *unique* data set, so save itunes_df_unique rather
# than the unfiltered itunes_df.
itunes_df_unique.to_csv('data/unique_itunes_data.csv', index=False)

### Wrangling and analyzing

In [None]:
# Load the bitcoin price data and preview the first rows.
btc_df = pd.read_csv('data/bitcoin_price.csv')
btc_df.head()

In [None]:
btc_df['symbol'].unique()  # distinct values present in the symbol column

In [None]:
# Remove the symbol column from btc_df in place, then preview the result.
btc_df.drop(columns='symbol', inplace=True)
btc_df.head()

In [None]:
# Convert the time column to pandas datetimes; unit='ms' means the stored
# numbers are interpreted as milliseconds since the Unix epoch.
btc_df['time'] = pd.to_datetime(btc_df['time'], unit='ms')
btc_df.head()

In [None]:
# Use the time column as the index (in place) so rows can be selected by date.
btc_df.set_index('time', inplace=True)
btc_df.head()

In [None]:
# Partial-string indexing on the datetime index: all rows dated in 2019.
btc_df.loc['2019']

In [None]:
# Line plot of the close column over the time index, with a logarithmic y-axis.
btc_df['close'].plot(logy=True)

### NumPy

In [None]:
import numpy as np

In [None]:
# The same close values twice: once as a plain Python list and once as a
# NumPy array, for the timing comparison in the following cells.
close_list = btc_df['close'].tolist()
close_array = np.asarray(close_list)

In [None]:
# Time the vectorized division on the NumPy array.
# NOTE(review): names assigned inside %timeit are presumably not kept in the
# notebook namespace, so kd_close would be unavailable afterwards - confirm.
%timeit kd_close = close_array / 1000

In [None]:
# Time the same division done element by element over the Python list.
%timeit kd_close_list = [c / 1000 for c in close_list]

In [None]:
# Element-wise product of the close and volume columns, stored as a new column.
# NOTE(review): close * volume looks like traded value for the period rather
# than market capitalization (price * circulating supply) - confirm that
# 'market_cap' is the intended meaning of this column.
btc_df['market_cap'] = btc_df['close'] * btc_df['volume']