## Loading and Saving Data with Pandas

The pygmentize command-line tool can be used to display a text file in IPython or Jupyter.  It can be installed with `conda install -c conda-forge pygments` or with pip. We can take a look at the CSV with this tool:

In [None]:
!pygmentize -l text data/itunes_data.csv

In [None]:
import pandas as pd

In [None]:
csv_df = pd.read_csv('data/itunes_data.csv')
csv_df.head()

In [None]:
excel_df = pd.read_excel('data/itunes_data.xlsx', engine='openpyxl')
excel_df.head()

In [None]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///data/chinook.db')

In [None]:
query = """SELECT tracks.name as Track, tracks.composer, tracks.milliseconds,
tracks.bytes, tracks.unitprice,
genres.name as Genre,
albums.title as Album,
artists.name as Artist
FROM tracks
JOIN genres ON tracks.genreid = genres.genreid
JOIN albums ON tracks.albumid = albums.albumid
JOIN artists ON albums.artistid = artists.artistid;
"""

In [None]:
with engine.connect() as connection:
    sql_df = pd.read_sql_query(query, connection)

In [None]:
sql_df.head(2).T

In [None]:
# build a small DataFrame from a dict that maps column names to lists of values
df = pd.DataFrame({
    'seconds': [1, 2, 3, 4],
    'intensity': [12, 11, 12, 14],
})
df.head()

## Understanding the DataFrame Structure and Combining/Concatenating Multiple DataFrames

In [None]:
sql_df.index

In [None]:
sql_df.columns

In [None]:
type(sql_df)

In [None]:
# Stack the three DataFrames row-wise (axis=0). ignore_index is not passed, so
# each frame keeps its own row labels and the same label can repeat in the result.
itunes_df = pd.concat([csv_df, excel_df, sql_df], axis=0)
itunes_df.head()

## Exploratory Data Analysis (EDA) and Basic Data Cleaning with pandas

In [None]:
itunes_df.tail()

In [None]:
print(itunes_df.iloc[0])
print(itunes_df.iloc[-1])

In [None]:
itunes_df.iloc[0, 0]

In [None]:
itunes_df.iloc[-1, -1]

In [None]:
itunes_df.loc[3502]

In [None]:
test_df = itunes_df.copy()
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# its replacement. Selecting with a list ([[3502]]) always yields a DataFrame,
# so every row labelled 3502 is appended as a row.
test_df = pd.concat([test_df, itunes_df.loc[[3502]]])
test_df.loc[3502]

In [None]:
test_df.reset_index(inplace=True, drop=True)
test_df.head()

In [None]:
test_df.index

In [None]:
itunes_df.shape

In [None]:
itunes_df.info()

In [None]:
itunes_df.isna().sum()

In [None]:
type(itunes_df.isna())

In [None]:
itunes_df.describe()

In [None]:
itunes_df['Genre'].mode()

In [None]:
itunes_df['Genre'].value_counts()

In [None]:
itunes_df['Genre'].value_counts()[:5]

In [None]:
itunes_df['Artist'].unique().shape

In [None]:
# Correlate only the numeric columns: from pandas 2.0 on, corr() no longer
# drops text columns such as Genre automatically and raises on them instead.
itunes_df.select_dtypes(include='number').corr()

## Plotting with DataFrames

In [None]:
import matplotlib.pyplot as plt

In [None]:
itunes_df['Milliseconds'].hist(bins=30)
plt.show()

In [None]:
# this cell is for saving the image; more on this in chapter 5
f = plt.figure(figsize=(5.5, 5.5))  # this changes the size of the image -- more on this in chapter 5
f.patch.set_facecolor('w')  # sets background color behind axis labels
itunes_df['Milliseconds'].hist(bins=30)
plt.tight_layout()  # auto-adjust margins

In [None]:
# figsize increases the size of the image -- more on this in chapter 5
itunes_df.plot.scatter(x='Milliseconds', y='Bytes', figsize=(8, 8))
plt.show()

In [None]:
# saving the image -- more on this in chapter 5
# Create the figure and axes first and hand the axes to the plot call:
# DataFrame.plot otherwise opens a new figure of its own, which would leave
# `f` empty and apply the facecolor below to the wrong figure.
f, ax = plt.subplots(figsize=(5.5, 5.5))
itunes_df.plot.scatter(x='Milliseconds', y='Bytes', ax=ax)
f.patch.set_facecolor('w')  # sets background color behind axis labels
plt.tight_layout()  # auto-adjust margins

In [None]:
itunes_df['Genre'].value_counts().plot.bar()
plt.show()

In [None]:
# saving the image -- more on this in chapter 5
f = plt.figure(figsize=(5.5, 5.5))
itunes_df['Genre'].value_counts().plot.bar()
f.patch.set_facecolor('w')  # sets background color behind axis labels
plt.tight_layout()  # auto-adjust margins

## Cleaning Data, Filtering DataFrames

In [None]:
itunes_df[itunes_df['Milliseconds'] > 4e6]

In [None]:
print(itunes_df[itunes_df['Milliseconds'] > 4e6])

In [None]:
print(itunes_df[itunes_df['Milliseconds'] > 4e6][['Genre', 'Artist']])

In [None]:
itunes_df['Milliseconds'] > 4e6

In [None]:
itunes_df[itunes_df['Milliseconds'] > 2e6]['Genre'].value_counts()

In [None]:
itunes_df[(itunes_df['Milliseconds'] > 2e6) & (itunes_df['Bytes'] < 0.4e9)]['Genre'].value_counts()

In [None]:
itunes_df[itunes_df['Genre'] != 'TV Shows']['Genre'].value_counts()

In [None]:
itunes_df[~(itunes_df['Genre'] == 'TV Shows')]['Genre'].value_counts()

In [None]:
itunes_df[itunes_df['Genre'].str.contains('TV')]['Genre'].value_counts()

In [None]:
# drop() returns a new DataFrame, so itunes_df itself keeps its Composer column
itunes_df_copy = itunes_df.drop(columns='Composer')
itunes_df_copy.columns

In [None]:
# keep only the rows whose Genre is NOT one of the listed genres (~ negates the isin mask)
only_music = itunes_df[~itunes_df['Genre'].isin(['Drama', 'TV Shows', 'Sci Fi & Fantasy', 'Science Fiction', 'Comedy'])]

### Missing values

In [None]:
# show five randomly chosen rows (fixed seed) whose Composer value is missing
itunes_df.loc[itunes_df['Composer'].isna()].sample(n=5, random_state=42)

In [None]:
itunes_df_copy = itunes_df.copy()  # make a backup for testing dropping rows with missing values
# print shapes to check that rows were dropped
print(itunes_df_copy.shape)
itunes_df_copy.dropna(inplace=True)
print(itunes_df_copy.shape)

In [None]:
itunes_df_copy = itunes_df.copy()  # make a backup for testing filling in missing values
# select the rows with a missing Composer and overwrite only that column
itunes_df_copy.loc[itunes_df['Composer'].isna(), 'Composer'] = 'Unknown'
itunes_df_copy.head()

In [None]:
itunes_df_copy = itunes_df.copy()  # make a backup for testing filling in missing values
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object, warns in
# pandas 2.x, and leaves the DataFrame unchanged once copy-on-write is enabled.
itunes_df_copy['Composer'] = itunes_df_copy['Composer'].fillna('Unknown')
itunes_df_copy.head()

In [None]:
itunes_df['UnitPrice'].value_counts().iloc[0]/itunes_df.shape[0]

In [None]:
itunes_df_copy = itunes_df.copy()  # make a backup for testing filling in missing values
# mode() returns a Series (there can be ties), and fillna with a Series fills by
# matching index labels -- so only rows labelled like the mode Series would be
# filled. Pass the scalar most-common price instead, and assign the result back
# rather than using inplace=True on a column selection.
itunes_df_copy['UnitPrice'] = itunes_df_copy['UnitPrice'].fillna(itunes_df_copy['UnitPrice'].mode().iloc[0])
itunes_df_copy.head()

In [None]:
# create distributions for plotting
from scipy.stats import skewnorm
df = pd.DataFrame({'normal': skewnorm.rvs(0, size=10000), 'skewed': skewnorm.rvs(10, size=10000)})

# Keep the Axes returned by the plot call so the marker lines and the figure
# styling below apply to this histogram, not to a figure left over from an
# earlier cell.
ax = df.plot.hist(bins=30, alpha=0.5, figsize=(5.5, 5.5))
ax.vlines(df['skewed'].mean(), ymin=0, ymax=2000, label='mean', color='r')
ax.vlines(df['skewed'].median(), ymin=0, ymax=2000, label='median', color='b')
ax.legend()  # rebuild the legend so the 'mean' and 'median' labels are shown
f = ax.get_figure()
f.patch.set_facecolor('w')  # sets background color behind axis labels
plt.tight_layout()  # auto-adjust margins

#### KNN imputation

In [None]:
import numpy as np
itunes_df_copy = itunes_df.copy()  # create copy of the dataframe so we don't alter the original
# blank out Bytes at index label 0 so there is something to impute; the label 0
# occurs more than once in itunes_df, so more than one row is set to NaN here
itunes_df_copy.loc[0, 'Bytes'] = np.nan

In [None]:
from sklearn.impute import KNNImputer
imputer = KNNImputer()
imputed = imputer.fit_transform(itunes_df_copy[['Milliseconds', 'Bytes', 'UnitPrice']])

In [None]:
# there are multiple values with the index value of 0, which is why we get multiple results
itunes_df.loc[0, 'Bytes']

In [None]:
itunes_df_copy['Bytes'] = imputed[:, 1]

In [None]:
itunes_df_copy.loc[0, 'Bytes']

In [None]:
itunes_df['Bytes'].mean()

In [None]:
# adapted from this SO answer: https://stackoverflow.com/a/46740476/4549682
def remove_outliers(df, column, whisker=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``q1 - whisker * iqr`` and ``q3 + whisker * iqr``, where
    ``q1``/``q3`` are the 25th/75th percentiles of ``df[column]`` and
    ``iqr = q3 - q1``. Values exactly on a fence are kept, so a column whose
    IQR is zero (for example a constant column) is returned intact instead of
    being emptied. Rows where ``column`` is NaN are never within the fences
    and are therefore dropped.

    Args:
        df: DataFrame to filter; it is not modified.
        column: label of the numeric column used to detect outliers.
        whisker: multiple of the IQR that sets the fence distance (default 1.5).

    Returns:
        A DataFrame containing only the rows inside the fences.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_boundary = q1 - whisker * iqr
    upper_boundary = q3 + whisker * iqr
    # between() is inclusive on both ends and evaluates to False for NaN
    within_fences = df[column].between(lower_boundary, upper_boundary)
    return df.loc[within_fences]

In [None]:
itunes_df_clean = remove_outliers(itunes_df, 'Milliseconds')

In [None]:
itunes_df_clean.shape

In [None]:
itunes_df.shape

### Duplicate values

In [None]:
itunes_df.duplicated().sum()

In [None]:
itunes_df.drop_duplicates(inplace=True)

## Data Transformations

In [None]:
itunes_df['Seconds'] = itunes_df['Milliseconds'] / 1000

In [None]:
itunes_df['len_byte_ratio'] = itunes_df['Milliseconds'] / itunes_df['Bytes']

### Apply and Map

In [None]:
# map the variants 'metal' and 'met' to the single label 'Metal'
genre_dict = {'metal': 'Metal', 'met': 'Metal'}
# the result is only displayed; it is not assigned back, so itunes_df['Genre'] is left unchanged
itunes_df['Genre'].replace(genre_dict)

In [None]:
itunes_df['Genre'].apply(lambda x: x.lower())

In [None]:
# the above is the same as this
def lowercase(x):
    """Return ``x.lower()``; used with ``apply`` to lowercase one value at a time."""
    return x.lower()

itunes_df['Genre'].apply(lowercase)

In [None]:
# but using built-in functions is almost always faster
itunes_df['Genre'].str.lower()

Here is an exception to the rule of preferring built-in functions over apply -- we want to get a hybrid score of the polarity and subjectivity of text:

In [None]:
# this is a common sentiment analysis library; polarity is positive/negative sentiment,
# subjectivity is subjective/objective rating.
from textblob import TextBlob
test = TextBlob("Textblob is amazingly simple to use. What great fun!")
test.sentiment

In [None]:
test.sentiment.polarity

In [None]:
# it would be better than apply to use a list comprehension to get sentiment of track names, like this
itunes_df['Track_sentiment'] = [TextBlob(x).sentiment.polarity for x in itunes_df['Track']]

In [None]:
# but, if we wanted to mix polarity and subjectivity into one column, it would be best to use apply:
def pol_sub_mix(x):
    """Return the product of the TextBlob polarity and subjectivity of text ``x``."""
    sentiment = TextBlob(x).sentiment
    return sentiment.polarity * sentiment.subjectivity

itunes_df['Track_pol_sub_mix'] = itunes_df['Track'].apply(pol_sub_mix)

In [None]:
# delete these columns
itunes_df.drop(['Track_pol_sub_mix', 'Track_sentiment'], inplace=True, axis=1)

In [None]:
# currently doesn't work with python 3.9
# import swifter
# itunes_df['Genre'].swifter.apply(lambda x: x.lower())

### Group By

In [None]:
# Select 'Seconds' before aggregating: taking the mean of every column first
# raises on text columns from pandas 2.0 on, and computes means that are thrown away.
itunes_df.groupby('Genre')['Seconds'].mean().sort_values().head()

### Saving data

In [None]:
itunes_df.to_csv('data/cleaned_itunes_data.csv', index=False)

# Bitcoin data analysis

In [None]:
btc_df = pd.read_csv('data/bitcoin_price.csv')
btc_df.head()

In [None]:
btc_df['symbol'].unique()

In [None]:
btc_df.drop('symbol', axis=1, inplace=True)

In [None]:
btc_df['time'] = pd.to_datetime(btc_df['time'], unit='ms')

In [None]:
btc_df['time'].dtype

In [None]:
btc_df.info()

In [None]:
btc_df.set_index('time', inplace=True)

In [None]:
btc_df.head()

In [None]:
btc_df[['close']].plot(logy=True)

In [None]:
# Create the figure and axes first and hand the axes to the plot call:
# DataFrame.plot otherwise opens a new figure of its own, which would leave
# `f` empty and apply the facecolor below to the wrong figure.
f, ax = plt.subplots(figsize=(5.5, 5.5))
btc_df.iloc[-3000:][['close']].plot(logy=True, ax=ax)
f.patch.set_facecolor('w')  # sets background color behind axis labels
plt.tight_layout()  # auto-adjust margins

We could load a datetime as a pandas timestamp like so. Unfortunately, this trick doesn't work with timestamps stored as a count since the epoch (seconds, or milliseconds as in this file), but does work with other datetime formats. For epoch timestamps, we can provide a conversion function to the argument `date_parser` in `read_csv` (deprecated since pandas 2.0), or convert it with `to_datetime` after loading.

In [None]:
# infer_datetime_format is deprecated since pandas 2.0 (a strict form of format
# inference is now the default), so it is no longer passed
btc_df2 = pd.read_csv('data/bitcoin_price.csv', index_col='time', parse_dates=['time'])
btc_df2.head()

In [None]:
# read_csv's date_parser argument is deprecated since pandas 2.0, so load the
# raw column and convert the epoch-millisecond values after loading instead
def date_parser(x):
    """Convert epoch-millisecond value(s) ``x`` to pandas datetimes."""
    return pd.to_datetime(x, unit='ms')

btc_df2 = pd.read_csv('data/bitcoin_price.csv')
btc_df2['time'] = date_parser(btc_df2['time'])
btc_df2.set_index('time', inplace=True)
btc_df2.head()

In [None]:
btc_df.loc['1-1-2019':'12-31-2019']

In [None]:
btc_df.loc['2019']

# NumPy

In [None]:
close_array = btc_df['close'].values

In [None]:
close_array

In [None]:
close_array.shape

In [None]:
close_array.dtype

In [None]:
import numpy as np

close_list = btc_df['close'].to_list()
close_array = np.array(close_list)
close_array

In [None]:
%timeit kd_close = close_array / 1000

In [None]:
%timeit kd_close_list = [c / 1000 for c in close_list]

In [None]:
volume_array = btc_df['volume'].values
close_array * volume_array

In [None]:
# element-wise product of the close and volume columns, stored as a new column
# NOTE(review): price * volume looks like traded value rather than market
# capitalisation (price * circulating supply) -- confirm the intended meaning/name
btc_df['market_cap'] = btc_df['close'] * btc_df['volume']
btc_df['market_cap']

In [None]:
np.log(btc_df['close'])