<b>Data mining project - 2020/21</b><br>
<b>Authors</b>: [Alexandra Bradan](https://github.com/alexandrabradan), [Alice Graziani](https://github.com/alicegraziani25) and [Eleonora Cocciu](https://github.com/eleonoracocciu)<br>
<b>Python version</b>: 3.x<br>
<b>Last update</b>: 21/05/2021

In [354]:
# system library
import os
import sys
import json
import pickle
import isoweek
import datetime
import calendar
from datetime import date, timedelta
from dateutil.relativedelta import relativedelta

# useful libraries
import math
import operator
import itertools
import statistics
import collections
from collections import Counter
from collections import OrderedDict

# pandas
import pandas as pd

# numpy
import numpy as np
from numpy import std
from numpy import mean
from numpy import percentile

# visualisation
import pydotplus
import seaborn as sns
from matplotlib import colors
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from IPython.display import Image

# sklearn
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix

# dimensional reducers
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif  # classification
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression  # regression

# scalers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder

# performance visualisation 
from sklearn import tree
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_precision_recall
from scikitplot.metrics import plot_cumulative_gain
from scikitplot.metrics import plot_lift_curve
from sklearn.model_selection import learning_curve
from mlxtend.plotting import plot_decision_regions
from yellowbrick.model_selection import LearningCurve

# svm
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# global matplotlib setting "patch.force_edgecolor" switched on
# NOTE(review): presumably to outline bars/patches in every plot — confirm
plt.rcParams["patch.force_edgecolor"] = True
# IPython magic (only valid inside a notebook/IPython session)
%matplotlib inline

# select the yellowbrick palette named 'bold'
from yellowbrick.style import set_palette
set_palette('bold')

In [355]:
# suffix appended to the per-album-type column names and used in the output CSV file name
ts_column_name = 'frequency'

<h6> Datasets loading </h6>

In [356]:
# group dataset: the first CSV column is used as the row index
df = pd.read_csv('../../data/fma_metadata/group_20_fma.csv', index_col=0)
# tracks metadata: the first two rows are read as a two-level column header,
# so columns are addressed with tuples such as ('track', 'date_created')
tracks = pd.read_csv('../../data/fma_metadata/tracks.csv', index_col=0, header=[0, 1])
# (rows, columns) of the group dataset
print(df.shape)

(103708, 44)


<h6>Recompose track_date_created column</h6>

In [357]:
# Parse the full creation timestamp of every track.
# The previous call passed format='%Y%m' together with errors='ignore'. That
# format cannot have matched the stored values (the parsed timestamps keep
# their day and time of day), so to_datetime silently returned its input and
# the trailing astype() did the real parsing. errors='ignore' is also
# deprecated in recent pandas. Parsing explicitly removes both problems; the
# astype() is kept so the resulting dtype is still datetime64[ns].
date_created_column = pd.to_datetime(tracks[('track', 'date_created')]).astype('datetime64[ns]')
# positions of the two derived columns in the group dataset
track_date_created_year_index = df.columns.get_loc("track_date_created_year")
track_date_created_season_index = df.columns.get_loc("track_date_created_season")

In [358]:
# put the full timestamp where the year column used to be,
# then discard the two columns that were derived from it
df.insert(loc=track_date_created_year_index,
          column='track_date_created',
          value=date_created_column)
df.drop(columns=["track_date_created_year", "track_date_created_season"], inplace=True)

<h6>Marginal datetime records removal </h6>

Track creation spans between **2008-11-25 17:49:06** and **2017-03-30 15:23:39**, but the marginal dates of this period were deleted, since we noticed that they were incomplete. So we analyzed the period spanning between **2008-12-29 00:00:00 and 2017-01-01 00:00:00**, reshaped by making each day have 24 hours and by adding 186 missing days. The last three days of 2008 were assigned to 2009 since they were respectively Monday (29/12), Tuesday (30/12) and Wednesday (31/12), while the first day of 2017 was assigned to 2016 since it was a Sunday (01/01).

By removing marginal dates to have 8 complete years, we got rid of 5.34% of Studio Recording tracks (2008: {11: 1263, 12: 28} and 2017: {1: 995, 2: 1619, 3: 1023}) and 16.32% of Live Recording tracks (2008: {11: 186, 12: 1139} and 2017: {1: 223, 2: 212, 3: 121}).

In [359]:
# marginal days that are kept even though they belong to 2008 / 2017
created = df['track_date_created'].dt

# 29, 30 and 31 December 2008
in_last_days_of_2008 = (created.year == 2008) & (created.month == 12) & (created.day >= 29)
first_year_keep = df[in_last_days_of_2008]

# 1 January 2017
on_first_day_of_2017 = (created.year == 2017) & (created.month == 1) & (created.day == 1)
last_year_keep = df[on_first_day_of_2017]

# index labels of all the records to preserve
to_keep = set(first_year_keep.index) | set(last_year_keep.index)

In [360]:
# records created in the two marginal years
creation_year = df['track_date_created'].dt.year
first_year = df[creation_year == 2008]
last_year = df[creation_year == 2017]

# the same records, split by album type
first_year_studio = first_year.loc[first_year['album_type'].eq("Studio Recording")]
first_year_live = first_year.loc[first_year['album_type'].eq("Live Recording")]
last_year_studio = last_year.loc[last_year['album_type'].eq("Studio Recording")]
last_year_live = last_year.loc[last_year['album_type'].eq("Live Recording")]

In [361]:
# number of 2008 / 2017 records that will be removed (kept marginal days excluded)
for label, first_part, last_part in (
        ("Studio removed records", first_year_studio, last_year_studio),
        ("Live removed records", first_year_live, last_year_live),
        ("total removed records", first_year, last_year)):
    removed = (set(first_part.index) | set(last_part.index)) - to_keep
    print(label, len(removed))

Studio removed records 4917
Live removed records 1881
total removed records 6798


In [362]:
# drop every record of 2008 and 2017, except the marginal days kept on purpose
marginal_records = set(first_year.index) | set(last_year.index)
to_drop = marginal_records - to_keep
df.drop(to_drop, axis=0, inplace=True)
df.shape

(96910, 43)

<h6> Create TS DataFrame </h6>

In [363]:
# keep only date and hour of each creation timestamp
# (minutes, seconds and microseconds are zeroed)
track_date_created = [
    stamp.replace(minute=0, second=0, microsecond=0)
    for stamp in df['track_date_created']
]

# temporary frame pairing every truncated timestamp with its album type
data = {
    'track_date_created': track_date_created,
    'album_type': df['album_type'].values,
}
tmp_df_ts = pd.DataFrame(data=data)

In [364]:
# check whether some days are missing between the first and the last timestamp
start = tmp_df_ts['track_date_created'].min()
end = tmp_df_ts['track_date_created'].max()
print(start, end)

# Count whole calendar days. Both bounds are snapped to midnight first: on the
# raw timestamps `(end - start).days` loses one day whenever the end's time of
# day is earlier than the start's, and the last calendar day would be skipped.
first_day = start.normalize()
delta = end.normalize() - first_day
print('days between start and end:', delta.days + 1)

# one (year, month, day) key per calendar day of the span, both ends included
days = {}
for i in range(delta.days + 1):
    day = first_day + timedelta(days=i)
    key = day.year, day.month, day.day
    days[key] = None

2008-12-30 07:00:00 2017-01-01 21:00:00
days between start and end: 2925


In [365]:
# create the hourly frame, without sequential hours (only hours with records)

# Count the tracks per truncated timestamp and album type in a single pass.
# The previous version re-filtered the whole frame once per distinct
# timestamp, i.e. one full scan per distinct hour.
track_date_created_dict = {}
for t, album_type in zip(tmp_df_ts['track_date_created'], tmp_df_ts['album_type']):
    if t not in track_date_created_dict:
        track_date_created_dict[t] = {'Live Recording': 0, 'Studio Recording': 0}
    # an album type other than the two above raises KeyError, as it did before
    track_date_created_dict[t][album_type] += 1

# calendar days that already have at least one record
current_days = {(t.year, t.month, t.day) for t in track_date_created_dict}

# add missing days to DataFrame
missing_days = set(days).difference(current_days)
# 2008-12-29 is the chosen first day of the analysed period; in the recorded
# run the first timestamp is 2008-12-30, so the day is outside `days` and is
# added by hand. It is skipped if it ever has records, so that real counts are
# not overwritten with zeros below.
if (2008, 12, 29) not in current_days:
    missing_days.add((2008, 12, 29))
print("missing_days", len(missing_days))

# a missing day is represented by a single all-zero entry at midnight
for year, month, day in missing_days:
    track_date_created_dict[pd.Timestamp(year, month, day, 0)] = {
        'Live Recording': 0,
        'Studio Recording': 0,
    }

hourly_counts = list(track_date_created_dict.values())
data = {'track_date_created': list(track_date_created_dict),
        'Studio_Recording_' + ts_column_name: [c['Studio Recording'] for c in hourly_counts],
        'Live_Recording_' + ts_column_name: [c['Live Recording'] for c in hourly_counts]}

# timestamp-indexed frame, in chronological order
df_ts = pd.DataFrame(data=data)
df_ts = df_ts.set_index('track_date_created')
df_ts = df_ts.sort_index()

missing_days 187


In [366]:
# every day must expose the same 24 hourly slots [0-23]

# hours that already have an entry, grouped by calendar day
keys = list(track_date_created_dict)
keys_hours = {}
for stamp in keys:
    keys_hours.setdefault((stamp.year, stamp.month, stamp.day), []).append(stamp.hour)

# create final df, with sequential hours: existing counts are copied,
# hours without records get a zero entry
new_track_date_created_dict = {}
for (year, month, day), hours in keys_hours.items():
    for h in range(24):
        t = pd.Timestamp(year, month, day, h)

        if h in hours:
            recorded = track_date_created_dict[t]
            live, studio = recorded['Live Recording'], recorded['Studio Recording']
        else:
            live, studio = 0, 0

        slot = new_track_date_created_dict.setdefault(
            t, {'Live Recording': 0, 'Studio Recording': 0})
        slot['Live Recording'] += live
        slot['Studio Recording'] += studio

slots = list(new_track_date_created_dict.values())
data = {'track_date_created': list(new_track_date_created_dict),
        'Studio_Recording_' + ts_column_name: [s['Studio Recording'] for s in slots],
        'Live_Recording_' + ts_column_name: [s['Live Recording'] for s in slots]}

# timestamp-indexed frame, in chronological order
final_df_ts = pd.DataFrame(data=data).set_index('track_date_created').sort_index()
final_df_ts.shape

(70224, 2)

<h6> Saving DataFrame on file </h6>

In [367]:
# write the hourly time series to CSV, keeping the timestamp index and the column names
filename = "TS_album_type_{}.csv".format(ts_column_name)
final_df_ts.to_csv('../../data/fma_metadata/' + filename, header=True, index=True)