In [None]:
import pandas as pd
import numpy as np
import requests
import os 
from dotenv import load_dotenv
import requests
from tqdm import tqdm
import pickle
import re
pd.options.display.max_columns = None
from fuzzywuzzy import process, fuzz
from collections import Counter
import calendar

import sys
sys.path.append("../")
import src.soporteAPIs as sa
sys.path.append("../")
import src.biblioteca as bb

# Importing pickle file

In [None]:
# Load the previously pickled tracks DataFrame.
# Only unpickle files produced by this project: pickle can execute code on load.
with open('../data/pickle/artist.pickle', 'rb') as pickle_file:
    tracks = pickle.load(pickle_file)

In [None]:
tracks.head()

In [None]:
# Shorten the cleaned column names; rebinding avoids mutating the frame in place.
column_aliases = {'track_clean': 'track', 'artist_clean': 'artist'}
tracks = tracks.rename(columns=column_aliases)
tracks.head()

I will split the `artist` column to separate the main artist (`artist_0`) from any featured artists (`artist_1`, `artist_2`).

In [None]:
# "tyler, the creator" contains a comma of its own, so it is rewritten first to
# keep the comma split below from cutting that name in two.
artist_names = tracks['artist'].str.replace('tyler, the creator', 'tyler the creator')

# At most two splits -> up to three columns (0, 1, 2).
test = artist_names.str.split(',', n=2, expand=True)
test

In [None]:
# Attach the split artist parts (integer-named columns) next to the existing columns.
tracks = pd.concat((tracks, test), axis='columns')
tracks.head()

In [None]:
# Give the integer-named split columns descriptive names.
artist_part_names = {0: 'artist_0', 1: 'artist_1', 2: 'artist_2'}
tracks = tracks.rename(columns=artist_part_names)
tracks.head()

# lastfm
In this part of the process I will extract from the LastFM API all the data related to the track and artist genres.

First, I need to configure my credentials.

In [None]:
load_dotenv()

In [None]:
# Read the Last.fm credentials from the environment (load_dotenv() ran in the cell above).
# Both values are None when the variable is not set.
api = os.environ.get("lastfm-id")
username = os.environ.get("lastfm-user")

Testing to gather the data from the api with a function:

In [None]:
# Smoke test: request one known artist and look at the `bio` part of the response.
artist_request = {'method': 'artist.getInfo', 'artist': 'Harry Styles'}
res = sa.getLastFMData(artist_request)
res.json()['artist']['bio']

First I try on a slice of my dataframe

In [None]:
tqdm.pandas()

# Trial run on a small random slice before calling the API for every row.
# - The size is capped at len(tracks): sample() raises when asked for more rows than exist.
# - random_state makes the trial slice reproducible across kernel restarts.
# - .copy() gives an independent frame, so adding columns never touches `tracks`.
test = tracks.sample(min(100, len(tracks)), random_state=42).copy()
test[['bio', 'artist_tag']] = test.progress_apply(
    lambda row: sa.getArtistInfo(row.artist_0), axis=1, result_type='expand'
)

In [None]:
test.head()

This is for the whole dataframe:

In [None]:
tqdm.pandas()


def _fetch_artist_info(row):
    """Look up the row's main artist (`artist_0`) through sa.getArtistInfo."""
    return sa.getArtistInfo(row.artist_0)


# One lookup per row; the result is expanded into the `bio` and `artist_tag` columns.
tracks[['bio', 'artist_tag']] = tracks.progress_apply(_fetch_artist_info, axis=1, result_type='expand')

In [None]:
tracks.head()

Now I get the tags for each individual track:

In [None]:
# Smoke test: request one known track and look at its top tags.
track_request = {'method': 'track.getInfo', 'artist': 'Harry Styles', 'track': 'As It Was'}
res = sa.getLastFMData(track_request)
res.json()['track']['toptags']['tag']

This is how I will get info for a slice of my dataframe:

In [None]:
tqdm.pandas()

# Trial run on a small random slice before calling the API for every row.
# - The size is capped at len(tracks): sample() raises when asked for more rows than exist.
# - random_state makes the trial slice reproducible across kernel restarts.
# - .copy() gives an independent frame, so adding columns never touches `tracks`.
test = tracks.sample(min(100, len(tracks)), random_state=42).copy()
test[['track_tag', 'published_track']] = test.progress_apply(
    lambda row: sa.getTrackTags(row.artist_0, row.track), axis=1, result_type='expand'
)

In [None]:
# Number of sampled rows for which no track tags came back.
missing_track_tags = test['track_tag'].isnull().sum()
print(missing_track_tags)
test.head()

In [None]:
test.tail()

This is for the whole dataset:

In [None]:
tqdm.pandas()


def _fetch_track_tags(row):
    """Look up the row's track for its main artist (`artist_0`) through sa.getTrackTags."""
    return sa.getTrackTags(row.artist_0, row.track)


# One lookup per row; the result is expanded into `track_tag` and `published_track`.
tracks[['track_tag', 'published_track']] = tracks.progress_apply(_fetch_track_tags, axis=1, result_type='expand')

In [None]:
# Number of rows without track tags / without artist tags.
missing_track_tags = tracks['track_tag'].isnull().sum()
missing_artist_tags = tracks['artist_tag'].isnull().sum()
print(missing_track_tags, missing_artist_tags)
tracks.head()

In [None]:
# Checkpoint the enriched frame so the API calls above do not have to be repeated.
with open('../data/pickle/lastfm_dump.pickle', 'wb') as dump_file:
    pickle.dump(tracks, dump_file)

# Data Cleaning

In [None]:
# Reload the checkpoint written after the Last.fm enrichment.
# Only unpickle files produced by this project: pickle can execute code on load.
with open('../data/pickle/lastfm_dump.pickle', 'rb') as dump_file:
    tracks = pickle.load(dump_file)

Cleaning the new `bio` data

In [None]:
# Expand every `bio` entry into its own columns by converting it to a Series.
# NOTE(review): presumably each bio is a dict from the Last.fm response, so the
# dict keys become column names — confirm against sa.getArtistInfo.
bio = tracks['bio'].apply(pd.Series)
bio.head()

Cleaning the `artist_tag` column

In [None]:
# Expand every `artist_tag` entry into one column per position/key.
artist_tag = tracks['artist_tag'].apply(pd.Series)

# Prefix each expanded column label so it can be told apart after merging back.
new_columns = ['aritist_genre_' + str(label) for label in artist_tag.columns.to_list()]
artist_tag.columns = new_columns
artist_tag.head()

In [None]:
# Cleaning unnecessary dictionaries: replace every tag dict with its first value.
#
# The previous version assigned into the row Series yielded by iterrows(). pandas
# does not guarantee that such writes reach the DataFrame (the rows may be copies),
# and its bare `except` hid every error. The unwrapping is now applied column by
# column and the result is bound back to `artist_tag`.
def _first_artist_tag_value(cell):
    """Return the first value of a non-empty dict; return any other cell unchanged."""
    if isinstance(cell, dict) and cell:
        return next(iter(cell.values()))
    return cell


artist_tag = artist_tag.apply(lambda column: column.map(_first_artist_tag_value))
artist_tag.head()

Cleaning `track_tag` column:

In [None]:
# Expand every `track_tag` entry into one column per position/key.
track_tag = tracks['track_tag'].apply(pd.Series)

# Prefix each expanded column label so it can be told apart after merging back.
new_columns = ['track_genre_' + str(label) for label in track_tag.columns.to_list()]
track_tag.columns = new_columns
track_tag.head()

In [None]:
# Cleaning unnecessary dictionaries: replace every tag dict with its first value.
#
# The previous version assigned into the row Series yielded by iterrows(). pandas
# does not guarantee that such writes reach the DataFrame (the rows may be copies),
# and its bare `except` hid every error. The unwrapping is now applied column by
# column and the result is bound back to `track_tag`.
def _first_track_tag_value(cell):
    """Return the first value of a non-empty dict; return any other cell unchanged."""
    if isinstance(cell, dict) and cell:
        return next(iter(cell.values()))
    return cell


track_tag = track_tag.apply(lambda column: column.map(_first_track_tag_value))
track_tag.head()

Merging the cleaned columns into the original dataframe and removing unnecessary columns:

In [None]:
# Put the original frame and the three expanded frames side by side (row index aligned).
tracks_final = pd.concat((tracks, bio, artist_tag, track_tag), axis='columns')
tracks_final.head()

In [None]:
# Drop the raw nested columns now that their expanded versions are attached, plus
# the `links` and `0` columns.
# NOTE(review): `links` and `0` presumably come from expanding `bio` (`0` would
# appear for rows whose bio was not a dict) — confirm against the data.
# errors='ignore' keeps the cell from raising KeyError when one of these columns is
# absent, which also makes it safe to re-run; rebinding replaces the in-place drop.
raw_columns = ['bio', 'artist_tag', 'track_tag', 'links', 0]
tracks_final = tracks_final.drop(columns=raw_columns, errors='ignore')
tracks_final.head()

# Adding new columns

Generating a new, clean genres column based on the ones extracted from the APIs.

In [None]:
tracks_final.head(1)

In [None]:
# Prefer the first artist genre; fall back to the first track genre when the
# artist genre is missing. Both are lower-cased.
artist_genre = tracks_final['aritist_genre_0']
track_genre = tracks_final['track_genre_0']
tracks_final['music_genre'] = np.where(
    artist_genre.isnull(),
    track_genre.str.lower(),
    artist_genre.str.lower(),
)
tracks_final['music_genre'].value_counts()

In [None]:
# Tally how often each genre value occurs in `music_genre`.
count_genres = Counter(tracks_final['music_genre'].tolist())
print(f"There are {len(count_genres)} different music genres.")

In [None]:
# Keep only the 25 most frequent genres, as {genre: count}.
dict_genres = {genre: occurrences for genre, occurrences in count_genres.most_common(25)}

In [None]:
fuzz.ratio('hip-hop', "hip hop")

In [None]:
def _clean_genre(row):
    """Pass the row's `music_genre` and the top-genre dict to sa.music_genres."""
    return sa.music_genres(row["music_genre"], dict_genres)


tracks_final["clean_music_genre"] = tracks_final.apply(_clean_genre, axis=1)
tracks_final["clean_music_genre"].value_counts()

Adding gender info column.

In [None]:
tracks_final.head(1)

In [None]:
# Derive a `gender` label from each artist `summary` text using sa.generos
# (defined in src/soporteAPIs, not shown here).
# NOTE(review): a later cell compares this column with 'group', so that appears to
# be one of the labels — confirm the full label set against the helper.
tracks_final["gender"] = tracks_final['summary'].apply(sa.generos)

In [None]:
tracks_final['gender'].value_counts() / tracks_final.shape[0]

Artist age

-- taking the first date mentioned in the artist bio text (`content` column) as the birthday

In [None]:
# Two date layouts are recognised: "Month D, YYYY" and "D Month YYYY".
date_pattern = re.compile(r'\w{1,} \d{1,2}, \d{4}|\d{1,2} \w{4,} \d{4}')

# Collect every date found in `content` (non-strings are stringified first) and
# keep the first match; rows without any match end up as NaN.
date_matches = tracks_final['content'].apply(lambda text: date_pattern.findall(str(text)))
tracks_final['birthday'] = date_matches.str[0]
tracks_final['birthday'].value_counts()

In [None]:
tracks_final[tracks_final['gender'] != 'group']['birthday'].isnull().sum()

In [None]:
# Break the birthday string on spaces into its tokens and store them
# in the helper columns 1, 2 and 3.
birthday_parts = tracks_final['birthday'].str.split(' ', expand=True)
tracks_final[[1, 2, 3]] = birthday_parts
tracks_final.head(1)

In [None]:
# Derive `month_text` and `day` from the birthday tokens stored in columns 1 and 2.
# The birthday regex produces two layouts:
#   "Month D, YYYY" -> column 1 = month, column 2 = "D,"  (the comma marks this layout)
#   "D Month YYYY"  -> column 1 = day,   column 2 = month
#
# This replaces a row-by-row iterrows() loop that had three problems:
#   * it assigned into the yielded row Series, which may be copies, so the values
#     were not guaranteed to be written back to the DataFrame;
#   * the day-first layout raised no exception and matched no branch, so it was skipped;
#   * `row[2] == np.nan` is always False, and the bare `except` hid real errors.
first_token = tracks_final[1]
second_token = tracks_final[2]

# na=False: rows without a birthday are not month-first; they take the "other"
# side below, where both tokens are already NaN.
month_first = second_token.str.contains(',', regex=False, na=False)

tracks_final['month_text'] = first_token.where(month_first, second_token)
tracks_final['day'] = second_token.str.replace(',', '', regex=False).where(month_first, first_token)

In [None]:
tracks_final['month_text'].value_counts()

In [None]:
tracks_final.head(1)

In [None]:
# Convert every `month_text` value with sa.moth_as_numer (defined in
# src/soporteAPIs, not shown here) and store the result in `month`.
# NOTE(review): presumably maps a month name to its number — verify against the
# helper, including what it returns for missing values.
tracks_final["month"] = tracks_final["month_text"].apply(sa.moth_as_numer)
tracks_final["month"].value_counts()

# EDA

Total number of rows and columns of the dataframe

In [None]:
tracks_final.shape

Counting duplicated rows

In [None]:
tracks_final.duplicated().sum()

Counting null values

In [None]:
tracks_final.isnull().sum() / tracks_final.shape[0]

In [None]:
tracks_final.info()

The only null values in the dataset are in the `streams` column. I will enrich this column with API data.

Main statistic values for numeric columns

In [None]:
tracks_final.describe().T

Main statistic values for categorical columns

In [None]:
tracks_final.describe(include='object').T

# Export

In [None]:
# Persist the final, cleaned frame for the next stage.
with open('../data/pickle/lastfm.pickle', 'wb') as export_file:
    pickle.dump(tracks_final, export_file)