In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# data = pd.read_csv(r'../data/music.csv')

## feature engineering

1. generate decades: missing years are filled with 0
2. artist name: generate the first character of the artist name and the first character of its last word (string type)
3. artist mbtags: artist genre.
   - generate the number of artist genres.
   - turn each genre string into an id, for at most 3 genres.
4. similar: turn the similar artist into the hotness of that similar artist.
5. terms: turn string into id
6. remove outliers: trim huge outliers into [0,1]
    - terms_freq
    - time_signature_confidence
7. generate tf-idf sparse matrix:
    - title
    - release name
8. drop location related columns:
    - latitude
    - longitude
    - location
    
### future possible feature engineering
1. generate sentiment analysis on title and release name
2. turn the string variable (non-ordered) into dummy or other feature extraction methods like PCA:
    - artist first and last name
    - mbtags: mbtag_0, mbtag_1, mbtag_2
    - term_id
    
    
    

In [3]:
stop_words = set(stopwords.words('english'))
def remove_stopwords(corpus, stop_words=stop_words, stem=False):
    """Drop stop words from every document of ``corpus`` and optionally stem the rest.

    Parameters
    ----------
    corpus : iterable of str
        Documents to clean; each one is tokenised on whitespace.
    stop_words : collection of str, optional
        Words to discard. Defaults to the NLTK English stop-word set built above.
    stem : bool, optional
        When True, every surviving token is passed through ``PorterStemmer.stem``.

    Returns
    -------
    list of str
        One cleaned document per input document, tokens re-joined with single spaces.
    """
    # Only build the stemmer when it is actually going to be used.
    stemmer = PorterStemmer() if stem else None
    result = []
    for document in corpus:
        # The NLTK list is lower-case, so the token is also compared in lower case;
        # otherwise title-cased words ("The", "Of") would never be filtered out.
        # The exact-case test is kept so custom mixed-case stop lists still work.
        kept = [
            token for token in document.split()
            if token not in stop_words and token.lower() not in stop_words
        ]
        if stem:
            kept = [stemmer.stem(token) for token in kept]
        result.append(' '.join(kept))
    return result

def tf_idf(corpus):
    """Fit a default ``TfidfVectorizer`` on ``corpus`` and vectorise it in one pass.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorise.

    Returns
    -------
    tuple
        ``(matrix, vectorizer)``: the result of ``fit_transform`` on the corpus and
        the fitted vectorizer that produced it.
    """
    fitted_vectorizer = TfidfVectorizer()
    weights = fitted_vectorizer.fit_transform(corpus)
    return weights, fitted_vectorizer

def generate_tf_idf(se):
    """Build the TF-IDF representation of a text column (release name or title).

    Missing entries are replaced by empty strings, stop words are removed and the
    remaining tokens are stemmed before the column is handed to ``tf_idf``.

    Parameters
    ----------
    se : pandas.Series
        Text column to vectorise.

    Returns
    -------
    tuple
        ``(matrix, vectorizer)`` exactly as returned by ``tf_idf``.
    """
    cleaned_docs = remove_stopwords(se.fillna(''), stem=True)
    return tf_idf(cleaned_docs)

In [4]:
def preprocess(data):
    """Engineer the model features from the raw song table.

    The input frame is copied first, so the caller's ``data`` is left untouched.

    Features added
    --------------
    * ``decade``: ``year`` floored to a multiple of 10.
    * ``artist_firstname`` / ``artist_lastname``: upper-cased first character of
      ``artist.name`` and of its last space-separated word.
    * ``mbtags_count``: number of tags parsed from ``artist_mbtags``.
    * ``mbtag_0`` .. ``mbtag_{k-1}``: tag ids, zero-padded, where ``k`` is the
      largest ``mbtags_count`` in the frame.
    * ``similar_hotness``: ``artist.hotttnesss`` of the artist referenced by
      ``similar`` (NaN when that artist is not in the frame).
    * ``term_id``: integer id of ``terms`` (0 for a missing term).
    * ``terms_freq`` and ``time_signature_confidence`` are capped at 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``year``, ``artist.name``, ``artist_mbtags``,
        ``artist.id``, ``artist.hotttnesss``, ``similar``, ``terms``,
        ``terms_freq`` and ``time_signature_confidence``.

    Returns
    -------
    tuple
        ``(data, tag2idx, idx2tag, term2idx, idx2term)``: the enriched copy of the
        frame and the tag/term <-> id lookup dictionaries. Id 0 stands for "missing"
        in both vocabularies.
    """
    # Never mutate the caller's frame: the original added some columns in place
    # and then rebound ``data`` via concat, leaving the input half-modified.
    data = data.copy()

    # ----------------------- generate decades ------------------------------------- #
    data['decade'] = data['year'] // 10 * 10

    # ----------------------- artist name ------------------------------------------ #
    def _initial(text):
        # Slicing (not indexing) so an empty string yields '' instead of raising;
        # non-string values (missing names) become NaN.
        return text[:1].upper() if isinstance(text, str) else np.nan

    data['artist_firstname'] = data['artist.name'].apply(_initial)
    data['artist_lastname'] = data['artist.name'].apply(
        lambda name: _initial(name.split(' ')[-1]) if isinstance(name, str) else np.nan
    )

    # ----------------------- mbtags ------------------------------------------------ #
    # "Terms" are the tags provided by The Echo Nest. They can come from a number of
    # places, but mostly blogs as far as we understand.
    # "Mbtags" are musicbrainz tags, specifically applied by humans to a particular
    # artist. This explains why there are fewer of them (see 'mbtags_count'), but they
    # are usually very clean and informative. For instance, if you want to create a
    # genre recognition task where classes are mutually exclusive, mbtags are likely
    # to be more reliable than terms.
    #
    # NOTE(review): splitting on the bare substring 'and' also cuts tags that merely
    # contain those letters (e.g. 'england'). Left unchanged because the intended
    # delimiter cannot be confirmed from here -- verify against the raw data.
    mbtags = data['artist_mbtags'].apply(
        lambda se: [x.strip() for x in se.split('and')] if isinstance(se, str) else np.nan
    )
    # Sorted so the tag ids are reproducible between runs (set order is not), and
    # built with a comprehension instead of the quadratic ``Series.sum()`` of lists.
    unique_tags = sorted({tag for tags in mbtags.dropna() for tag in tags})
    n_tags = len(unique_tags)
    tag2idx = dict(zip(unique_tags, range(1, n_tags + 1)))
    tag2idx[np.nan] = 0
    idx2tag = dict(zip(range(1, n_tags + 1), unique_tags))
    idx2tag[0] = np.nan

    data['mbtags_count'] = mbtags.apply(lambda se: len(se) if isinstance(se, list) else 0)
    n_tags_per_song = int(data['mbtags_count'].max()) if len(data) else 0
    # One id column per tag slot. Rows are padded to the observed maximum, not to a
    # hard-coded 3, so the column names below always match the data width.
    tag_ids = [
        [tag2idx[tag] for tag in tags] + [0] * (n_tags_per_song - len(tags))
        if isinstance(tags, list) else [0] * n_tags_per_song
        for tags in mbtags
    ]
    tag_columns = pd.DataFrame(
        tag_ids,
        index=data.index,
        columns=['mbtag_' + str(i) for i in range(n_tags_per_song)],
    )
    data = pd.concat([data, tag_columns], axis=1)

    # -------------------------- similar --------------------------------------------------- #
    # turn similar to the hotness of the similar artists
    artist_hot = data[['artist.id', 'artist.hotttnesss']].drop_duplicates()
    id2hot = dict(zip(artist_hot['artist.id'], artist_hot['artist.hotttnesss']))
    data['similar_hotness'] = data['similar'].apply(lambda se: id2hot.get(se, np.nan))

    # -------------------------- terms ------------------------------------------------------ #
    # turn the terms to id; NaN is excluded from the vocabulary so that it does not
    # consume an id, and is mapped to 0 explicitly below.
    terms = data['terms'].dropna().unique()
    n_terms = len(terms)
    term2idx = dict(zip(terms, range(1, n_terms + 1)))
    term2idx[np.nan] = 0
    idx2term = dict(zip(range(1, n_terms + 1), terms))
    idx2term[0] = np.nan
    # pd.isna instead of a dict lookup on NaN, which only works by object identity.
    data['term_id'] = data['terms'].apply(lambda se: 0 if pd.isna(se) else term2idx[se])

    # --------------------------- remove the outliers ---------------------------------------- #
    # clip() replaces the chained assignment ``data[col][mask] = 1``, which warns and
    # is silently ignored when pandas Copy-on-Write is enabled.
    data['terms_freq'] = data['terms_freq'].clip(upper=1)
    data['time_signature_confidence'] = data['time_signature_confidence'].clip(upper=1)

    # ---------------------------- to do ------------------------------------------------------ #
    # add sentiment analysis on title and release name.

    return data, tag2idx, idx2tag, term2idx, idx2term


In [5]:
# data,tag2idx,idx2tag,term2idx,idx2term=preprocess(data)
# data.to_csv(r'..\data\music_clean.csv')

# Load the preprocessed data from disk and work from it from now on

In [11]:
# Reload the cleaned table from disk; the first CSV column is used as the index.
data = pd.read_csv(r'../data/music_clean.csv', index_col=0)

In [12]:
# TF-IDF matrix and fitted vectorizer for each of the two free-text columns.
release_tfidf, release_voc = generate_tf_idf(data['release.name'])
title_tfidf, title_voc = generate_tf_idf(data['title'])

In [16]:
# Summary statistics of the cleaned table, displayed as the cell output.
data.describe()

Unnamed: 0,artist.hotttnesss,artist_mbtags_count,bars_confidence,bars_start,beats_confidence,beats_start,duration,end_of_fade_in,familiarity,key,...,time_signature,time_signature_confidence,year,decade,mbtags_count,mbtag_0,mbtag_1,mbtag_2,similar_hotness,term_id
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,9996.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,2316.0,10000.0
mean,0.385552,0.524686,0.239595,1.065246,0.613963,0.428497,240.622038,0.756708,0.565456,5.36658,...,3.564443,0.509996,934.7046,932.431,0.4588,52.1838,15.3377,0.0178,0.441496,119.4925
std,0.143647,0.884095,0.288259,1.723468,0.322441,0.806217,246.08409,1.858958,0.160161,9.671788,...,1.26662,0.37344,996.650657,994.225985,0.651111,84.253195,54.517359,1.78,0.134207,100.028535
min,0.0,0.0,0.0,0.0,0.0,-60.0,1.04444,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.325266,0.0,0.035,0.44159,0.40975,0.194655,176.0322,0.0,0.467611,2.0,...,3.0,0.09775,0.0,0.0,0.0,0.0,0.0,0.0,0.372227,40.0
50%,0.380742,0.0,0.12,0.78546,0.686,0.332585,223.05914,0.199,0.563666,5.0,...,4.0,0.551,0.0,0.0,0.0,0.0,0.0,0.0,0.421418,94.0
75%,0.453858,1.0,0.351,1.224075,0.882,0.500753,276.37506,0.421,0.66802,8.0,...,4.0,0.864,2000.0,2000.0,1.0,93.0,0.0,0.0,0.526847,177.0
max,1.082503,9.0,8.85524,59.74354,1.0,12.24583,22050.0,43.119,1.0,904.80281,...,7.0,1.0,2010.0,2010.0,3.0,284.0,281.0,178.0,1.021256,459.0


## Benchmark model