# Lyric DEA #

## Imports, Inits, and Method definitions ##

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()

%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

import importlib

import mcnulty_methods
import word_utils
importlib.reload(mcnulty_methods);
importlib.reload(word_utils);
from mcnulty_methods import get_formatted_feature_df, get_lyrics_for_tracks
from word_utils import get_word_counts, generate_word_charts

In [2]:
mpl.rcParams['axes.titlesize'] = 16
mpl.rcParams['axes.labelsize'] = 16
mpl.rcParams['xtick.labelsize'] = 13
mpl.rcParams['ytick.labelsize'] = 13

test_size = 0.2
random_state = 10

In [3]:
def get_artist_term_counts():
    term_counts = pd.read_csv('top_artist_terms.csv', index_col='artist_id', names=['artist_id','term', 'term_count'])
    term_counts = term_counts[~(term_counts['term'] == 'term')]
    del term_counts['term_count']
    return term_counts


def get_term_counts():
    return pd.read_csv('term_counts.csv', names=['term', 'count'])

## Fetch Tracks for Particular Genres

In [4]:
conn = create_engine('postgresql://@localhost:5432/mcnulty_songs').raw_connection()
cursor = conn.cursor()

  """)


In [5]:
features = get_formatted_feature_df(conn)

In [6]:
features.shape

(66730, 11)

In [7]:
features.sample(5)

Unnamed: 0,title,artist_id,artist_name,track_id,term,duration,music_key,loudness,mode,music_tempo,time_signature
39875,I Just Wanna Stop,ARZVJ641187FB36FA4,Alexander Zonjic,TRUNYXI12903CF3832,pop,230.79138,7,-9.499,1,148.065,4
21201,The Moon Was Yellow,AR4F9LJ1187FB44370,Margaret Whiting,TRQBJHT128F92E1D9C,pop,175.46404,11,-17.288,0,139.865,1
4120,Rescuing The G3 Officer,ARV3JXB1187B9B1660,Michael Giacchino,TRWUVGR128F931D5ED,pop,247.14404,5,-20.035,1,49.912,1
61529,White Christmas,ARXZIHJ1187B9A4ED9,Ray Conniff,TRXKVPV128F427194A,pop,167.73179,8,-19.17,1,123.071,1
58213,I'm Just A Prisoner (Of Your Good Lovin'),ARBONOI1187B9A7D62,Janiva Magness,TROEACU128F42510F8,pop,230.47791,1,-5.863,1,113.156,4


## Fetch Lyrics from Tracks ##

In [8]:
genre_labels = ['hip hop', 'pop']
unique_words = set()

all_lyrics = None
hiphop_lyrics = None
pop_lyrics = None

for genre_label in genre_labels:
    genre_df = features[(features['term'] == genre_label)]

    genre_ids = genre_df['track_id']
    
    genre_lyrics = get_lyrics_for_tracks(conn, genre_ids)
    
    if genre_label == 'pop':
        pop_lyrics = genre_lyrics
    elif genre_label == 'hip hop':
        hiphop_lyrics = genre_lyrics
        
    if all_lyrics is None:
        all_lyrics = genre_lyrics
    else:
        all_lyrics = pd.concat([all_lyrics, genre_lyrics])
    


In [9]:
total_count_of_words = all_lyrics.groupby('word')['count'].sum().reset_index()

total_count_of_words.sort_values('count', ascending=False, inplace=True)

total_count_of_words.head(10)

Unnamed: 0,word,count
2146,like,22699
2045,know,21079
2213,love,20439
1569,get,19670
1623,got,16060
1607,go,12534
2585,oh,11859
2519,nigga,11441
3267,see,11026
2608,one,10963


In [10]:
track_word_counts = all_lyrics.groupby('track_id')['count'].sum()

#track_word_counts.sort_values('count', ascending=False, inplace=True)

## Hip Hop: Analyze per track word counts ##

In [11]:
track_word_counts = hiphop_lyrics.groupby('track_id')['count'].sum().reset_index()

track_word_counts.sort_values('count', ascending=False, inplace=True)

track_word_counts['count'].describe()

count    3266.000000
mean      176.584507
std       109.353813
min         1.000000
25%        88.000000
50%       172.000000
75%       246.000000
max      2113.000000
Name: count, dtype: float64

## Pop: Analyze per track word counts ##

May want to consider dropping tracks with very few words

In [12]:
features.set_index('track_id', inplace=True)

## Word Analysis and Reshaping for Modeling ##

## Feature Selection ##

Starting with the top x words found per song in the dataset, we'll add features and record the results from our classification models

In [16]:
word_song_appearance, total_word_appearance = get_word_counts(all_lyrics)
word_subset = word_song_appearance.iloc[:100]
remaining_lyrics = pd.merge(all_lyrics.reset_index(), word_subset[['word']], how='right', on='word')

all_lyrics.reset_index()

Unnamed: 0,track_id,word,count,is_test
0,TRHMWZB128F427AE83,vega,1,False
1,TRHMWZB128F427AE83,statu,1,False
2,TRHMWZB128F427AE83,larg,1,False
3,TRHMWZB128F427AE83,general,1,False
4,TRHMWZB128F427AE83,firm,1,False
5,TRHMWZB128F427AE83,clip,1,False
6,TRHMWZB128F427AE83,credit,1,False
7,TRHMWZB128F427AE83,isol,1,False
8,TRHMWZB128F427AE83,product,1,False
9,TRHMWZB128F427AE83,approach,7,False


In [142]:
def get_X_Y(word_sample_size, include_music=True):
    word_song_appearance, total_word_appearance = get_word_counts(all_lyrics)
    word_subset = word_song_appearance.iloc[:word_sample_size]

    remaining_lyrics = pd.merge(all_lyrics.reset_index(), word_subset[['word']], how='right', on='word')

    remaining_lyrics.set_index('track_id', inplace=True)
    del remaining_lyrics['is_test']
    tid_lyrics = remaining_lyrics.pivot(columns='word', values='count')

    music_features = ['music_duration','music_key','music_loudness', 'music_mode', 'music_tempo', 'music_time_signature']
    if not include_music:
        music_features = []
        
    term_only = features[['term'] + music_features].reset_index().set_index('track_id')
    feature_names = music_features + list(tid_lyrics.columns)
    # complete set,= tid_index -> genre -> word_a -> .... -> word_z
    complete_set = pd.merge(term_only, tid_lyrics, left_index=True, right_index=True, how='right')
    complete_set.fillna(0, inplace=True)


    y_text = np.asarray(complete_set.iloc[:,0])
    y = np.array([1 if val=='hip hop' else 0 for val in y_text])
    X = np.asarray(complete_set.iloc[:,1:])

    return X,y,feature_names

In [144]:
X, y, feature_names = get_X_Y(450)
X_val, X_test, y_val, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

X_val_fit, X_val_test, y_val_fit, y_val_test = train_test_split(X_val, y_val, test_size=test_size, random_state=random_state)

## Final Run ##

- One with lyrics only, a RandomForestClassifier with the top 300 words. This value was obtained from viewing the word_count trial runs in the 'Expanded model testing with 2-1 hiphop-pop weight' section in the cv_files notebook.
- Another with lyrics and the echo-nest music data. I found that a RandomForestClassifier of the top 500 words works best.

### Lyrics Only ###

In [151]:
X, y, feature_names = get_X_Y(300, include_music=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
lyrics_only = RandomForestClassifier(n_estimators=1000, class_weight={1 : 2, 0 : 1})

lyrics_only.fit(X_train, y_train)
y_test_pred = lyrics_only.predict(X_test)

print('Accuracy: {}'.format(accuracy_score(y_test, y_test_pred)))
print('Recall: {}'.format(recall_score(y_test, y_test_pred)))
print('Precision: {}'.format(precision_score(y_test, y_test_pred)))
print('F1: {}'.format(f1_score(y_test, y_test_pred)))

Accuracy: 0.8168684107770402
Recall: 0.5306427503736921
Precision: 0.696078431372549
F1: 0.6022052586938084


### Lyrics and Music Data ###

In [152]:
X, y, feature_names = get_X_Y(300, include_music=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
lyrics_and_music = RandomForestClassifier(n_estimators=1000, class_weight={1 : 2, 0 : 1})

lyrics_and_music.fit(X_train, y_train)
y_test_pred = lyrics_and_music.predict(X_test)

print('Accuracy: {}'.format(accuracy_score(y_test, y_test_pred)))
print('Recall: {}'.format(recall_score(y_test, y_test_pred)))
print('Precision: {}'.format(precision_score(y_test, y_test_pred)))
print('F1: {}'.format(f1_score(y_test, y_test_pred)))

Accuracy: 0.8211636079656385
Recall: 0.5261584454409567
Precision: 0.7139959432048681
F1: 0.6058519793459552
