# Lyric DEA #

## Imports, Inits, and Method definitions ##

In [4]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()

%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing

import importlib

import mcnulty_methods
import word_utils
importlib.reload(mcnulty_methods);
importlib.reload(word_utils);
from mcnulty_methods import get_formatted_feature_df, get_lyrics_for_tracks
from word_utils import get_word_counts, generate_word_charts

In [None]:
# Plot typography: larger titles and axis labels, slightly smaller tick labels.
mpl.rcParams.update({
    'axes.titlesize': 16,
    'axes.labelsize': 16,
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
})

# Hold-out fraction and seed shared by the train_test_split calls in this notebook.
test_size, random_state = 0.2, 10

In [3]:
def get_artist_term_counts():
    """Load the artist -> term table from ``top_artist_terms.csv``.

    The file is read with explicit column names, so a header line in the
    file arrives as an ordinary data row; rows whose ``term`` equals the
    literal string ``'term'`` are dropped to discard it.

    Returns:
        DataFrame indexed by ``artist_id`` with a single ``term`` column
        (``term_count`` is read and then discarded).
    """
    column_names = ['artist_id', 'term', 'term_count']
    raw_terms = pd.read_csv('top_artist_terms.csv', index_col='artist_id', names=column_names)
    is_header_row = raw_terms['term'] == 'term'
    return raw_terms.loc[~is_header_row].drop(columns='term_count')


def get_term_counts():
    """Read ``term_counts.csv`` into a DataFrame with columns ``term`` and ``count``."""
    column_names = ['term', 'count']
    term_table = pd.read_csv('term_counts.csv', names=column_names)
    return term_table

## Fetch Tracks for Particular Genres

In [4]:
# Open a raw DB-API connection to the local mcnulty_songs Postgres database;
# `conn` is handed to the helper functions that run the actual queries.
conn = create_engine('postgresql://@localhost:5432/mcnulty_songs').raw_connection()
cursor = conn.cursor()
# NOTE(review): this cell used to end with an orphaned closing triple-quote and
# parenthesis (apparently the tail of a removed cursor.execute(...) call), which
# made the cell a syntax error. If a query is still needed here, restore it in full.


In [5]:
# Load the track feature table through the open connection (helper from mcnulty_methods).
features = get_formatted_feature_df(conn)

In [6]:
features.shape

(66730, 11)

In [7]:
features.sample(5)

Unnamed: 0,title,artist_id,artist_name,track_id,term,duration,music_key,loudness,mode,music_tempo,time_signature
12667,Dakar veut du biff,ARQGYAE11F4C846DF6,Alpha 5.20,TRCXTRF128F42845FC,hip hop,161.04444,9,-5.202,1,92.241,4
8706,Cock Robin,ARY8MOG1187B999006,Buell Kazee,TRHIBDF128F934D1CA,pop,135.1571,4,-21.793,0,106.008,7
61198,More Than This,ARFTXLE122C8675A0A,Mark-Anthony Abel,TRXDNGY12903CA8385,pop,237.66159,11,-4.892,0,141.954,3
42581,Everytime I Think Of You,AROVVFJ1269FCD35B6,Marco Borsato / Lucie Silvas,TRLTVHA128F92FEA8F,pop,248.13669,0,-4.909,1,118.253,4
56975,Gambling,ARU41P31187B9A6DD3,Samian,TRORWVM128F9323385,hip hop,219.81995,11,-4.076,0,81.389,4


## Fetch Lyrics from Tracks ##

In [10]:
genre_labels = ['hip hop', 'pop']
unique_words = set()

# Fetch the lyric rows for each genre's tracks, keyed by genre label.
lyrics_by_genre = {}
for genre_label in genre_labels:
    genre_df = features.loc[features['term'] == genre_label]
    genre_ids = genre_df['track_id']
    genre_lyrics = get_lyrics_for_tracks(conn, genre_ids)
    lyrics_by_genre[genre_label] = genre_lyrics

hiphop_lyrics = lyrics_by_genre.get('hip hop')
pop_lyrics = lyrics_by_genre.get('pop')

# Stack the per-genre frames in genre_labels order.
genre_frames = list(lyrics_by_genre.values())
all_lyrics = pd.concat(genre_frames) if genre_frames else None
    

In [294]:
# Corpus-wide total for every word, largest first.
total_count_of_words = (
    all_lyrics
    .groupby('word')['count']
    .sum()
    .reset_index()
    .sort_values('count', ascending=False)
)

total_count_of_words.head(10)

Unnamed: 0,word,count
2146,like,22699
2045,know,21079
2213,love,20439
1569,get,19670
1623,got,16060
1607,go,12534
2585,oh,11859
2519,nigga,11441
3267,see,11026
2608,one,10963


In [295]:
# Total word count per track over the combined lyrics (a Series indexed by track_id).
# Note that track_word_counts is reassigned by the per-genre cells that follow.
track_word_counts = all_lyrics.groupby('track_id')['count'].sum()

# Disabled: descending sort of the per-track totals.
#track_word_counts.sort_values('count', ascending=False, inplace=True)

## Hip Hop: Analyze per track word counts ##

In [296]:
# Per-track word totals for the hip hop lyrics, largest first.
track_word_counts = (
    hiphop_lyrics
    .groupby('track_id')['count']
    .sum()
    .reset_index()
    .sort_values('count', ascending=False)
)

track_word_counts['count'].describe()

count    3266.000000
mean      176.584507
std       109.353813
min         1.000000
25%        88.000000
50%       172.000000
75%       246.000000
max      2113.000000
Name: count, dtype: float64

## Pop: Analyze per track word counts ##

In [297]:
# Per-track word totals for the pop lyrics, largest first.
track_word_counts = (
    pop_lyrics
    .groupby('track_id')['count']
    .sum()
    .reset_index()
    .sort_values('count', ascending=False)
)

track_word_counts['count'].describe()

count    10224.000000
mean       103.895833
std         79.404171
min          1.000000
25%         52.000000
50%         83.000000
75%        130.000000
max       1344.000000
Name: count, dtype: float64

Both genres contain tracks with as few as one counted word (see the `min` rows above), so we may want to drop tracks with very few words before modeling.

In [298]:
# Index the track features by track_id. The guard makes the cell safe to re-run:
# once 'track_id' is the index it is no longer a column, and an unguarded
# set_index('track_id') would raise KeyError.
if features.index.name != 'track_id':
    features = features.set_index('track_id')

## Word Analysis and Reshaping for Modeling ##

In [299]:
word_song_appearance, total_word_appearance = get_word_counts(all_lyrics)

# Print the number of rows in the word table followed by its summary statistics.
divider = '-------------------'
unique_word_total = word_song_appearance.shape[0]
print('Total Unique Words: {}'.format(unique_word_total))
print(divider)
print(word_song_appearance.describe())
print(divider)


Total Unique Words: 4324
-------------------
             count
count  4324.000000
mean    199.777290
std     415.279401
min       1.000000
25%      40.000000
50%      76.000000
75%     175.000000
max    5888.000000
-------------------


## Feature Selection ##

Starting with the top x words found per song in the dataset, we'll add features and record the results from our classification models

In [300]:
def plot_ROC_compute_AUC(model, model_name, X, y, plot=False):
    """Fit ``model`` on a train split and return its ROC AUC on the held-out split.

    The split uses the notebook-level ``test_size`` and ``random_state``.

    Args:
        model: classifier exposing ``fit`` and ``predict_proba``.
        model_name: label used for the curve in the plot legend.
        X: feature matrix.
        y: binary labels.
        plot: when True, also draw the ROC curve on the current matplotlib
            axes. Defaults to False, which matches the previous behaviour
            (the plotting code sat after an early ``return`` and never ran).

    Returns:
        The ROC AUC of the positive-class probabilities on the held-out split.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    model.fit(X_fit, y_fit)
    y_prob = model.predict_proba(X_holdout)[:, 1]
    auc = roc_auc_score(y_holdout, y_prob)

    if plot:
        # TODO: save these figures.
        fpr, tpr, _ = roc_curve(y_holdout, y_prob)
        plt.plot(fpr, tpr)
        # Chance diagonal; two points are enough for a straight line.
        plt.plot([0, 1], [0, 1], linestyle='--')

        plt.title('ROC Curve (Pop or Hip Hop)')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend([model_name])

    return auc

In [301]:
def create_test_result_df(rows):
    """Build the model-metrics results frame from ``rows``.

    Args:
        rows: iterable of 7-tuples (num_words, model, accuracy, precision,
            recall, f1, auc), or None for an empty frame.

    Returns:
        DataFrame with those seven columns; the five metric columns are
        cast to float64.
    """
    column_names = ['num_words', 'model', 'accuracy', 'precision', 'recall', 'f1', 'auc']
    metric_columns = column_names[2:]
    results = pd.DataFrame(rows, columns=column_names)
    return results.astype({metric: np.float64 for metric in metric_columns})

In [302]:
def create_feature_df(rows):
    """Build the feature-importance frame from ``rows``.

    Args:
        rows: iterable of 4-tuples (num_words, model, feature, importance),
            or None for an empty frame.

    Returns:
        DataFrame with those four columns; ``importance`` is cast to float64.
    """
    column_names = ['num_words', 'model', 'feature', 'importance']
    importances = pd.DataFrame(rows, columns=column_names)
    return importances.astype({'importance': np.float64})

In [319]:
def get_X_Y(word_sample_size):
    """Build the model matrix, labels and feature names for a given vocabulary size.

    The first ``word_sample_size`` rows of the word table returned by
    ``get_word_counts(all_lyrics)`` define the vocabulary (presumably the most
    widely used words come first -- confirm the ordering in ``word_utils``).
    Lyric counts for those words are pivoted to one row per track and one
    column per word, joined to the music features, and gaps are filled with 0.

    Reads the notebook-level ``all_lyrics`` and ``features`` frames.

    Args:
        word_sample_size: number of leading rows of the word table to keep.

    Returns:
        Tuple ``(X, y, feature_names)``: ``X`` holds the music features followed
        by the word-count columns, ``y`` is 1 where the track's term is
        'hip hop' and 0 otherwise, and ``feature_names`` names the columns of
        ``X`` in order.
    """
    ranked_words, _ = get_word_counts(all_lyrics)
    vocabulary = ranked_words.iloc[:word_sample_size][['word']]

    # Right join on the vocabulary: lyric rows for any other word are dropped.
    lyric_rows = pd.merge(all_lyrics.reset_index(), vocabulary, how='right', on='word')
    lyric_rows = lyric_rows.set_index('track_id').drop(columns='is_test')
    tid_lyrics = lyric_rows.pivot(columns='word', values='count')

    music_features = ['music_duration','music_key','music_loudness', 'music_mode', 'music_tempo', 'music_time_signature']

    label_and_music = features[['term'] + music_features].reset_index().set_index('track_id')
    feature_names = music_features + list(tid_lyrics.columns)

    # Column layout: term | music features | one column per vocabulary word.
    complete_set = pd.merge(
        label_and_music, tid_lyrics, left_index=True, right_index=True, how='right'
    ).fillna(0)

    y_text = np.asarray(complete_set.iloc[:, 0])
    y = np.array([int(val == 'hip hop') for val in y_text])
    X = np.asarray(complete_set.iloc[:, 1:])

    return X, y, feature_names

In [305]:
def get_winning_log_model():
    """Return a new LogisticRegression that weights class 1 twice as heavily as class 0."""
    class_weights = {1 : 2, 0 : 1}
    return LogisticRegression(class_weight=class_weights)

In [325]:
X, y, feature_names = get_X_Y(450)

split_options = dict(test_size=test_size, random_state=random_state)

# First hold out the final test set ...
X_val, X_test, y_val, y_test = train_test_split(X, y, **split_options)

# ... then split the remainder again into a fit portion and a validation portion.
X_val_fit, X_val_test, y_val_fit, y_val_test = train_test_split(X_val, y_val, **split_options)

In [None]:
# L1-penalised logistic regression on standardised features; class 1 (hip hop)
# is weighted twice as heavily as class 0.
# The solver is named explicitly because penalty='l1' is only accepted by some
# solvers: scikit-learn releases whose default solver is 'lbfgs' reject it.
model = LogisticRegression(penalty='l1', solver='liblinear', class_weight={1 : 2, 0 : 1})

scaler = preprocessing.StandardScaler()

model.fit(scaler.fit_transform(X_val_fit), y_val_fit)

# Pair each feature name with its fitted coefficient.
zipped_features = list(zip(feature_names, model.coef_[0]))
zipped_features

In [None]:
# Scale inside the cross-validation pipeline so each fold's scaler is fit on
# that fold's training portion only. Scaling all of X_val up front would leak
# the held-out fold's statistics into training.
cv_pipeline = make_pipeline(preprocessing.StandardScaler(), model)
np.mean(cross_val_score(cv_pipeline, X_val, y_val, cv=5, n_jobs=-1, scoring='f1'))

In [None]:
#C = np.logspace(0, 10, 30)
#grid = GridSearchCV(estimator=get_winning_log_model(),
#param_grid=dict(C=C), cv=5, scoring='f1')
#grid.fit(scaler.fit_transform(X_val), y_val) # entire datasets were fed here

#print ('{},{}'.format(grid.best_params_, grid.best_score_))
#for params, mean_score, scores in grid.grid_scores_:
#    print ('{},{}'.format(mean_score, params));

In [326]:
# Seed the forest with the notebook-level random_state so the CV score is reproducible.
model = RandomForestClassifier(n_estimators=1000, class_weight={1 : 2, 0 : 1}, random_state=random_state)

np.mean(cross_val_score(model, X_val, y_val, cv=5, n_jobs=-1, scoring='f1'))

0.592940252047268

In [314]:
# Fit the forest on the full X_val / y_val split so its feature importances can be read.
# NOTE(review): the recorded output below shows n_estimators=100, while the cell that
# defines `model` uses 1000 -- the output looks stale; re-run top to bottom to refresh it.
model.fit(X_val, y_val)


RandomForestClassifier(bootstrap=True, class_weight={0: 1, 1: 2},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [None]:
# Pair each feature name with its importance from `model`, most important first.
zipped_features = sorted(
    zip(feature_names, model.feature_importances_),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
#zipped_features

In [None]:
# Sweep the vocabulary size and record, for every size and model, the weight
# each feature receives (logistic coefficients / forest importances).
word_chunk_size = 10
word_upper_bound = 3010  # range() end is exclusive, so the largest vocabulary tried is 3000 words
feat_results = create_feature_df(None)
# One frame per (vocabulary size, model); concatenated at every checkpoint.
# DataFrame.append is not used because it was removed in pandas 2.0.
feat_result_frames = []

# Forests are seeded with the notebook-level random_state for reproducible importances.
tree100 = RandomForestClassifier(n_estimators=100, class_weight={1 : 2, 0 : 1}, random_state=random_state)
tree1000 = RandomForestClassifier(n_estimators=1000, class_weight={1 : 2, 0 : 1}, random_state=random_state)
tree10000 = RandomForestClassifier(n_estimators=10000, class_weight={1 : 2, 0 : 1}, random_state=random_state)
tree_models = [tree100, tree1000, tree10000]
tree_model_names = ['tree100', 'tree1000', 'tree10000']
# The solver is explicit because penalty='l1' is rejected by the lbfgs solver.
log_model = LogisticRegression(penalty='l1', solver='liblinear', class_weight={1 : 2, 0 : 1})

scaler = preprocessing.StandardScaler()

output_file_name = 'cv_files/feature_importance_results.csv'

for word_sample_size in range(10, word_upper_bound, word_chunk_size):
    X, y, feature_names = get_X_Y(word_sample_size)
    X_val, X_test, y_val, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_val_fit, X_val_test, y_val_fit, y_val_test = train_test_split(X_val, y_val, test_size=test_size, random_state=random_state)

    # Constant column tagging every row with the vocabulary size of this iteration.
    number_word_columns = [str(word_sample_size)] * len(feature_names)

    # Logistic regression: fitted on the standardised fit split.
    log_model.fit(scaler.fit_transform(X_val_fit), y_val_fit)
    model_columns = ['log'] * len(feature_names)

    # Read the coefficients from log_model, the model fitted just above. The
    # previous code read `model`, a different estimator left over from an earlier cell.
    zipped_features = list(zip(number_word_columns, model_columns, feature_names, log_model.coef_[0]))
    new_results = create_feature_df(zipped_features)

    # Checkpoint after every model so a long run can be inspected or resumed.
    feat_result_frames.append(new_results)
    feat_results = pd.concat(feat_result_frames, ignore_index=True)
    feat_results.to_csv(output_file_name)

    # Random forests: fitted on the whole X_val split (unscaled).
    for tree_name, tree in zip(tree_model_names, tree_models):
        tree.fit(X_val, y_val)
        model_columns = [tree_name] * len(feature_names)
        zipped_features = list(zip(number_word_columns, model_columns, feature_names, tree.feature_importances_))
        new_results = create_feature_df(zipped_features)

        feat_result_frames.append(new_results)
        feat_results = pd.concat(feat_result_frames, ignore_index=True)
        feat_results.to_csv(output_file_name)