# Lyric EDA #

## Imports, Inits, and Method definitions ##

In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()

%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import importlib

import mcnulty_methods
import word_utils
importlib.reload(mcnulty_methods);
importlib.reload(word_utils);
from mcnulty_methods import get_formatted_feature_df, get_lyrics_for_tracks
from word_utils import get_word_counts, generate_word_charts


In [None]:
# Use larger fonts for titles, axis labels, and tick labels on every plot
# drawn after this cell.
mpl.rcParams.update({
    'axes.titlesize': 16,
    'axes.labelsize': 16,
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
})

## Fetch Tracks for Particular Genres

In [None]:
# Create the SQLAlchemy engine, then take a raw DBAPI connection from it;
# `conn` is what gets passed to the project helper functions below.
engine = create_engine('postgresql://*******@localhost:5432/mcnulty_songs')
conn = engine.raw_connection()
cursor = conn.cursor()

In [None]:
# Load the per-track feature table via the project helper. Its 'term' column
# is used further down to select tracks by genre label.
features = get_formatted_feature_df(conn)

In [None]:
# Display the dimensions of the feature table.
features.shape

In [None]:
# Peek at five randomly sampled rows of the feature table.
features.sample(5)

## Fetch Lyrics from Tracks ##

In [None]:
genre_labels = ['hip hop', 'pop']
unique_words = set()

# Fetch the lyrics for each genre's tracks, keeping one result per genre
# label in the order the genres are listed above.
lyrics_by_genre = {}
for genre_label in genre_labels:
    # Rows of the feature table tagged with this genre.
    genre_df = features[features['term'] == genre_label]
    genre_ids = genre_df['track_id']

    genre_lyrics = get_lyrics_for_tracks(conn, genre_ids)
    lyrics_by_genre[genre_label] = genre_lyrics

# Per-genre results under their own names for the genre-specific cells.
hiphop_lyrics = lyrics_by_genre.get('hip hop')
pop_lyrics = lyrics_by_genre.get('pop')

# Every genre's lyrics stacked together, in genre order.
all_lyrics = pd.concat(list(lyrics_by_genre.values()))
    
    

In [None]:
# Sum each word's count across all fetched lyrics and rank the words from
# most to least frequent.
total_count_of_words = (
    all_lyrics
    .groupby('word')['count']
    .sum()
    .reset_index()
    .sort_values('count', ascending=False)
)

# Show the ten most frequent words.
total_count_of_words.head(10)

In [None]:
# Total word count per track across all fetched genres.
track_word_counts = all_lyrics.groupby('track_id')['count'].sum()

# NOTE(review): the sort below is left disabled. Without reset_index() the
# result above is presumably a Series rather than a frame with a 'count'
# column, so this call would not apply as written - confirm before enabling.
#track_word_counts.sort_values('count', ascending=False, inplace=True)

## Hip Hop: Analyze per-track word counts ##

In [None]:
# Total the word counts for each hip hop track and order the tracks from
# the largest total down.
track_word_counts = (
    hiphop_lyrics
    .groupby('track_id')['count']
    .sum()
    .reset_index()
    .sort_values('count', ascending=False)
)

# Summary statistics of the per-track totals.
track_word_counts['count'].describe()

## Pop: Analyze per-track word counts ##

In [None]:
# Total the word counts for each pop track and order the tracks from the
# largest total down.
track_word_counts = (
    pop_lyrics
    .groupby('track_id')['count']
    .sum()
    .reset_index()
    .sort_values('count', ascending=False)
)

# Summary statistics of the per-track totals.
track_word_counts['count'].describe()

We may want to consider dropping tracks that have very few words.

In [None]:
# Index the feature table by track_id, in place.
# The guard makes the cell safe to re-run: once 'track_id' has become the
# index it is no longer a column, and a second set_index call would raise
# KeyError. If the column is genuinely missing, the KeyError still surfaces.
if features.index.name != 'track_id':
    features.set_index('track_id', inplace=True)

## Word Analysis and Reshaping for Modeling ##