# Data Pre-Processing

In [1]:

# Data processing and functions
import pandas as pd
import numpy as np
import scipy as sp
import random

# Analytics and modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Graphing and visualizing
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib import cm
from pylab import savefig

from scipy import sparse, io
import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from gensim import corpora, models, similarities
import Pyro4
import os
# Setting graphing preferences
sns.set(style="darkgrid", color_codes=True)

# Printing
import locale

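# Adopt the user's default locale (taken from the environment) for all categories;
# plots are shown inline by the %matplotlib magic further down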
locale.setlocale( locale.LC_ALL, '' )

%matplotlib inline



In [None]:
# Pick the system temp directory as the place to keep the corpus and dictionary
import tempfile
TEMP_FOLDER = tempfile.gettempdir()
folder_notice = 'Folder "{}" will be used to save temporary dictionary and corpus.'
print(folder_notice.format(TEMP_FOLDER))

In [2]:
# Load the lyrics data.
# The data directory can be overridden with the LYRIC_ANALYSIS_DIR environment
# variable so the notebook can run on other machines; when the variable is
# unset, the original author's local directory is used, as before.
path = os.environ.get(
    "LYRIC_ANALYSIS_DIR",
    r'C:\Users\abhij\Desktop\UVa Coursework\SYS 6018\lyric_analysis',
)
# `file` keeps its leading separator because it is appended to `path` by plain
# string concatenation below.
file = "/Song_sample_file.csv"

df = pd.read_csv(path+file)

In [4]:
# Drop the leftover index column from the merge and check the shape.
# errors="ignore" makes the cell safe to re-run: once the column is gone, a
# second execution no longer raises KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")
df.shape

(46574, 5)

In [None]:
# Split the data into separate train and test sets
feature_columns = ["title", "artist", "full_lyrics"]
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns], df["chart"], random_state=42
)
# Re-attach the target column to the features of each split
train = pd.concat((X_train, y_train), axis="columns")
test = pd.concat((X_test, y_test), axis="columns")

# Latent Dirichlet Allocation (LDA)

In [None]:
# Pull out the lyric text of each split for feature engineering
train_lyrics, test_lyrics = train["full_lyrics"], test["full_lyrics"]

In [None]:
# Remove common words and tokenize.
# The raw text is read from the `train` frame rather than from `train_lyrics`:
# this cell replaces `train_lyrics` (a Series of strings) with a list of token
# lists, so iterating over `train_lyrics` itself made a second run of the cell
# fail on `list.lower()`.
stoplist = set('for a of the and to in'.split())
train_lyrics = [[word for word in lyric.lower().split() if word not in stoplist]
                for lyric in train["full_lyrics"]]

In [None]:
# Drop every token that occurs only once across all training lyrics
from collections import defaultdict

# Count occurrences of each token over the whole tokenized training set
frequency = defaultdict(int)
for tok in (tok for song_tokens in train_lyrics for tok in song_tokens):
    frequency[tok] += 1

# Keep only the tokens seen more than once
recurring_tokens = {tok for tok, count in frequency.items() if count > 1}
train_lyrics = [[tok for tok in song_tokens if tok in recurring_tokens]
                for song_tokens in train_lyrics]

In [None]:
# Build the dictionary from the tokenized training lyrics
dictionary_path = os.path.join(TEMP_FOLDER, 'train.dict')
train_dict = corpora.Dictionary(train_lyrics)
# Store the dictionary on disk, for future reference
train_dict.save(dictionary_path)

In [None]:
# Build the corpus: one bag-of-words per tokenized training lyric
train_corpus = list(map(train_dict.doc2bow, train_lyrics))
# Store the corpus on disk (train.mm inside TEMP_FOLDER), for later use
corpus_path = os.path.join(TEMP_FOLDER, 'train.mm')
corpora.MmCorpus.serialize(corpus_path, train_corpus)

In [None]:
# Run LDA model with 20 topics.
# random_state seeds the model so the learned topics can be reproduced between
# runs (same seed value as the train/test split).
# NOTE(review): with several worker processes the result may still vary
# slightly from run to run - confirm against the gensim documentation.
lda_train = gensim.models.LdaMulticore(
    corpus=train_corpus,
    id2word=train_dict,
    num_topics=20,
    chunksize=20,
    passes=1,
    workers=4,
    minimum_probability=0.00001,
    random_state=42,
)

In [None]:
# Obtain LDA weights for all songs based on the trained model and convert them
# to a data frame (one row per song, one column per topic id).
# Lyrics are lower-cased before splitting so the tokens match the dictionary,
# which was built from lower-cased training lyrics; without this, capitalised
# words never matched a dictionary entry.
# NOTE(review): stop words and rare words are not filtered again here on the
# assumption that doc2bow skips tokens missing from the dictionary - confirm.
lda_weights = pd.DataFrame(
    [dict(lda_train[train_dict.doc2bow(lyric.lower().split())])
     for lyric in df.full_lyrics]
)

In [None]:
# Write the per-song topic weights to CSV so the modelling step can load them
lda_weights.to_csv(path_or_buf="lda_weights.csv")