## Import Packages

In [1]:
from google.colab import (drive, files)
import pandas as pd
import numpy as np
import sklearn
import os

import collections
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection
import sklearn.preprocessing as preproc
from sklearn.feature_extraction import text

import sklearn.metrics as metrics


## Functions & Classes to help format the data

In [2]:
from __future__ import print_function
from __future__ import division

from collections import defaultdict, Counter

## Class to create a Vocabulary Object -- source: w266/common/Vocabulary.py
class Vocabulary(object):

    START_TOKEN = u"<s>"
    END_TOKEN   = u"</s>"
    UNK_TOKEN   = u"<unk>"

    def __init__(self, tokens, size=None,
                 progressbar=lambda l:l):
        """Create a Vocabulary object.
        Args:
            tokens: iterator( string )
            size: None for unlimited, or int > 0 for a fixed-size vocab.
                  Vocabulary size includes special tokens <s>, </s>, and <unk>
            progressbar: (optional) progress bar to wrap iterator.
        """
        self.unigram_counts = Counter()
        self.bigram_counts = defaultdict(lambda: Counter())
        prev_word = None
        for word in progressbar(tokens):  # Make a single pass through tokens
            self.unigram_counts[word] += 1
            self.bigram_counts[prev_word][word] += 1
            prev_word = word
        self.bigram_counts.default_factory = None  # make into a normal dict

        # Leave space for "<s>", "</s>", and "<unk>"
        top_counts = self.unigram_counts.most_common(None if size is None else (size - 3))
        vocab = ([self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN] +
                 [w for w,c in top_counts])

        # Assign an id to each word, by frequency
        self.id_to_word = dict(enumerate(vocab))
        self.word_to_id = {v:k for k,v in self.id_to_word.items()}
        self.size = len(self.id_to_word)
        if size is not None:
            assert(self.size <= size)

        # For convenience
        self.wordset = set(self.word_to_id.keys())

        # Store special IDs
        self.START_ID = self.word_to_id[self.START_TOKEN]
        self.END_ID = self.word_to_id[self.END_TOKEN]
        self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

    @property
    def num_unigrams(self):
        return len(self.unigram_counts)

    @property
    def num_bigrams(self):
        return len(self.bigram_counts)

    def __contains__(self, key):
        if isinstance(key, int):
            return (key > 0 and key < self.size)
        else:
            return key in self.word_to_id

    def words_to_ids(self, words):
        return [self.word_to_id.get(w, self.UNK_ID) for w in words]

    def ids_to_words(self, ids):
        return [self.id_to_word[i] for i in ids]

    def pad_sentence(self, words, use_eos=True):
        ret = [self.START_TOKEN] + words
        if use_eos:
          ret.append(self.END_TOKEN)
        return ret

    def sentence_to_ids(self, words, use_eos=True):
        return self.words_to_ids(self.pad_sentence(words, use_eos))

    def ordered_words(self):
        """Return a list of words, ordered by id."""
        return self.ids_to_words(range(self.size))

    def write_flat_file(self, filename):
        """Write the vocabulary list to a flat file."""
        ordered_words = self.ids_to_words(range(self.size))
        with open(filename, 'w') as fd:
            for word in ordered_words:
                fd.write(word + "\n")
        print("Vocabulary ({:,} words) written to '{:s}'".format(len(ordered_words),
                                                               filename))

    def write_projector_config(self, checkpoint_dir, tensor_name):
        """Write metadata for TensorBoard Embeddings Projector."""
        import os
        if not os.path.isdir(checkpoint_dir):
            os.mkdir(checkpoint_dir)
        metadata_file = os.path.join(checkpoint_dir, "metadata.tsv")
        self.write_flat_file(metadata_file)
        # Write projector config pb
        projector_config_file = os.path.join(checkpoint_dir,
                                             "projector_config.pbtxt")
        with open(projector_config_file, 'w') as fd:
            contents = """embeddings {
              tensor_name: "%s"
              metadata_path: "metadata.tsv"
            }""" % tensor_name
            fd.write(contents)
        print("Projector config written to {:s}".format(projector_config_file))


In [3]:
## Function to create a Sparse Bag of Words Matrix -- source: w266/common/utils.py
def id_lists_to_sparse_bow(id_lists, vocab_size):
    """Convert a list-of-lists-of-ids to a sparse bag-of-words matrix.
    Args:
        id_lists: (list(list(int))) list of lists of word ids
        vocab_size: (int) vocab size; must be greater than the largest word id
            in id_lists.
    Returns:
        (scipy.sparse.csr_matrix) where each row is a sparse vector of word
        counts for the corresponding example.
    """
    from scipy import sparse
    ii = []  # row indices (example ids)
    jj = []  # column indices (token ids)
    for row_id, ids in enumerate(id_lists):
        ii.extend([row_id]*len(ids))
        jj.extend(ids)
    x = sparse.csr_matrix((np.ones_like(ii), (ii, jj)),
                          shape=[len(id_lists), vocab_size])
    return x

# Import Data

In [6]:
drive.mount('/content/gdrive', force_remount=True)
path = "/content/gdrive"
os.chdir(path)

Mounted at /content/gdrive


In [7]:
## Get our cleaned data (from the DataCreation2.ipynb), which is stored in good_lyrics_data.csv
df = pd.read_csv('MyDrive/W266_Final_Project/good_lyrics_data.csv')
df

Unnamed: 0,Year,Yearly Rank,Title,Artist(s),Lyrics,Num Chars,Num Words,Decade
0,1960,2,"""Cathy's Clown""",The Everly Brothers,Cathy’s Clown Lyrics[Chorus] Don't want your l...,827,156,1960s
1,1960,8,"""Stuck on You""",Elvis Presley,Stuck on You Lyrics[Verse 1] You can shake an ...,1242,242,1960s
2,1960,9,"""The Twist""",Chubby Checker,The Twist Lyrics[Chorus:] Come on baby let's d...,754,147,1960s
3,1960,14,"""El Paso""",Marty Robbins,El Paso Lyrics[Verse 1] Out in the West Texas ...,2465,496,1960s
4,1960,15,"""Alley Oop""",The Hollywood Argyles,"Alley-Oop Lyrics[Intro] (Oop-oop, oop, oop-oop...",1859,299,1960s
...,...,...,...,...,...,...,...,...
3542,2021,94,"""Single Saturday Night""",Cole Swindell,Single Saturday Night Lyrics[Verse 1] I was ou...,2038,390,2020s
3543,2021,95,"""Things a Man Oughta Know""",Lainey Wilson,Things a Man Oughta Know Lyrics[Verse 1] I can...,1341,298,2020s
3544,2021,96,"""Throat Baby (Go Baby)""",BRS Kash,Throat Baby (Go Baby) Lyrics[Intro] (What's ha...,3042,615,2020s
3545,2021,97,"""Tombstone""",Rod Wave,"Tombstone Lyrics[Intro] Damn, this motherfucke...",2086,393,2020s


### Baseline Models


1.   Predict Majority Class
2.   Logistic Regression
3.   SVMs

### BERT Classification

1. On first 512 words
2. On just first verse
3. On just chorus
4. On first verse + chorus
5. Using BigBird/transformer extension




#### Other Thoughts
- The [Final Project FAQ](https://github.com/datasci-w266/2022-spring-main/blob/master/project/faq.md) suggests that we should focus on fewer models but more analysis of what's going on in the models
- k-fold cross-validation?
- word embeddings (BoW? TFIDF?)
- need to further clean the lyrics (ie. remove contractions, stopwords, etc.)? I kinda think NO b/c we want as much variation as possible



# Set up the Data

#### 4 steps to get text ready for the classifier (from [Assignment A4](https://github.com/datasci-w266/2022-spring-assignment-caseymcgon/blob/a7-submit/assignment/a4/Prelude.ipynb))

*   Tokenize the text into individual words (tokens)
*   Canonicalize the tokens
*   Convert the tokens to a sequence of integer IDs
*   (optional) Convert the IDs to a feature vector


#### Tokenization & Canonicalization
turn lyrics into a list of lower-cased words



In [8]:
df["tokens"] = df["Lyrics"].str.lower()
df["tokens"] = df["tokens"].str.split()
df

Unnamed: 0,Year,Yearly Rank,Title,Artist(s),Lyrics,Num Chars,Num Words,Decade,tokens
0,1960,2,"""Cathy's Clown""",The Everly Brothers,Cathy’s Clown Lyrics[Chorus] Don't want your l...,827,156,1960s,"[cathy’s, clown, lyrics[chorus], don't, want, ..."
1,1960,8,"""Stuck on You""",Elvis Presley,Stuck on You Lyrics[Verse 1] You can shake an ...,1242,242,1960s,"[stuck, on, you, lyrics[verse, 1], you, can, s..."
2,1960,9,"""The Twist""",Chubby Checker,The Twist Lyrics[Chorus:] Come on baby let's d...,754,147,1960s,"[the, twist, lyrics[chorus:], come, on, baby, ..."
3,1960,14,"""El Paso""",Marty Robbins,El Paso Lyrics[Verse 1] Out in the West Texas ...,2465,496,1960s,"[el, paso, lyrics[verse, 1], out, in, the, wes..."
4,1960,15,"""Alley Oop""",The Hollywood Argyles,"Alley-Oop Lyrics[Intro] (Oop-oop, oop, oop-oop...",1859,299,1960s,"[alley-oop, lyrics[intro], (oop-oop,, oop,, oo..."
...,...,...,...,...,...,...,...,...,...
3542,2021,94,"""Single Saturday Night""",Cole Swindell,Single Saturday Night Lyrics[Verse 1] I was ou...,2038,390,2020s,"[single, saturday, night, lyrics[verse, 1], i,..."
3543,2021,95,"""Things a Man Oughta Know""",Lainey Wilson,Things a Man Oughta Know Lyrics[Verse 1] I can...,1341,298,2020s,"[things, a, man, oughta, know, lyrics[verse, 1..."
3544,2021,96,"""Throat Baby (Go Baby)""",BRS Kash,Throat Baby (Go Baby) Lyrics[Intro] (What's ha...,3042,615,2020s,"[throat, baby, (go, baby), lyrics[intro], (wha..."
3545,2021,97,"""Tombstone""",Rod Wave,"Tombstone Lyrics[Intro] Damn, this motherfucke...",2086,393,2020s,"[tombstone, lyrics[intro], damn,, this, mother..."


#### Conversion to IDs
Create vocab dictionary, then extract IDs

In [9]:
## Flatten the series of lists of tokens into 1 single series of tokens
all_tokens = df["tokens"].apply(pd.Series).stack().reset_index(drop = True)

## Create the Vocabulary Dictionary {word : id}
vocab = Vocabulary(all_tokens, size=None)  # size=None means unlimited
print("Vocabulary size: {:,}".format(vocab.size))
print("Vocabulary dict: ", vocab.word_to_id)

## Store lists of all token IDs (will be useful when creating a feature vector)
x_ids = vocab.words_to_ids(all_tokens)

# Add column to df that has the ID of each token in each set of lyrics
df["x_ids"] = df["tokens"].apply(vocab.words_to_ids)
df


Vocabulary size: 38,803


Unnamed: 0,Year,Yearly Rank,Title,Artist(s),Lyrics,Num Chars,Num Words,Decade,tokens,x_ids
0,1960,2,"""Cathy's Clown""",The Everly Brothers,Cathy’s Clown Lyrics[Chorus] Don't want your l...,827,156,1960s,"[cathy’s, clown, lyrics[chorus], don't, want, ...","[20706, 3407, 624, 20, 45, 13, 17, 413, 112, 2..."
1,1960,8,"""Stuck on You""",Elvis Presley,Stuck on You Lyrics[Verse 1] You can shake an ...,1242,242,1960s,"[stuck, on, you, lyrics[verse, 1], you, can, s...","[848, 15, 3, 108, 64, 3, 38, 249, 198, 3776, 1..."
2,1960,9,"""The Twist""",Chubby Checker,The Twist Lyrics[Chorus:] Come on baby let's d...,754,147,1960s,"[the, twist, lyrics[chorus:], come, on, baby, ...","[5, 1073, 6591, 52, 15, 57, 165, 35, 5, 1073, ..."
3,1960,14,"""El Paso""",Marty Robbins,El Paso Lyrics[Verse 1] Out in the West Texas ...,2465,496,1960s,"[el, paso, lyrics[verse, 1], out, in, the, wes...","[1790, 9569, 108, 64, 58, 12, 5, 1099, 7813, 4..."
4,1960,15,"""Alley Oop""",The Hollywood Argyles,"Alley-Oop Lyrics[Intro] (Oop-oop, oop, oop-oop...",1859,299,1960s,"[alley-oop, lyrics[intro], (oop-oop,, oop,, oo...","[15400, 203, 20723, 1884, 3408, 3409, 1884, 18..."
...,...,...,...,...,...,...,...,...,...,...
3542,2021,94,"""Single Saturday Night""",Cole Swindell,Single Saturday Night Lyrics[Verse 1] I was ou...,2038,390,2020s,"[single, saturday, night, lyrics[verse, 1], i,...","[871, 1380, 94, 108, 64, 4, 60, 58, 809, 2943,..."
3543,2021,95,"""Things a Man Oughta Know""",Lainey Wilson,Things a Man Oughta Know Lyrics[Verse 1] I can...,1341,298,2020s,"[things, a, man, oughta, know, lyrics[verse, 1...","[164, 8, 131, 3923, 25, 108, 64, 4, 38, 2530, ..."
3544,2021,96,"""Throat Baby (Go Baby)""",BRS Kash,Throat Baby (Go Baby) Lyrics[Intro] (What's ha...,3042,615,2020s,"[throat, baby, (go, baby), lyrics[intro], (wha...","[2414, 57, 1755, 311, 203, 2890, 14852, 8951, ..."
3545,2021,97,"""Tombstone""",Rod Wave,"Tombstone Lyrics[Intro] Damn, this motherfucke...",2086,393,2020s,"[tombstone, lyrics[intro], damn,, this, mother...","[9565, 203, 1848, 41, 4057, 106, 739, 38727, 5..."


#### Create a Feature Vector from the IDs

Note: probably need to motivate why we'er using a BOW matrix & not a TF-IDF matrix...



In [10]:
## Create the sparse matrix
feature_matrix = id_lists_to_sparse_bow(df["x_ids"], vocab.size)
feature_matrix

<3547x38803 sparse matrix of type '<class 'numpy.longlong'>'
	with 454340 stored elements in Compressed Sparse Row format>

In [11]:
## Turn the feature matrix into a pandas DF & add the yvalue (Decade)
full_matrix = pd.DataFrame.sparse.from_spmatrix(feature_matrix)
full_matrix["Decade"] = df["Decade"]
full_matrix

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38794,38795,38796,38797,38798,38799,38800,38801,38802,Decade
0,0,0,0,5,6,0,1,2,3,2,...,0,0,0,0,0,0,0,0,0,1960s
1,0,0,0,12,2,6,7,2,5,1,...,0,0,0,0,0,0,0,0,0,1960s
2,0,0,0,2,0,7,2,8,0,1,...,0,0,0,0,0,0,0,0,0,1960s
3,0,0,0,0,28,21,10,11,9,5,...,0,0,0,0,0,0,0,0,0,1960s
4,0,0,0,1,0,9,1,4,11,0,...,0,0,0,0,0,0,0,0,0,1960s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3542,0,0,0,5,14,9,4,7,17,8,...,0,0,0,0,0,0,0,0,0,2020s
3543,0,0,0,6,17,2,13,9,30,0,...,0,0,0,0,0,0,0,0,0,2020s
3544,0,0,0,38,26,16,13,3,13,4,...,0,0,0,0,0,0,0,0,0,2020s
3545,0,0,0,3,14,13,3,1,5,8,...,0,0,0,0,0,0,0,0,0,2020s


### Train / Test Split & Separate Y values

In [12]:
## Create Train/Val/Test Split (in 2 steps)
train, rem = sklearn.model_selection.train_test_split(full_matrix, train_size = 0.7, random_state=42)
val, test = sklearn.model_selection.train_test_split(rem, train_size = 0.5, random_state = 43)

print("Train Shape: ", train.shape)
print("Val Shape:   ", val.shape)
print("Test Shape:  ", test.shape)

Train Shape:  (2482, 38804)
Val Shape:    (532, 38804)
Test Shape:   (533, 38804)


In [13]:
## Split out X and Y for our Data
train_x, train_y = train.drop('Decade', axis=1), train["Decade"]
val_x, val_y = val.drop('Decade', axis=1), val["Decade"]
test_x,  test_y  = test.drop('Decade', axis=1), test["Decade"]
print("train_x Shape:", train_x.shape, "train_y Shape:", train_y.shape)
print("val_x Shape:", val_x.shape, "val_y Shape:", train_y.shape)
print("test_x Shaep: ", test_x.shape, "test_y Shape: ", test_y.shape)

train_x Shape: (2482, 38803) train_y Shape: (2482,)
val_x Shape: (532, 38803) val_y Shape: (2482,)
test_x Shaep:  (533, 38803) test_y Shape:  (533,)


# Baseline Models

### Predict Majority Class

In [14]:
## Find the most common decade (ie. the majority class)
print(train["Decade"].value_counts())

majority = train["Decade"].value_counts().index[0]

1990s    473
2000s    460
1980s    457
2010s    419
1970s    353
1960s    247
2020s     73
Name: Decade, dtype: int64


In [15]:
train["Maj_class"] = majority
val["Maj_class"] = majority
test["Maj_class"] = majority
train.head()

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38795,38796,38797,38798,38799,38800,38801,38802,Decade,Maj_class
944,0,0,0,19,0,41,18,12,0,0,...,0,0,0,0,0,0,0,0,1980s,1990s
199,0,0,0,0,4,10,4,8,2,3,...,0,0,0,0,0,0,0,0,1960s,1990s
3351,0,0,0,39,52,24,13,10,3,8,...,0,0,0,0,0,0,0,0,2010s,1990s
2276,0,0,0,46,39,9,11,17,16,4,...,0,0,0,0,0,0,0,0,2000s,1990s
801,0,0,0,7,0,7,2,4,56,2,...,0,0,0,0,0,0,0,0,1970s,1990s


In [16]:
acc_maj_class_train = metrics.accuracy_score(train["Decade"], train["Maj_class"])
acc_maj_class_val = metrics.accuracy_score(val["Decade"], val["Maj_class"])
acc_maj_class_test = metrics.accuracy_score(test["Decade"], test["Maj_class"])

print("Accuracy for Training Set -- Majority Classifier: ", acc_maj_class_train)
print("Accuracy for Val Set -- Majority Classifier: ", acc_maj_class_val)
print("Accuracy for Test Set -- Majority Classifier: ", acc_maj_class_test)

Accuracy for Training Set -- Majority Classifier:  0.19057211925866238
Accuracy for Val Set -- Majority Classifier:  0.18421052631578946
Accuracy for Test Set -- Majority Classifier:  0.18761726078799248


### Logistic Regression

Note: read [this stackoverflow post](https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-definitions/52388406#52388406), which encourages using the liblinear solver for large datasets (which ours certainly is!)

In [25]:
logReg = LogisticRegression(solver = 'liblinear', max_iter=100)
logReg.fit(train_x, train_y)

LogisticRegression(solver='liblinear')

In [26]:
train_y_pred = logReg.predict(train_x)
val_y_pred = logReg.predict(val_x)

In [27]:
acc_logReg_train = metrics.accuracy_score(train_y, train_y_pred)
acc_logReg_val = metrics.accuracy_score(val_y, val_y_pred)
acc_logReg_test = metrics.accuracy_score(test["Decade"], test["Maj_class"])

print("Accuracy for Training Set -- Logistic Regression: ", acc_logReg_train)
print("Accuracy for Val Set -- Logistic Regression: ", acc_logReg_val)
print("Accuracy for Test Set -- Logistic Regression: ", acc_logReg_test)

Accuracy for Training Set -- Logistic Regression:  0.9943593875906527
Accuracy for Val Set -- Logistic Regression:  0.3609022556390977
Accuracy for Test Set -- Logistic Regression:  0.18761726078799248


In [20]:
## Save for when we've finished fine-tuning on train/val
#test_y_pred = logReg.predict(test_x)
#acc_logReg_test = metrics.accuracy_score(test_y, test_y_pred)

### Support Vector Machines (SVMs)

In [21]:
train_y

944     1980s
199     1960s
3351    2010s
2276    2000s
801     1970s
        ...  
1130    1980s
1294    1980s
860     1970s
3507    2020s
3174    2010s
Name: Decade, Length: 2482, dtype: object

In [28]:
from sklearn import svm

svm_model = svm.SVC(kernel='linear', C=1, gamma=1, max_iter=100,)
svm_model.fit(train_x.to_numpy(), train_y.to_numpy())

train_y_pred = svm_model.predict(train_x.to_numpy())
val_y_pred = svm_model.predict(val_x.to_numpy())

acc_svm_train = metrics.accuracy_score(train_y, train_y_pred)
acc_svm_val = metrics.accuracy_score(val_y, val_y_pred)
acc_svm_test = metrics.accuracy_score(test["Decade"], test["Maj_class"])

print("Accuracy for Training Set -- SVMs: ", acc_svm_train)
print("Accuracy for Val Set -- SVMs: ", acc_svm_val)
print("Accuracy for Test Set -- SVMs: ", acc_svm_test)



Accuracy for Training Set -- SVMs:  0.47139403706688154
Accuracy for Val Set -- SVMs:  0.2800751879699248
Accuracy for Test Set -- SVMs:  0.18761726078799248
