In [63]:
import pandas as pd
from porter_stemmer import PorterStemmer
import re, os
from itertools import islice

train_feature_file = './out/train.data'
test_feature_file = './out/test.data'
stop_words = set([x.strip() for x in open("stopwords.english", encoding="ISO-8859-1").readlines()])
stemmer = PorterStemmer()

In [64]:
def window(seq, n=2):
    "Returns a sliding window (of width n) over data from the iterable"
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result    
    for elem in it:
        result = result[1:] + (elem,)
        yield result

# from previous assignments
def protect_meta_characters (text):
    return text.replace(",", "COMMA").replace("=","EQUALS")

def makefeat(attribute, value):
    return attribute+"="+protect_meta_characters(value)

# custom lyrics tokenization for Genius formatting
def tokenize(lyrics):
    tokens = []
    # return tokenized lyrics
    lines = lyrics.splitlines()
    for line in lines:
        if line:
            # ignore verse and intro markers etc
            if line[0] == '[' and line[-1] == ']':
                continue
            tokens.extend([protect_meta_characters(stemmer.stem_token(word.lower())) for word in re.split('[ .,?!]', line) if word.lower() not in stop_words])
    return tokens

# takes a song with lyrics and genre and creates a feature set
def generate_features(lyrics, genre, train=False):
    features = []
    tokens = tokenize(lyrics)

    # unigrams
    features.extend([makefeat('word', token) for token in tokens])

    bigrams = window(tokens, 2)
    trigrams = window(tokens, 3)

    for bigram in bigrams:
        features.append(makefeat('bigram', str(bigram)))

    for trigram in trigrams:
        features.append(makefeat('trigram', str(trigram)))

    apostrophe_count = 0
    comma_count = 0
    period_count = 0
    exclamation_count = 0
    question_count = 0
    semicolon_count = 0
    colon_count = 0
    hyphen_count = 0
    digit_count = 0
    Xx_count = 0
    XX_count = 0

    line_lengths = []
    unique_line_lengths = []

    word_lengths = []

    for line in lyrics.split('\n'):

        line_lengths.append(len(line.split()))

        unique_words = []
        w = line.split()
        if w not in unique_words:
            unique_words.append(w)
        unique_line_lengths.append(len(unique_words))

        for word in line:
            if word[0].isupper():
                if word.isupper():
                    XX_count += 1
                else:
                    Xx_count += 1
            word_lengths.append(len(word))

    avg_line_length = sum(line_lengths) / len(line_lengths)
    unique_avg_line_length = sum(unique_line_lengths) / len(unique_line_lengths)
    avg_word_length = sum(word_lengths) / len(word_lengths)

    for token in tokens:
        if token == "'":
            apostrophe_count += 1
        if token == ",":
            comma_count += 1
        if token == ".":
            period_count += 1
        if token == "!":
            exclamation_count += 1
        if token == "?":
            question_count += 1
        if token == ":":
            colon_count += 1
        if token == ";":
            semicolon_count += 1
        if token == "-":
            hyphen_count += 1
        if token.isdigit():
            digit_count += 1


    # add features for positive/negative words, positive and negative numbers, numbers of capitalized words
    features.append("apostrophe="+str(apostrophe_count))
    features.append("comma="+str(comma_count))
    features.append("period="+str(period_count))
    features.append("exclamation="+str(exclamation_count))
    features.append("question="+str(question_count))
    features.append("colon="+str(colon_count))
    features.append("semicolon="+str(semicolon_count))
    features.append("hyphen="+str(hyphen_count))

    features.append("digit="+str(digit_count))

    features.append("average_line_length"+str(avg_line_length))
    features.append("unique_avg_line_length"+str(unique_avg_line_length))
    features.append("avg_word_length"+str(avg_word_length))

    features.append("Xx_count="+str(Xx_count))
    features.append("XX_count="+str(XX_count))
    
    # add label
    features.append(genre)

    # write features to out file
    with open(train_feature_file if train else test_feature_file, 'a') as f:
        f.write(','.join(features) + '\n')




In [2]:
# load data
hiphop_df = pd.read_pickle('./hiphop_df')
pop_df = pd.read_pickle('./pop_df')

In [67]:
# erase old features
open(train_feature_file, 'w').close()
open(test_feature_file, 'w').close()

ratio = 90

# create features for training and test datasets
hiphop_df.iloc[:ratio].apply(axis=1, func=lambda row: generate_features(row['lyrics'], 'hiphop', True))
pop_df.iloc[:ratio].apply(axis=1, func=lambda row: generate_features(row['lyrics'], 'pop', True))

hiphop_df.iloc[ratio:].apply(axis=1, func=lambda row: generate_features(row['lyrics'], 'hiphop'))
pop_df.iloc[ratio:].apply(axis=1, func=lambda row: generate_features(row['lyrics'], 'pop'))




4     None
6     None
7     None
10    None
12    None
13    None
14    None
15    None
0     None
2     None
3     None
6     None
9     None
11    None
dtype: object

In [40]:
# evaluate songs 
print(tokenize(pop_df.iloc[0]['lyrics']))

['ember', 'stai', 'breez', 'feel', 'element', 'remind', 'beauti', 'bleak', 'stuck', 'long', 'light', 'breath', '', '', 'love', '', 'feel', 'time', 'jump', 'real', 'scar', 'heal', '', 'wave', 'break', 'boat', 'wave', 'brеak', 'boat', 'stone', 'crash', 'boardwalk', 'thе', 'wind', 'rush', 'tree', 'ey', 'peel', 'memori', 'fall', 'short', "could'v", 'left', 'long', 'call', '', '', 'love', '', 'feel', 'time', 'jump', 'real', 'scar', 'heal', '', 'wave', 'break', 'boat', 'wave', 'break', 'boat', 'wave', 'break', 'boat', 'wave', 'break', 'boat', 'love', '', 'feel', 'time', 'jump', 'real', 'scar', 'heal', '', 'wave', 'break', 'boat']
