In [1]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn import cross_validation
from sklearn.pipeline import Pipeline



In [2]:
PATH = 'train'

In [29]:
X = []
y = []
labels = ['logic', 'tyler', 'kendrick']

for label in os.listdir(PATH):
    if label != ".DS_Store" and label in labels:
        classPath = PATH + '/' + label
        for song in os.listdir(classPath):
            if song != ".DS_Store":
                songPath = classPath + '/' + song
                with open(songPath, "r") as f:
                    X.append(f.read())
                y.append(label)

X = np.array(X)
y = np.array(y)

In [23]:
tfidf = TfidfVectorizer(stop_words='english', min_df=1, ngram_range=(1,3))
tfidf.fit_transform(X, y)
tfidf.idf_.shape

(69895,)

In [24]:
TESTPATH = 'test'

X_test = []
y_test = []
labels = ['logic', 'tyler', 'kendrick']

for label in os.listdir(TESTPATH):
    if label != ".DS_Store" and label in labels:
        classPath = TESTPATH + '/' + label
        for song in os.listdir(classPath):
            if song != ".DS_Store":
                songPath = classPath + '/' + song
                with open(songPath, "r") as f:
                    X_test.append(f.read())
                y_test.append(label)
                
X_test = np.array(X_test)
y_test = np.array(y_test)
y_test

array(['tyler', 'tyler', 'tyler', 'tyler', 'tyler', 'tyler', 'tyler',
       'tyler', 'tyler', 'logic', 'logic', 'logic', 'logic', 'logic',
       'logic', 'logic', 'logic', 'logic', 'kendrick', 'kendrick',
       'kendrick', 'kendrick', 'kendrick', 'kendrick', 'kendrick',
       'kendrick'], 
      dtype='<U8')

In [25]:
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
clf = MLPClassifier(hidden_layer_sizes=(256, 128, 128, 64), learning_rate_init=.001, max_iter=1000, activation="tanh")

model = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english', min_df=1, ngram_range=(1,2))),
    ('classifier', clf,)
])
model.fit(X, y)

Pipeline(steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=Tr...=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))])

In [26]:
count = 0 
prediction = model.predict(X_test)
for song, y, yhat in zip(X_test, y_test, prediction):
    if y == yhat:
        count += 1
    else:
        print(y, yhat, song)
count/len(y_test)

kendrick tyler      Smoking out, pouring up, keep that lean up in my cup All my car got leather and wood in my hood we call it buck Everybody wanna ball, holla at broads at the mall If he up, watch him fall, I can't fuck with y'all (Pussy ass ho niggas) I can't fuck with y'all (Bitches all up in my business) I can't fuck with y'all (Industry of counterfeits) I can't fuck with y'all   Taking off when you landing Bitch niggas gonna throw tantrums And I'm dancing on them stars The galaxy ain't got room for y'all Ain't nothing gonna happen soon for y'all While I'm here and every day I hear Your bullshit, self-pity Reason why you never dealt with me Reason why your girl dealt with me Hands up, in the building, we get busy and say R.I.P. Aaliyah, R.I.P., yep, R.I.P. Aaliyah, R.I.P., yep That's exactly what this sound like A to the A to the L-I-Y-A-H, give it up 2 times Then give it right back, don't blow my high   Smoking out, pouring up, keep that lean up in my cup All my car got leather an

0.8846153846153846

In [30]:
score = cross_validation.cross_val_score(model, X, y, cv=3)
np.mean(score)

0.83748353096179173

In [27]:
#Tune hyperparameters 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
cvals = [.0001]
best = 0 
best_c = 0
for c in cvals:
    model = Pipeline([
        ('vectorizer', CountVectorizer(stop_words='english', min_df=1, ngram_range=(1,2))),
        ('transformer', TfidfTransformer()),
        ('classifier', SGDClassifier(alpha=c)),
    ])
    model.fit(X, y)
    scores = cross_validation.cross_val_score(model, X_test, y_test, cv=3)
    score = np.mean(scores)
    print(score)
    if score > best:
        best = score
        best_c = c
print(best, best_c)


0.699074074074
0.699074074074 0.0001


In [96]:
from nltk.corpus import stopwords
from collections import Counter
import re
labels = ['kendrick', 'logic', 'tyler']

stops = list(stopwords.words('english'))

for label in os.listdir(PATH):
    if label != ".DS_Store" and label in labels:
        classPath = PATH + '/' + label
        cv = CountVectorizer(stop_words='english', min_df=1, ngram_range=(1,2))
        songs = []
        for song in os.listdir(classPath):
            if song != ".DS_Store":
                songPath = classPath + '/' + song
                with open(songPath, "r") as f:
                    lyrics = f.read().split(" ")
                    lyrics = [re.sub(r'[^\w]', ' ', word.lower()).replace(" ", "") for word in lyrics]
                    songs += (lyrics)

        classPath = TESTPATH + '/' + label
        for song in os.listdir(classPath):
            if song != ".DS_Store":
                songPath = classPath + '/' + song
                with open(songPath, "r") as f:
                    lyrics = f.read().split(" ")
                    lyrics = [re.sub(r'[^\w]', ' ', word.lower()).replace(" ", "") for word in lyrics]
                    songs += (lyrics)

#         dtm = cv.fit_transform(songs).toarray()
#         vocab = np.array(cv.get_feature_names())
        counts = Counter(songs)
        remove = []
        for word in counts:
            if word in stops:
                remove.append(word)
        remove += ["im", 'that', "its", "me", ""]
        for word in remove:
            del counts[word]
        print(label)
        print(counts)


tyler
Counter({'fuck': 335, 'like': 275, 'nigga': 232, 'fucking': 214, 'dont': 212, 'got': 195, 'shit': 182, 'bitch': 149, 'cause': 142, 'know': 136, 'get': 129, 'youre': 129, 'yeah': 122, 'time': 118, 'niggas': 109, 'thats': 109, 'see': 107, 'aint': 97, 'back': 91, 'go': 83, 'golf': 80, 'call': 80, 'take': 74, 'want': 70, 'boy': 70, 'dick': 70, 'man': 70, 'find': 67, 'one': 67, 'em': 64, 'wang': 64, 'oh': 61, 'wolf': 60, 'fuckin': 59, 'wanna': 59, 'never': 58, 'love': 53, 'keep': 51, 'girl': 51, 'cant': 51, 'could': 49, 'think': 48, 'say': 48, 'feel': 48, 'ill': 47, 'couple': 46, 'let': 46, 'watch': 45, 'suck': 45, 'come': 43, 'look': 43, 'getting': 43, 'said': 43, 'give': 43, 'yall': 42, 'gotta': 42, 'make': 42, 'good': 42, 'gang': 41, 'tyler': 41, 'even': 41, 'tell': 40, 'bitches': 38, 'lets': 38, 'really': 38, 'la': 37, 'right': 34, 'hope': 34, 'life': 34, 'kill': 34, 'need': 33, 'radical': 32, 'well': 32, 'us': 32, 'new': 31, 'around': 29, 'little': 28, 'ass': 28, 'os': 28, 'run':