In [63]:
#As always, we import everything
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import sklearn
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

warnings.filterwarnings('ignore')

In [2]:
# Folder, relative to the notebook, from which every input file is read.
data_dir = './data'

# README

In [64]:
# Load the dataframes

# Track metadata, indexed by track_id.
# os.path.join is given the directory and the file name separately so that it
# actually builds the path (a single pre-concatenated argument is a no-op).
year_artist_name_title_genre = pd.read_csv(
    os.path.join(data_dir, "year_artist_name_title_genre.csv")
).set_index('track_id')

# Both lyric files carry an 'Unnamed: 0' column that is discarded on load
# (presumably a row index written by an earlier to_csv -- TODO confirm).
train_lyrics_dataframe = pd.read_csv(os.path.join(data_dir, 'train_lyrics.csv')).drop('Unnamed: 0', axis=1)
msd_lyrics_dataframe = pd.read_csv(os.path.join(data_dir, 'msd_lyrics.csv')).drop('Unnamed: 0', axis=1)

msd_lyrics = msd_lyrics_dataframe['Lyrics']

train_lyrics = train_lyrics_dataframe['Lyrics']

# Training lyrics come first, MSD lyrics after them; later cells rely on this
# order when they slice the feature matrix.
frames = [train_lyrics, msd_lyrics]

# Single-column ('Lyrics') frame with a fresh 0..n-1 index.
final_lyrics = pd.concat(frames, ignore_index=True).to_frame()

In [62]:
def label_lyrics(row):
    """Translate the numeric ``row['labels']`` value into a class name.

    Args:
        row: any mapping-like object (e.g. a DataFrame row) exposing a
            ``'labels'`` entry.

    Returns:
        ``'Feminist'`` when the label equals 1, ``'Neutral'`` when it equals 0,
        and ``'Sexist'`` for every other value.
    """
    if row['labels'] == 1:
        return 'Feminist'
    return 'Neutral' if row['labels'] == 0 else 'Sexist'


# Attach the human-readable class name to every training row.
# The guard makes the cell safe to re-run: after the first run the numeric
# 'labels' column no longer exists (only 'Category' and 'Lyrics' are kept
# below), so calling label_lyrics again would raise a KeyError.
if 'labels' in train_lyrics_dataframe.columns:
    train_lyrics_dataframe['Category'] = train_lyrics_dataframe.apply(label_lyrics, axis=1)

col = ['Category', 'Lyrics']
# .copy() yields an independent frame, so columns added to it later are not
# written into a slice of the originally loaded frame.
train_lyrics_dataframe = train_lyrics_dataframe[col].copy()
train_lyrics_dataframe.head()

Unnamed: 0,Category,Lyrics
0,Feminist,Red One Sugababes Girls bring the fun of life ...
1,Feminist,I guess it was yourself you were involved with...
2,Feminist,Bill collectors at my door What can you do for...
3,Feminist,I ain't cooking all day (I ain't your mama!) I...
4,Feminist,All hands on deck All in front all in the back...
5,Feminist,Hit it This ain't no disco It ain't no country...
6,Feminist,He's home again from another day She smiles at...
7,Feminist,And now little lady if you'll kindly step up t...
8,Feminist,Every time that I sell myself to you I feel a ...
9,Feminist,(Treat him like a lady) Uh-huh Uh-huh I like c...


In [32]:
# Start from exactly the two text columns. Selecting them (instead of assigning
# `.columns`) keeps this cell re-runnable once 'category_id' has been added,
# and .copy() avoids writing into a slice of another frame.
train_lyrics_dataframe = train_lyrics_dataframe[['Category', 'Lyrics']].copy()

# Integer id per class, numbered in order of first appearance in the frame.
train_lyrics_dataframe['category_id'] = train_lyrics_dataframe['Category'].factorize()[0]

# One row per class, ordered by id; used for tick labels and lookups below.
category_id_df = train_lyrics_dataframe[['Category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Category']].values)
train_lyrics_dataframe.head()

Unnamed: 0,Category,Lyrics,category_id
0,Feminist,Red One Sugababes Girls bring the fun of life ...,0
1,Feminist,I guess it was yourself you were involved with...,0
2,Feminist,Bill collectors at my door What can you do for...,0
3,Feminist,I ain't cooking all day (I ain't your mama!) I...,0
4,Feminist,All hands on deck All in front all in the back...,0


In [None]:
# Create TF-IDF features over unigrams and bigrams for every lyric in
# `final_lyrics`.

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# Keep the sparse matrix returned by fit_transform instead of calling
# .toarray(): a dense (documents x n-grams) array of the whole corpus can
# exhaust memory, and slicing, chi2 and the scikit-learn estimators used in
# this notebook all accept sparse input.
features = tfidf.fit_transform(final_lyrics.Lyrics)
# Class ids of the labelled (training) rows only.
labels = train_lyrics_dataframe.category_id
features.shape

In [None]:

# Print, for each class, the N unigrams and bigrams with the highest chi2 score.
N = 10

# Only the first len(labels) rows of `features` are labelled training rows.
n_train = len(labels)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it. Built once, outside the loop.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.asarray(tfidf.get_feature_names())

for Category, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features[:n_train], labels == category_id)
    # Terms that never occur in the training rows get a NaN score, and argsort
    # places NaN last -- i.e. they would be reported as the "most correlated"
    # terms. Map NaN to 0 so they rank lowest instead.
    scores = np.nan_to_num(features_chi2[0])
    indices = np.argsort(scores)
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Category))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

In [None]:

# Compare several classifiers with CV-fold cross-validation on the labelled rows.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=42),
]
CV = 5

# Only the first len(labels) rows of `features` are labelled training rows.
n_train = len(labels)

# One (model_name, fold_idx, accuracy) record per model and fold.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features[:n_train], labels, scoring='accuracy', cv=CV)
    entries.extend(
        (model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies)
    )

# Build the results frame and draw the figure once, after all models have been
# scored (drawing inside the loop re-plotted earlier models on the same axes).
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

In [None]:
# Average cross-validation accuracy of each model.
cv_df.groupby('model_name')['accuracy'].mean()

In [None]:
# Hold out 20% of the labelled rows, fit a LinearSVC on the rest and plot the
# confusion matrix of its predictions on the held-out part.
model = LinearSVC()

# Only the first len(labels) rows of `features` are labelled training rows.
n_train = len(labels)

# The training frame's index is split alongside the data so the original rows
# of each partition can be looked up afterwards.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features[:n_train], labels, train_lyrics_dataframe.index,
    test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Passing the full list of class ids keeps the matrix aligned with the tick
# labels even if a class is absent from the held-out rows.
conf_mat = confusion_matrix(y_test, y_pred, labels=category_id_df.category_id.values)
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.Category.values, yticklabels=category_id_df.Category.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:

# Per-class precision / recall / F1 on the held-out rows.
# The class names come from category_id_df (ordered by category_id) and are
# paired with the matching ids via `labels`; `train_lyrics` is the Series of
# lyric texts and has no 'Category' entry.
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=list(category_id_df['category_id'].values),
    target_names=list(category_id_df['Category'].values),
))

In [None]:
# Refit the classifier on every labelled row and list, per class, the N
# unigrams and bigrams with the largest coefficients.

# Only the first len(labels) rows of `features` are labelled training rows.
n_train = len(labels)
model.fit(features[:n_train], labels)
N = 10

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it. Built once, outside the loop.
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.asarray(tfidf.get_feature_names())

for Category, category_id in sorted(category_to_id.items()):
    # Ascending sort; the lists below walk it in reverse to get the largest first.
    indices = np.argsort(model.coef_[category_id])
    feature_names = vocabulary[indices]
    unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
    print("# '{}':".format(Category))
    print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
    print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))

In [None]:
# Uncomment the line below if you want to recompute the predictions.

# DANGER! This may kill your kernel if you don't have enough memory.

# y_pred = model.predict(features[532:])

In [None]:
# Load the saved predictions: one value per line of all_y_preds.txt.
# The context manager closes the file handle once it has been read.
with open(data_dir + "/all_y_preds.txt", "r") as preds_file:
    y_pred = preds_file.read().split('\n')

# Splitting on '\n' leaves an empty last element when the file ends with a
# newline. Drop it only in that case, so a file without a trailing newline
# does not lose its final prediction.
if y_pred and y_pred[-1] == '':
    y_pred.pop()

# NOTE(review): only the first five characters of each line are parsed
# (presumably to cut a long float representation short -- confirm against the
# file format; a value needing more than five characters would be misread).
y_pred = [int(float(elem[:5])) for elem in y_pred]

len(y_pred)

In [None]:
# Store the loaded predictions next to the MSD lyrics and derive the class name
# of every row from them.
# NOTE(review): label_lyrics reads 1 as 'Feminist' and 0 as 'Neutral'; confirm
# that the values in y_pred use this encoding and not the factorized
# category_id the classifier was trained on.
msd_lyrics_dataframe['labels'] = y_pred

predicted_classes = msd_lyrics_dataframe.apply(label_lyrics, axis=1)
msd_lyrics_dataframe['Class'] = predicted_classes

msd_lyrics_dataframe.head()

In [None]:
# Join the track metadata with the classified MSD lyrics on (artist, title).
# - The metadata frame is named `year_artist_name_title_genre`.
# - It is indexed by track_id, and a column-on-column merge ignores the
#   indexes of both inputs; reset_index() turns track_id into a regular column
#   so that it survives the merge.
Final_with_track_id = pd.merge(
    year_artist_name_title_genre.reset_index(),
    msd_lyrics_dataframe,
    how='inner',
    left_on=['artist_name', 'title'],
    right_on=['Artists', 'Titles'],
)
Final_with_track_id.head()

In [None]:
# 'Artists' and 'Titles' were only needed as merge keys, and the lyric text is
# not kept in the exported table.
columns_to_drop = ['Artists', 'Titles', 'Lyrics']
Final_with_track_id = Final_with_track_id.drop(columns=columns_to_drop)
Final_with_track_id.shape

In [None]:
# Write the final table to a CSV file in the current working directory.
Final_with_track_id.to_csv(path_or_buf='Final_With_Track_ID.csv')