In [1]:
import pandas as pd
import numpy as np
import string
import warnings
import math
import nltk
import json
from langdetect import detect
from libsvm import *
from nltk.stem import PorterStemmer
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV
from IPython.display import display

from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import svm
import dask.dataframe as dd
import multiprocessing
import swifter
# Silence every Python warning for the rest of the session (this also hides
# pandas and scikit-learn deprecation warnings).
warnings.filterwarnings('ignore')
# Fetch the NLTK 'stopwords' and 'words' corpora; the recorded output below
# shows both were already up to date.
nltk.download('stopwords')
nltk.download('words')
from spellchecker import SpellChecker

# Spell-checker instance. NOTE(review): no cell visible in this section
# references `spell` — confirm whether it is still needed.
spell = SpellChecker()

# pd.set_option('display.max_colwidth', None)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anishajauhari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/anishajauhari/nltk_data...
[nltk_data]   Package words is already up-to-date!


## Dataset

In [2]:
# Load the rock/pop lyrics table and replace missing lyrics with an empty
# string so the vectorizers below always receive text.
rock_pop_dataset = pd.read_csv("../../data/rockpopdataset.csv")
# Assign the result back instead of calling fillna(inplace=True) on the column:
# the chained in-place form operates on a temporary object and no longer
# updates the frame when pandas Copy-on-Write is active.
rock_pop_dataset["filtered_lyrics"] = rock_pop_dataset["filtered_lyrics"].fillna("")


## Term Frequency–Inverse Document Frequency (TF-IDF) Features

In [6]:
# TF-IDF weights for word tokens; terms must occur in at least 5 songs and in
# no more than 95% of songs to be kept in the vocabulary.
tf_vec = TfidfVectorizer(
    analyzer="word",
    min_df=5,
    max_df=0.95,
)
# Dense (songs x terms) matrix of TF-IDF weights.
X = tf_vec.fit_transform(
    rock_pop_dataset["filtered_lyrics"]
).toarray()

Empty DataFrame
Columns: [Unnamed: 0, song, year, artist, genre, lyrics, filtered_lyrics]
Index: []


In [9]:
# Label the TF-IDF matrix columns with their vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
tfidf_df = pd.DataFrame(
    X,
    columns=list(
        tf_vec.get_feature_names_out()
        if hasattr(tf_vec, "get_feature_names_out")
        else tf_vec.get_feature_names()
    ),
)

In [10]:
# Rank vocabulary columns by their total TF-IDF weight over all songs,
# heaviest first.
importance = np.argsort(np.asarray(X.sum(axis=0)).ravel())[::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
tfidf_feature_names = np.asarray(
    tf_vec.get_feature_names_out()
    if hasattr(tf_vec, "get_feature_names_out")
    else tf_vec.get_feature_names()
)
# Names of the 100 / 200 / 500 highest-weighted terms.
tf_idf_100, tf_idf_200, tf_idf_500 = (
    tfidf_feature_names[importance[:top_k]] for top_k in (100, 200, 500)
)

In [11]:
def _top_tfidf_features(top_terms):
    """Return the TF-IDF columns named in ``top_terms`` plus an ``index`` column.

    ``.assign`` builds a new frame, so the extra column is never written onto
    a slice of ``tfidf_df`` (the pattern behind SettingWithCopyWarning).
    The ``index`` column holds the row labels of ``rock_pop_dataset``.
    """
    selected = tfidf_df.loc[:, tfidf_df.columns.isin(top_terms)]
    return selected.assign(index=rock_pop_dataset.index)


tfidf_df_100 = _top_tfidf_features(tf_idf_100)
tfidf_df_200 = _top_tfidf_features(tf_idf_200)
tfidf_df_500 = _top_tfidf_features(tf_idf_500)

# tfidf_df_100["misspelled_words"] = rock_pop_dataset["misspelled_words"]
# tfidf_df_200["misspelled_words"] = rock_pop_dataset["misspelled_words"]
# tfidf_df_500["misspelled_words"] = rock_pop_dataset["misspelled_words"]

# tfidf_df_100["unique_words"] = rock_pop_dataset["unique_words"]
# tfidf_df_200["unique_words"] = rock_pop_dataset["unique_words"]
# tfidf_df_500["unique_words"] = rock_pop_dataset["unique_words"]
# #'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness','acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo'
# tfidf_df_100["slang_words"] = rock_pop_dataset["slang_words"]
# tfidf_df_200["slang_words"] = rock_pop_dataset["slang_words"]
# tfidf_df_500["slang_words"] = rock_pop_dataset["slang_words"]


In [12]:
# Write each TF-IDF feature table to its CSV file.
for tfidf_path, tfidf_frame in (
    ("../../data/tfidf_100", tfidf_df_100),
    ("../../data/tfidf_200", tfidf_df_200),
    ("../../data/tfidf_500", tfidf_df_500),
):
    tfidf_frame.to_csv(tfidf_path)

## Information Gain Features

In [3]:
# Raw word counts per song; terms must occur in at least 5 songs and in no
# more than 95% of songs to be kept in the vocabulary.
vectorizer = CountVectorizer(
    analyzer="word",
    min_df=5,
    max_df=0.95,
)
# Dense (songs x terms) matrix of term counts.
X = vectorizer.fit_transform(
    rock_pop_dataset["filtered_lyrics"]
).toarray()

In [4]:
# One row per vocabulary term, plus the count matrix relabelled with those terms.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
vocabulary_terms = list(
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
feature_terms = pd.DataFrame(vocabulary_terms, columns=["Term"])
X = pd.DataFrame(X, columns=vocabulary_terms)
# X

In [5]:
# Filtered lyrics of every song, split by genre label.
rock = rock_pop_dataset.loc[rock_pop_dataset["genre"].eq("Rock"), "filtered_lyrics"]
pop = rock_pop_dataset.loc[rock_pop_dataset["genre"].eq("Pop"), "filtered_lyrics"]


In [6]:
# feature_terms["rock"] = feature_terms["Term"].swifter.apply(lambda a : sum(rock.str.contains(a)))
# feature_terms["pop"] = feature_terms["Term"].swifter.apply(lambda a : sum(pop.str.contains(a)))

In [7]:
# Per-genre document frequency of every vocabulary term: the number of songs
# in which the term's count is non-zero.
#
# The counts are read from the count matrix instead of running
# `str.contains(term)` over the raw lyrics. `str.contains` performs a
# case-sensitive regex *substring* search (e.g. "art" also matches "heart"),
# so its counts do not describe the word-level features stored in X, and it
# needs one full regex scan of the corpus per term and genre.
#
# Rows of X follow the row order of rock_pop_dataset and its columns follow
# the row order of feature_terms, so positional alignment is used throughout.
term_present = np.asarray(X) != 0
song_genre = rock_pop_dataset["genre"].to_numpy()
feature_terms["rock"] = term_present[song_genre == "Rock"].sum(axis=0)
feature_terms["pop"] = term_present[song_genre == "Pop"].sum(axis=0)


In [8]:
# feature_terms["rock"] = feature_terms["Term"].apply(lambda a : sum(rock.str.contains(a)))

In [9]:
# feature_terms["pop"] = feature_terms["Term"].apply(lambda a : sum(pop.str.contains(a)))

In [10]:
# feature_terms

In [11]:
# Rock songs that do NOT contain the term. The class size is taken from the
# data rather than a hard-coded count, so it stays correct if the dataset changes.
feature_terms["not_rock"] = len(rock) - feature_terms["rock"]

In [12]:
# Pop songs that do NOT contain the term. The class size is taken from the
# data rather than a hard-coded count, so it stays correct if the dataset changes.
feature_terms["not_pop"] = len(pop) - feature_terms["pop"]

In [13]:
def informationGain(term, entropy):
    """Information gain of the rock/pop label from knowing whether a term occurs.

    Parameters
    ----------
    term : mapping
        Any object indexable by the keys ``"rock"``, ``"pop"``, ``"not_rock"``
        and ``"not_pop"`` (a DataFrame row or a dict). The values are the
        numbers of rock / pop songs that contain the term and that do not.
    entropy : float
        Entropy, in bits, of the genre label before splitting on the term.

    Returns
    -------
    float
        ``entropy`` minus the entropy of the genre label conditioned on the
        presence / absence of the term. Returns ``0.0`` when all four counts
        are zero.

    Notes
    -----
    The corpus size is the sum of the four counts rather than a fixed
    constant, and a side of the split that holds no songs contributes zero
    entropy instead of triggering a division by zero.
    """

    def _label_entropy(first_count, second_count):
        # Entropy (bits) of a two-class count pair; an empty pair has none.
        pair_total = first_count + second_count
        if pair_total == 0:
            return 0.0
        bits = 0.0
        for class_count in (first_count, second_count):
            share = class_count / pair_total
            if share != 0:  # 0 * log2(0) is taken as 0
                bits -= share * math.log2(share)
        return bits

    with_term = term["rock"] + term["pop"]
    without_term = term["not_rock"] + term["not_pop"]
    n_songs = with_term + without_term
    if n_songs == 0:
        # No songs at all: there is nothing to split, hence nothing to gain.
        return 0.0

    # Weight the entropy of each side of the split by its share of the corpus.
    final_entropy = (
        (with_term / n_songs) * _label_entropy(term["rock"], term["pop"])
        + (without_term / n_songs) * _label_entropy(term["not_rock"], term["not_pop"])
    )

    return entropy - final_entropy

In [14]:
# Entropy (bits) of the genre label over the whole corpus. Class sizes are
# taken from the rock / pop lyric series rather than hard-coded counts.
class_sizes = (len(rock), len(pop))
n_songs = sum(class_sizes)
entropy = -sum(
    (size / n_songs) * math.log2(size / n_songs)
    for size in class_sizes
    if size  # an empty class contributes no entropy
)
# Score every vocabulary term by how much knowing its presence reduces that entropy.
feature_terms["info_gain"] = feature_terms.apply(
    lambda term_row: informationGain(term_row, entropy), axis=1
)

In [15]:
# Most informative terms first.
feature_terms = feature_terms.sort_values("info_gain", ascending=False)
# feature_terms

In [16]:
# feature_terms[:100]["Term"]

In [17]:
# Terms with the 100 / 200 / 500 highest information gain
# (feature_terms is already sorted, best first).
ig_terms_100 = feature_terms.head(100)["Term"]
ig_terms_200 = feature_terms.head(200)["Term"]
ig_terms_500 = feature_terms.head(500)["Term"]

# Count-matrix columns restricted to each of those term sets.
keep_100 = X.columns.isin(ig_terms_100)
keep_200 = X.columns.isin(ig_terms_200)
keep_500 = X.columns.isin(ig_terms_500)
ig_100 = X.loc[:, keep_100]
ig_200 = X.loc[:, keep_200]
ig_500 = X.loc[:, keep_500]

In [18]:
# ig_500

In [19]:
# Attach the dataset's row labels as an "index" column. `.assign` returns a
# new frame, so nothing is written onto a slice of X (the pattern behind
# SettingWithCopyWarning).
ig_100 = ig_100.assign(index=rock_pop_dataset.index)
ig_200 = ig_200.assign(index=rock_pop_dataset.index)
ig_500 = ig_500.assign(index=rock_pop_dataset.index)


In [20]:
# Write each information-gain feature table to its CSV file.
for ig_path, ig_frame in (
    ("../../data/ig_100", ig_100),
    ("../../data/ig_200", ig_200),
    ("../../data/ig_500", ig_500),
):
    ig_frame.to_csv(ig_path)