Preprocessing

In [9]:
import warnings
warnings.filterwarnings('ignore')

In [10]:
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt

pd.options.display.max_colwidth = 200
%matplotlib inline

In [11]:
#Reading in csv file into dataframe
rawdata = pd.read_csv("C:/Users/ajfly/Spotify-2000manualcleaned.csv")
rawdata.head(2)

Unnamed: 0,Index,Title,Artist,Top Genre,Year,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity
0,1,Sunrise,Norah Jones,adult standards,2004,157,30,53,-14,11,68,201,94,3,71
1,2,Black Night,Deep Purple,album rock,2000,135,79,50,-11,17,81,207,17,7,39


In [13]:
#For our Recommendation Engine, the team deemed only three columns were needed
needed_data = rawdata[["Title", "Artist", "Top Genre"]]
needed_data.head(1)

Unnamed: 0,Title,Artist,Top Genre
0,Sunrise,Norah Jones,adult standards


In [21]:
#Simple EDA
print('The shape of the data: ', needed_data.shape)
print()
print('The sum of null values: ', needed_data.isnull().sum())

The shape of the data:  (1994, 3)

The sum of null values:  Title        0
Artist       0
Top Genre    0
dtype: int64


In [17]:
#normalizing data
#Put each column into a list
titles = needed_data['Title'].tolist()
artists = needed_data['Artist'].tolist()
genres = needed_data['Top Genre'].tolist()
#Combine lists to create a corpus like data
corpus = titles + artists + genres

In [18]:
#Normalizing the data by turning text into lower case, removing special characters and whitespaces, and tokenization.
tokenizer = nltk.WordPunctTokenizer()
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

def normalized(text):
  text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
  text = text.lower()
  text  = text.strip()
  tokens = tokenizer.tokenize(text)

  filtered_tokens = [token for token in tokens if token not in stop_words]
  text   = ' '.join(filtered_tokens)
  return text
normalizedText = np.vectorize(normalized)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ajfly\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
normCorpus = normalizedText(corpus)

In [23]:
#implementing BOW Model
from sklearn.feature_extraction.text import CountVectorizer
# get bag of words features in sparse format
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(normCorpus)

In [24]:
#Implementing TF-IDF Model
from sklearn.feature_extraction.text import TfidfTransformer

tt = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
tt_matrix = tt.fit_transform(cv_matrix)

tt_matrix = tt_matrix.toarray()
vocab = cv.get_feature_names()
pd.DataFrame(np.round(tt_matrix, 2), columns=vocab)

Unnamed: 0,aan,aanzoek,abba,abel,absolute,absolution,accidentally,acda,acdc,ace,...,zone,zonnestralen,zou,zoutelande,zucchero,zullen,zuuje,zwart,zweet,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
#Implementing text similary (Cosine)
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(tt_matrix)
similarity_df = pd.DataFrame(similarity_matrix)
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5972,5973,5974,5975,5976,5977,5978,5979,5980,5981
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
5978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
5979,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.267202,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
#Implementing Toplic Extraction
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation()
dt_matrix = lda.fit_transform(cv_matrix)
features = pd.DataFrame(dt_matrix)
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.550000,0.050000,0.050000,0.050000
1,0.699981,0.033349,0.033335,0.033333,0.033335,0.033333,0.033333,0.033333,0.033333,0.033333
2,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.700000,0.033333,0.033333,0.033333
3,0.050000,0.550000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000
4,0.274995,0.025000,0.025009,0.025000,0.025002,0.025000,0.524995,0.025000,0.025000,0.025000
...,...,...,...,...,...,...,...,...,...,...
5977,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.700000
5978,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.033333,0.700000
5979,0.033333,0.033333,0.366667,0.033333,0.033333,0.033333,0.033333,0.366657,0.033343,0.033333
5980,0.550000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000,0.050000
