<a href="https://colab.research.google.com/github/adityasharma10699/Data_Mining/blob/main/SpotifyReviews_UMAP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Reviews CSV location. The path sits under /content/drive, so this presumably needs
# Google Drive to be mounted in the Colab session first -- TODO confirm / add the mount step.
reviews_csv_path = '/content/drive/MyDrive/Data Mining/Class work/Week 8 and later/SpotifyReviews.csv'
df = pd.read_csv(reviews_csv_path)

In [None]:
# Inspect the freshly loaded frame: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35306 entries, 0 to 35305
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     35306 non-null  object
 1   Recommend  35306 non-null  object
dtypes: object(2)
memory usage: 551.8+ KB


In [None]:
# Preview the raw reviews and labels (the notebook abbreviates long frames to the
# leading and trailing rows).
df

Unnamed: 0,Review,Recommend
0,"Great music service, the audio is high quality...",Yes
1,Please ignore previous negative rating. This a...,Yes
2,Really buggy and terrible to use as of recently,No
3,Dear Spotify why do I get songs that I didn't ...,No
4,I love the selection and the lyrics are provid...,Yes
...,...,...
35301,One day I was able to switch between songs and...,No
35302,It was my favourite app. I feel sorry for arti...,No
35303,Back to one frkng star. First of all there's t...,No
35304,Even though it was communicated that lyrics fe...,No


In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# The identity entries (1 -> 1, 0 -> 0) make this cell safe to re-run: without them a
# second execution would map the already-encoded integers to NaN and wipe the column.
recommend_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
recommend_encoded = df['Recommend'].map(recommend_codes)

# Fail loudly on labels outside the mapping instead of silently carrying NaN forward.
unmapped_rows = recommend_encoded.isna()
if unmapped_rows.any():
    unexpected_labels = sorted(df.loc[unmapped_rows, 'Recommend'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'Recommend': {unexpected_labels}")

df['Recommend'] = recommend_encoded.astype('int64')

In [None]:
# Re-check the dtypes after the encoding step: 'Recommend' should now be an integer column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35306 entries, 0 to 35305
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     35306 non-null  object
 1   Recommend  35306 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 551.8+ KB


In [None]:
# Cleaning summaries
# The stop-word set and the lemmatizer are the same for every review, so they are built
# once (on first use) and cached here instead of being re-created on every call.
_CLEANER_RESOURCES = {}


def _cleaner_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _CLEANER_RESOURCES:
        _CLEANER_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANER_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEANER_RESOURCES['stop_words'], _CLEANER_RESOURCES['lemmatizer']


def cleaner(summary):
    """Turn one raw review into a list of lowercase, lemmatized, stop-word-free tokens.

    Steps, in order: strip HTML markup/entities, drop hashtags, @mentions and URLs,
    replace every run of non-alphabetic characters with a space, tokenize, lowercase,
    remove English stop words and lemmatize with WordNet.

    For more info on regular expressions visit
    https://docs.python.org/3/howto/regex.html

    Parameters
    ----------
    summary : str
        Raw review text. Any non-text value (e.g. a missing value read as NaN)
        yields an empty list instead of raising inside the HTML parser.

    Returns
    -------
    list of str
        The cleaned tokens; empty when nothing alphabetic survives the cleaning.
    """
    if not isinstance(summary, (str, bytes)):
        return []

    # Removing HTML entities such as '&amp', '&quot', '&gt'. lxml is the HTML parser
    # and should be installed using 'pip install lxml'.
    text = BeautifulSoup(summary, 'lxml').get_text()
    # Substituting hashtags, @mentions, urls, etc. with whitespace.
    text = re.sub(r"(#|@|http://|https://|www)\S*", " ", text)
    # Substituting any run of non-alphabetic characters with a single space.
    text = re.sub("[^A-Za-z]+", " ", text)

    stop_words, lemmatizer = _cleaner_resources()
    lowered = (token.lower() for token in nltk.word_tokenize(text))
    return [lemmatizer.lemmatize(token) for token in lowered if token not in stop_words]

# Tokenize every review; the raw text stays untouched in 'Review'.
df['cleaned_summary'] = df['Review'].apply(cleaner)

# Removing rows with cleaned summaries of length 0. The explicit .copy() makes `df` an
# independent frame, so the column assignment further down does not write into a slice
# of the original frame (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['cleaned_summary'].map(len) > 0].copy()

print("Printing top 5 rows of dataframe showing original and cleaned summaries....")
print(df[['Review','cleaned_summary']].head())

# Joining tokens to create strings: TfidfVectorizer expects one string per document,
# not a list of tokens.
df['cleaned_summary'] = df['cleaned_summary'].map(" ".join)
data = df['cleaned_summary']
Y = df['Recommend'] # target column

# A float min_df is a proportion of documents: each n-gram (unigram, bigram or trigram)
# must occur in at least min_df * n_documents reviews to be kept as a feature. With the
# 35,294 reviews that survive the cleaning this is 35294 * 0.0005 ~= 17.6, i.e. at least
# 18 reviews. This prunes very rare n-grams and keeps the vocabulary manageable.
tfidf = TfidfVectorizer(min_df=.0005, ngram_range=(1,3))
data_tfidf = tfidf.fit_transform(data) # learn the vocabulary and create the tfidf values in one pass
print("The created tokens: \n", tfidf.get_feature_names_out())
print("Shape of tfidf matrix: ", data_tfidf.shape)


  soup = BeautifulSoup(summary, 'lxml') # removing HTML entities such as ‘&amp’,’&quot’,'&gt'; lxml is the html parser and shoulp be installed using 'pip install lxml'


Printing top 5 rows of dataframe showing original and cleaned summaries....
                                              Review  \
0  Great music service, the audio is high quality...   
1  Please ignore previous negative rating. This a...   
2    Really buggy and terrible to use as of recently   
3  Dear Spotify why do I get songs that I didn't ...   
4  I love the selection and the lyrics are provid...   

                                     cleaned_summary  
0  [great, music, service, audio, high, quality, ...  
1  [please, ignore, previous, negative, rating, a...  
2           [really, buggy, terrible, use, recently]  
3  [dear, spotify, get, song, put, playlist, shuf...  
4  [love, selection, lyric, provided, song, liste...  
The created tokens: 
 ['aap' 'ability' 'ability play' ... 'yt music' 'zero' 'zero star']
Shape of tfidf matrix:  (35294, 5058)


In [None]:
!pip install umap-learn



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.graph_objs as go
import plotly.figure_factory as ff
import umap # use 'pip install umap-learn' or 'conda install -c conda-forge umap-learn'


In [None]:
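# NOTE: superseded first attempt (min_dist=0.1), kept commented out for reference only;
# the next cell is the version that is actually run. If this is ever re-enabled, the last
# line must use df['Recommend'] -- this DataFrame has no 'Category' column.
# # Implementing UMAP to visualize dataset
# import umap
# u = umap.UMAP(n_components = 2, n_neighbors=15, min_dist=0.1)
# x_umap = u.fit_transform(data_tfidf)
# digits=list(df['Category'])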


In [None]:
# Implementing UMAP to visualize dataset
u = umap.UMAP(n_components = 2, n_neighbors=15, min_dist=0.4)
x_umap = u.fit_transform(data_tfidf)

data = [go.Scatter(x=x_umap[:,0], y=x_umap[:,1], mode='markers',
                    marker = dict(color=df['Recommend'], colorscale='Rainbow', opacity=0.5),
                                text=[f'digit: {a}' for a in list(df['Recommend'])],
                                hoverinfo='text')]

layout = go.Layout(title = 'UMAP Dimensionality Reduction', width = 700, height = 700,
                    xaxis = dict(title='First Dimension'),
                    yaxis = dict(title='Second Dimension'))
fig = go.Figure(data=data, layout=layout)
fig.show()
