In [1]:
# Mount Google Drive at /content/drive; the dataset path used later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 5.1 MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993242 sha256=6d9852e1856b455344e15549cc98ba8292b5a105f00da162a8a6675a0dbafd56
  Stored in directory: /root/.cache/pip/wheels/c5/96/8a/f90c59ed25d75e50a8c10a1b1c2d4c402e4dacfa87f3aff36a
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [3]:
# Report the total memory of this runtime in decimal gigabytes (bytes / 1e9).
from psutil import virtual_memory
total_ram_bytes = virtual_memory().total
ram_gb = total_ram_bytes / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 37.8 gigabytes of available RAM



In [6]:
#Runtime(tab) > Change runtime type in the menu to enable a GPU accelerator, and then re-execute the code cell.
tpu_info = !nvidia-smi
tpu_info = '\n'.join(tpu_info)
if tpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(tpu_info)

Not connected to a GPU


In [7]:
#import libraries
import pandas as pd
import numpy as np
import spacy
import re
import langdetect

In [8]:
#keras modules
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Dropout, Embedding, Bidirectional, SimpleRNN
from keras.utils.np_utils import to_categorical

In [9]:
#gensim modules
from gensim.models import word2vec

In [10]:
#sci-kit learn module
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [11]:
# Read the reviews CSV from the mounted Drive and rename its two columns to "review" and "rating".
path = "/content/drive/MyDrive/Colab Notebooks/proj_music/music_album_reviews.csv"
music = pd.read_csv(path).set_axis(["review", "rating"], axis=1)
print(music.head(2))
print(len(music))

                                              review  rating
0  i think i actually under-rate ok computer if a...     5.0
1  i get why radiohead rub a lot of people the wr...     5.0
80271


In [12]:
# Keep only complete rows: a row is discarded as soon as any of its values is missing.
music = music.dropna(axis=0, how='any')
print(music.head(2))
print(len(music))

                                              review  rating
0  i think i actually under-rate ok computer if a...     5.0
1  i get why radiohead rub a lot of people the wr...     5.0
78162


In [13]:
# Drop the rows whose review text is made up solely of numeric characters (str.isnumeric).
# Note that a string such as "4.5" contains a dot and is therefore NOT matched by isnumeric.
# .copy() detaches the filtered frame from its parent so that the column assignments made
# further down operate on an independent DataFrame instead of raising SettingWithCopyWarning.
music = music[~music['review'].str.isnumeric()].copy()
print(music.head())
print(len(music.index))

                                              review  rating
0  i think i actually under-rate ok computer if a...     5.0
1  i get why radiohead rub a lot of people the wr...     5.0
2  i would like to think i am good about not lett...     4.5
3  there are radiohead devotees like there were o...     4.0
4  i wrote a shining excellent review for this al...     5.0
78152


In [14]:
# Show the column dtypes, the distinct rating values and the average rating.
print(music.dtypes)
calif_uni = {rating for rating in music['rating']}
print(calif_uni)
print(np.mean(music['rating']))

review     object
rating    float64
dtype: object
{0.5, 1.5, 2.0, 2.5, 4.0, 4.5, 5.0, 3.0, 3.5, 1.0}
4.2531988944620736


In [15]:
def counting_words(sentence):
  """Return the number of whitespace-separated tokens in ``sentence``."""
  tokens = sentence.split()
  return len(tokens)

# Word count of every review before cleaning; peek at the first five counts and their mean.
lista_lens = [counting_words(review) for review in music['review']]

first_five = lista_lens[:5]
print(sorted(first_five, reverse=True))
print(np.mean(first_five))

[1478, 998, 813, 505, 438]
846.4


In [16]:
def clean_sentence(sentence):
  """Normalise a review into lower-case alphabetic words separated by single spaces.

  Steps: lower-case the text, replace every character that is neither a word
  character nor whitespace with a space, split on whitespace, and keep only the
  tokens made up entirely of letters (so numbers and mixed tokens like "1dog"
  are removed).

  Args:
    sentence: the raw review text (str).

  Returns:
    The cleaned text as a single string; empty if no alphabetic token remains.
  """
  sentence = sentence.lower()
  # Raw string: '\w' and '\s' are regex classes, not Python string escapes.
  sentence = re.sub(pattern = r'[^\w\s]', repl = ' ', string = sentence)
  # split() with no argument collapses any run of whitespace.
  return ' '.join(word for word in sentence.split() if word.isalpha())

# Quick demonstration of clean_sentence on a deliberately messy string.
sent = "THis is an ExamplE of the POWER, OF /&(&/()%$$%$#&, culiamera... 'fantochex'    ----    when tolo%&%meo"
cleaned_example = clean_sentence(sent)
print(cleaned_example)

this is an example of the power of culiamera fantochex when tolo meo


In [17]:
# Run clean_sentence over every review and store the cleaned text back in the 'review' column.
cleaned_reviews = music['review'].apply(clean_sentence)
music['review'] = cleaned_reviews
print(music.head())

                                              review  rating
0  i think i actually under rate ok computer if a...     5.0
1  i get why radiohead rub a lot of people the wr...     5.0
2  i would like to think i am good about not lett...     4.5
3  there are radiohead devotees like there were o...     4.0
4  i wrote a shining excellent review for this al...     5.0


In [18]:
# Add an 'n_words' column holding the word count of each cleaned review.
n_words = [counting_words(review) for review in music['review']]
music['n_words'] = n_words
print(music.head())

                                              review  rating  n_words
0  i think i actually under rate ok computer if a...     5.0      507
1  i get why radiohead rub a lot of people the wr...     5.0     1004
2  i would like to think i am good about not lett...     4.5     1492
3  there are radiohead devotees like there were o...     4.0      439
4  i wrote a shining excellent review for this al...     5.0      806


In [19]:
# Minimum, maximum and mean review length in words.
for summarise in (min, max, np.mean):
  print(summarise(music['n_words']))

0
5953
164.39128877060088


In [20]:
# Remove, in place, every review that has five words or fewer, then re-check the length statistics.
too_short = music['n_words'] <= 5
music.drop(index=music[too_short].index, inplace = True)
for summarise in (min, max, np.mean):
  print(summarise(music['n_words']))

6
5953
169.45183298582702


In [21]:
# Confirm the column dtypes and that an individual review (index label 15) is a Python str.
print(music.dtypes)
print(type(music.loc[15, 'review']))

review      object
rating     float64
n_words      int64
dtype: object
<class 'str'>


In [22]:
# Detect the language of every cleaned review and store the language code in 'detect_lang'.
# langdetect is non-deterministic on short or ambiguous texts unless its factory is seeded,
# so fix the seed before the first detection to make the set of rows kept reproducible.
langdetect.DetectorFactory.seed = 0
detect_lang = music['review'].map(langdetect.detect)
music['detect_lang'] = detect_lang

In [23]:
# Look at the first row and confirm the detected language (index label 0) is stored as a str.
print(music.head(1))
print(type(music.loc[0, 'detect_lang']))

                                              review  rating  n_words  \
0  i think i actually under rate ok computer if a...     5.0      507   

  detect_lang  
0          en  
<class 'str'>


In [24]:
# Remove, in place, every review whose detected language is not English ('en').
not_english = music['detect_lang'] != 'en'
music.drop(index=music[not_english].index, inplace = True)

In [25]:
# Renumber the rows 0..n-1 after the drops above; drop=True discards the old index
# instead of keeping it as an extra column.
music = music.reset_index(drop=True)

In [26]:
# Number of reviews left after all filtering steps.
print(music.shape[0])

69409


In [31]:
#Important parameters
max_len_sentences = 600 #maximum number of words kept per review; longer reviews are truncated and shorter ones padded to this length
window_words = 10 #context window size passed to word2vec
vec_size_words = 30 #number of elements in each word vector

In [32]:
# Tokenised corpus for word2vec: one list of words per review, i.e. a list of lists.
# Iterating over the column's values avoids one label lookup per row and does not
# depend on the index being exactly 0..n-1.
corpus_token = [review.split(' ') for review in music['review']]
print(corpus_token[:2])

[['i', 'think', 'i', 'actually', 'under', 'rate', 'ok', 'computer', 'if', 'anything', 'that', 'is', 'bonkers', 'right', 'it', 'is', 'been', 'my', 'favourite', 'album', 'for', 'over', 'two', 'decades', 'now', 'but', 'that', 'right', 'there', 'is', 'exactly', 'the', 'problem', 'i', 'heard', 'it', 'when', 'i', 'was', 'and', 'it', 'was', 'maybe', 'the', 'fourth', 'or', 'fifth', 'album', 'of', 'any', 'kind', 'i', 'would', 'ever', 'truly', 'paid', 'attention', 'to', 'front', 'to', 'back', 'and', 'the', 'first', 'that', 'was', 'not', 'essentially', 'forced', 'upon', 'me', 'by', 'my', 'parents', 'or', 'general', 'peer', 'pressure', 'at', 'school', 'it', 'was', 'the', 'first', 'time', 'i', 'would', 'ever', 'decided', 'it', 'might', 'be', 'worth', 'actually', 'trying', 'out', 'music', 'for', 'real', 'and', 'the', 'very', 'first', 'album', 'i', 'ever', 'landed', 'on', 'just', 'so', 'happened', 'to', 'be', 'the', 'best', 'one', 'nah', 'it', 'is', 'far', 'too', 'convenient', 'i', 'keep', 'side', 'e

In [33]:
# Train a word2vec model on the tokenised corpus so that each word is represented by a vector.
# min_count=1 keeps every word that appears in the corpus.
# NOTE(review): `size` and `iter` look like the gensim 3.x parameter names - confirm against
# the installed gensim version before upgrading.
w2v_model = word2vec.Word2Vec(
  sentences=corpus_token,
  size=vec_size_words,
  window=window_words,
  min_count=1,
  iter=50,
)
w2v_model.wv.most_similar(positive=['favourite'], topn=3)



[('favorite', 0.9962131977081299),
 ('fave', 0.9310210347175598),
 ('fav', 0.8619124293327332)]

In [34]:
# Find the three words closest to a reference word by cosine similarity in the w2v model.
# Only a missing-vocabulary KeyError is handled; any other error (or an interrupt) still surfaces.
try:
  print(w2v_model.wv.most_similar(['beautiful'], topn=3))
except KeyError:
  print("word not in the corpus")

[('gorgeous', 0.9342557191848755), ('haunting', 0.8774575591087341), ('uplifting', 0.8731911778450012)]


In [35]:
# Find the three words closest to a reference word by cosine similarity in the w2v model.
# Only a missing-vocabulary KeyError is handled; any other error (or an interrupt) still surfaces.
try:
  print(w2v_model.wv.most_similar(['power'], topn=3))
except KeyError:
  print("word not in the corpus")

[('energy', 0.7943024635314941), ('intensity', 0.7607317566871643), ('fury', 0.7553068399429321)]


In [36]:
# Vector arithmetic on words: subtract the 'lyrics' vector from the 'song' vector and
# list the three words closest to the resulting vector.
diff = np.subtract(w2v_model.wv['song'], w2v_model.wv['lyrics'])
w2v_model.wv.most_similar(positive=[diff], topn=3)

[('track', 0.6699579954147339),
 ('note', 0.6582359075546265),
 ('summermoroderbellottes', 0.6379653215408325)]

In [37]:
# Truncate every tokenised review to at most max_len_sentences words (still a list of lists of words).
# Slicing stops at the limit instead of walking through the remainder of long reviews.
corpus_max_len = [lista[:max_len_sentences] for lista in corpus_token]

In [38]:
# Sanity check: the longest truncated review must not exceed max_len_sentences words.
lenghts = list(map(len, corpus_max_len))

print(max(lenghts))

600


In [60]:
def word_into_vector(word):
  """Return the word2vec embedding of ``word`` as a plain Python list.

  Args:
    word: the token to look up in ``w2v_model``.

  Returns:
    The token's vector converted with ``tolist()``.
  """
  # Look up the argument itself; the quoted literal 'word' would return the same
  # vector (that of the token "word") for every input.
  vector_word = w2v_model.wv[word].tolist()
  return vector_word

# Build the data set with word vectors in place of words: one list of vectors per review.
corpus_vec = [
  [word_into_vector(word) for word in lista]
  for lista in corpus_max_len
]

In [61]:
# Inspect the nesting of corpus_vec: number of reviews, vectors in one review, elements
# in one vector, followed by the container type at each of the three levels.
print(len(corpus_vec))
print(len(corpus_vec[15]))
print(len(corpus_vec[0][0]))
for nested in (corpus_vec, corpus_vec[0], corpus_vec[0][0]):
  print(type(nested))

69409
600
30
<class 'list'>
<class 'list'>
<class 'list'>


In [62]:
#padding
# Zero vector used where a review has no more words; its length follows vec_size_words
# so it stays in sync with the word vectors if that parameter changes.
vector_pad = [0] * vec_size_words
# Build new padded lists rather than appending to the lists inside corpus_vec, so
# corpus_vec keeps its unpadded contents. A review that is already max_len_sentences
# long gets no padding (a non-positive repeat count yields an empty list).
corpus_vec_pad = [
  review + [vector_pad] * (max_len_sentences - len(review))
  for review in corpus_vec
]

In [68]:
# Spot-check the padded corpus: number of reviews, vectors in one review, elements in one vector.
padded_sizes = (
  len(corpus_vec_pad),
  len(corpus_vec_pad[65000]),
  len(corpus_vec_pad[1000][0]),
)
for size in padded_sizes:
  print(size)

69409
600
30


In [79]:
def create_labels(rating):
  """Map a numeric rating to a class label.

  Returns 0 for ratings up to and including 2, 1 for ratings above 2 and up to
  and including 3.5, and 2 for everything else.
  """
  return 0 if rating <= 2 else (1 if rating <= 3.5 else 2)

In [80]:
# Derive the class label of every review from its rating and show one example row.
music['label'] = music['rating'].apply(create_labels)
print(music.loc[89])

review         what other album inspires so many unsubstantia...
rating                                                       3.5
n_words                                                      424
detect_lang                                                   en
label                                                          1
Name: 89, dtype: object


In [81]:
# Scratch call of to_categorical on a sample label list with 4 classes; `a` is not
# referenced by any other cell visible in this notebook.
a = to_categorical([0, 1, 2, 3,3,2,1,0], num_classes=4)

In [82]:
#Create X matrix and Y labels
# Y: one-hot encoding of the three class labels.
Y = to_categorical(music['label'], num_classes=3)
# X: (reviews, words, vector elements). float32 halves the memory of the default float64
# (about 5 GB instead of about 10 GB at the 69409 x 600 x 30 shape this notebook reports).
X = np.array(corpus_vec_pad, dtype=np.float32)

In [85]:
# Shapes of the label matrix and of the input tensor, in that order.
for matrix in (Y, X):
  print(matrix.shape)

(69409, 3)
(69409, 600, 30)


In [None]:
# Hold out 25% of the samples for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
  X, Y, train_size=0.75, random_state=1
)

In [None]:
# LSTM classifier: one 128-unit LSTM layer followed by a softmax Dense layer
# with 3 nodes, one per class.
model = Sequential([
  LSTM(units=128),
  Dense(units=3, activation='softmax'),
])

In [None]:
# Compile with the Adam optimizer and categorical cross-entropy loss; track accuracy.
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)

In [None]:
# Fit the model on the training split for 20 epochs.
model.fit(x=x_train, y=y_train, epochs=20)

In [None]:
# Evaluate on the held-out split; evaluate returns the loss followed by the accuracy metric.
test_metrics = model.evaluate(x_test, y_test, verbose = 2)
score, acc = test_metrics
print('Test score: ', score)
print('Test accuracy: ', acc)

In [None]:
# Number of parameters in the model.
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()