In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from nltk import download
from nltk import word_tokenize

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

import tensorflow as tf
from keras import layers


In [3]:
download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
df = pd.read_pickle('/content/drive/My Drive/dataset/p4k_reviews_dataset_processed.pkl')

df.head()

Unnamed: 0,url,score,pub_year,text
0,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,2017,"“Trip-hop” eventually became a ’90s punchline,..."
1,http://pitchfork.com/reviews/albums/22721-prel...,7.9,2017,"Eight years, five albums, and two EPs in, the ..."
2,http://pitchfork.com/reviews/albums/22659-all-...,7.3,2017,Minneapolis’ Uranium Club seem to revel in bei...
3,http://pitchfork.com/reviews/albums/22661-firs...,9.0,2017,Kleenex began with a crash. It transpired one ...
4,http://pitchfork.com/reviews/albums/22725-new-...,8.1,2017,It is impossible to consider a given release b...


In [9]:
nMax = 60

df = df.groupby('score').apply(lambda x: x.sample(n=min(nMax, len(x))))
pd.set_option('display.max_rows', 500)

In [11]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,url,score,pub_year,text
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,15899,http://pitchfork.com/reviews/albums/6255-liz-p...,0.0,2003,It could be said that Liz Phair's greatest ass...
0.0,11067,http://pitchfork.com/reviews/albums/10571-this...,0.0,2007,"""Indie's Biggest Hits""? ""Volume 1""? ""NOW that'..."
0.0,14537,http://pitchfork.com/reviews/albums/5607-travi...,0.0,2004,Travis Morrison got his ass kicked. He tells t...
0.0,13899,http://pitchfork.com/reviews/albums/6383-relax...,0.0,2005,If more drunks would learn from Robert Pollard...
0.1,17330,http://pitchfork.com/reviews/albums/6516-feel-...,0.1,2002,"Contrary to what you may believe, it's hard to..."
...,...,...,...,...,...
10.0,8627,http://pitchfork.com/reviews/albums/13436-magi...,10.0,2009,"After the death of manager Brian Epstein, the ..."
10.0,4699,http://pitchfork.com/reviews/albums/17497-illm...,10.0,2013,You hear the arthritic rumble of the train. Th...
10.0,862,http://pitchfork.com/reviews/albums/21866-dirt...,10.0,2016,"At the dawn of the 1980s, young black musician..."
10.0,15259,http://pitchfork.com/reviews/albums/1969-no-th...,10.0,2004,"""At its best new wave/punk represents a fundam..."


In [12]:
data = df['text'].map(word_tokenize).values
total_vocabulary = set(word.lower() for review in data for word in review)  # set created from nested comprehension
print('There are {} unique words in the dataset.'.format(len(total_vocabulary)))
print('There are {} unique tweets in the dataset.'.format(len(data)))


df.head()

There are 113379 unique words in the dataset.
There are 3869 unique tweets in the dataset.


Unnamed: 0_level_0,Unnamed: 1_level_0,url,score,pub_year,text
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,15899,http://pitchfork.com/reviews/albums/6255-liz-p...,0.0,2003,It could be said that Liz Phair's greatest ass...
0.0,11067,http://pitchfork.com/reviews/albums/10571-this...,0.0,2007,"""Indie's Biggest Hits""? ""Volume 1""? ""NOW that'..."
0.0,14537,http://pitchfork.com/reviews/albums/5607-travi...,0.0,2004,Travis Morrison got his ass kicked. He tells t...
0.0,13899,http://pitchfork.com/reviews/albums/6383-relax...,0.0,2005,If more drunks would learn from Robert Pollard...
0.1,17330,http://pitchfork.com/reviews/albums/6516-feel-...,0.1,2002,"Contrary to what you may believe, it's hard to..."


In [13]:
from keras.utils import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D, CuDNNLSTM
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers, optimizers
from keras.preprocessing import text, sequence

In [14]:
# set the emotion/sentiment as our target
target = df['score']

In [15]:
target

score       
0.0    15899     0.0
       11067     0.0
       14537     0.0
       13899     0.0
0.1    17330     0.1
                ... 
10.0   8627     10.0
       4699     10.0
       862      10.0
       15259    10.0
       5249     10.0
Name: score, Length: 3869, dtype: float64

In [17]:
# use one hot encoding since our target is categorical
y = target
#y = target

In [44]:
y

score       
0.0    15899     0.0
       11067     0.0
       14537     0.0
       13899     0.0
0.1    17330     0.1
                ... 
10.0   8627     10.0
       4699     10.0
       862      10.0
       15259    10.0
       5249     10.0
Name: score, Length: 3869, dtype: float64

In [45]:
# use keras to create a Tokenizer object
tokenizer = text.Tokenizer(num_words=40000)  # limit to the num_words most important ones
tokenizer.fit_on_texts(list(df['text']))
tokenized_texts = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(tokenized_texts, maxlen=200)

In [46]:
X

array([[  178,   157,   203, ...,    37,    53,  1586],
       [ 2793,    13,  2624, ...,    28,     1,  1522],
       [   12, 13502,    24, ...,     7,    13,   286],
       ...,
       [    9,     1,    33, ...,    32,   671,   277],
       [ 2347,   741,    32, ...,    29,  2102,  2776],
       [ 7182,   588,     6, ...,    36, 37215,  1723]], dtype=int32)

In [60]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# create my NN model
model = Sequential()

embedding_size = 128
model.add(Embedding(len(total_vocabulary), embedding_size))
model.add(Bidirectional(CuDNNLSTM(128, return_sequences=True)))
model.add(LSTM(128, return_sequences=True))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dense(1))  # use 5 because we have 5 categories
opt = optimizers.Adam(learning_rate=0.0025)
model.compile(loss='mse',
              optimizer=opt,
              metrics=['mse'])
model.summary() # check the shape

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, None, 128)         14512512  
                                                                 
 bidirectional_7 (Bidirectio  (None, None, 256)        264192    
 nal)                                                            
                                                                 
 lstm_7 (LSTM)               (None, None, 128)         197120    
                                                                 
 global_max_pooling1d_7 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dropout_14 (Dropout)        (None, 128)               0         
                                                                 
 dense_21 (Dense)            (None, 50)               

In [61]:
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7e95f228a6b0>

In [88]:
from sklearn.metrics import classification_report,accuracy_score
y_pred = model.predict(X_test) # get our predictions
y_pred = [item for sublist in y_pred for item in sublist]



In [89]:
from sklearn.metrics import mean_squared_error

rmse = mean_squared_error(y_test, y_pred, squared=False)
print("rmse:", rmse)

rmse: 1.247267054201802


In [90]:
mse = mean_squared_error(y_test, y_pred)
print("mse:", mse)

mse: 1.55635896073742


In [91]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, y_pred)
print("mae:", mae)

mae: 0.8878562644963426


In [92]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print("r2:", r2)

r2: 0.800970546026471


In [93]:
from scipy.stats import pearsonr
r = pearsonr(y_test, y_pred)
print(f"pearson's r{r}")

Pearson's r: PearsonRResult(statistic=0.80076991966660395)
