In [1]:
import pandas as pd

# Load the semicolon-delimited album reviews file into a DataFrame.
reviews = pd.read_csv("reviews-albums.csv", sep=";")
# Preview the first rows of the loaded frame.
reviews.head()

Unnamed: 0,artist,title,text,genre,year,author,score
0,Xenia Rubinos,Una Rosa,Best new music The singer-songwriter’s latest...,Rock,2021,Contributor,8.4
1,Lana Del Rey,Blue Banisters,Lana Del Rey’s second album of the year is a ...,Pop/R&B,2021,Associate Editor,7.7
2,Black Marble,Fast Idol,Chris Stewart continues his decade-long quest...,Rock,2021,Contributor,6.8
3,UNIIQU3,Heartbeats,The breakout star of the regional subgenre kn...,Electronic,2021,Contributor,7.8
4,Dos Santos,City of Mirrors,Approaching the rich Latinx musical tradition...,Rock,2021,Contributor,7.4


In [2]:
# Inspect the dtype of each column; "year" is reported as object (strings), not a number.
reviews.dtypes

artist     object
title      object
text       object
genre      object
year       object
author     object
score     float64
dtype: object

## Engenharia de atributos

* Remover discos sem ano de lançamento
* Reduzir número de classes dos autores
* Aplicar One-Hot Encoding em variáveis categóricas

In [3]:
def check_nulls(df):
    """Summarize the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that holds at least one null, with the columns
        'Coluna' (column name), 'NANs (absoluto)' (number of nulls) and
        'NANs (porcentagem)' (number of nulls divided by the number of rows,
        i.e. a fraction between 0 and 1). Rows are sorted by null count in
        descending order. The result is empty, but still has these three
        columns, when nothing is missing or when ``df`` has no columns.
    """
    report_columns = ['Coluna', 'NANs (absoluto)', 'NANs (porcentagem)']
    n_rows = df.shape[0]
    rows = list()

    for column in df.columns:
        # Count the nulls once and reuse the value for both report fields.
        null_count = df[column].isnull().sum()

        rows.append({
            'Coluna': column,
            'NANs (absoluto)': null_count,
            # Guard against a 0 / 0 division on a frame without rows.
            'NANs (porcentagem)': null_count / n_rows if n_rows else 0.0
        })

    # Naming the columns explicitly keeps the filter below valid even when
    # `rows` is empty (a frame without columns would otherwise raise KeyError).
    res = pd.DataFrame(rows, columns=report_columns)

    return res[res["NANs (absoluto)"] > 0].sort_values('NANs (absoluto)', ascending=False)

# Report the columns with missing values; no rows are listed when no column contains nulls.
check_nulls(reviews)

Unnamed: 0,Coluna,NANs (absoluto),NANs (porcentagem)


In [4]:
# List the distinct "year" values to spot entries that are not a valid year.
reviews["year"].unique()

array(['2021', '1991', '1996', '2010', '1969', '1997', '1993', '1981',
       '1984', '1995', '1992', '2001', '2016', '1977', '1987', '1999',
       '1979', '1976', '2014', '1994', '2011', '2020', '1986', '1982',
       '1980', '1988', '1978', '2002', '1983', '1971', '2004', '1966',
       '2000', '1985', '1998', '1990', '2019', '2003', '1974', '2007',
       ' • ', '1972', '1973', '1975', '2009', '1962', '2012', '2008',
       '2006', '2013', '1989', '2018', '1970', '1968', '1957', '2017',
       '1967', '2015', '1965'], dtype=object)

In [5]:
# Remove reviews without a release year: the ' • ' placeholder shows up among
# the "year" values in place of a year. As before, a row is discarded when ANY
# of its columns equals the placeholder.
# `axis` is passed by keyword: the positional form `any(1)` is deprecated and
# rejected by recent pandas releases.
placeholder_rows = reviews.eq(' • ').any(axis=1)
reviews_year = reviews.drop(reviews[placeholder_rows].index)

# Convert year to int
reviews_year["year"] = pd.to_numeric(reviews_year["year"])

reviews_year["year"].unique()

array([2021, 1991, 1996, 2010, 1969, 1997, 1993, 1981, 1984, 1995, 1992,
       2001, 2016, 1977, 1987, 1999, 1979, 1976, 2014, 1994, 2011, 2020,
       1986, 1982, 1980, 1988, 1978, 2002, 1983, 1971, 2004, 1966, 2000,
       1985, 1998, 1990, 2019, 2003, 1974, 2007, 1972, 1973, 1975, 2009,
       1962, 2012, 2008, 2006, 2013, 1989, 2018, 1970, 1968, 1957, 2017,
       1967, 2015, 1965])

In [6]:
# List the distinct author roles; some values differ only by a trailing space.
reviews_year["author"].unique()

array(['Contributor', 'Associate Editor', 'Contributor ',
       'Contributing Editor', 'Assistant Editor', 'Staff Writer',
       'Features Editor', 'Staff Writer ', 'Associate Staff Writer, News',
       'Executive Editorial Assistant', 'Editorial Producer',
       'News Editor', 'Editorial Fellow', 'Reviews Editor',
       'Senior Staff Writer', 'Associate Staff Writer',
       'Contributing Writer', 'Senior Editor'], dtype=object)

In [7]:
reviews_author = reviews_year.copy()

# Collapse the author roles into four coarse groups using case-insensitive
# substring matches. The rules run in this exact order: each one overwrites the
# rows it matches, so later rules are tested against already-replaced labels.
author_rules = (
    ("contrib", "Contributor"),
    ("associate", "Associate"),
    ("editor", "Editor"),
    ("staff", "Staff"),
)

for keyword, group in author_rules:
    matches = reviews_author['author'].str.contains(keyword, case=False)
    reviews_author.loc[matches, "author"] = group

reviews_author["author"].unique()

array(['Contributor', 'Associate', 'Editor', 'Staff'], dtype=object)

In [8]:
# List the distinct genres that will be one-hot encoded.
reviews_author["genre"].unique()

array(['Rock', 'Pop/R&B', 'Electronic', 'Rap', 'Metal', 'Experimental',
       'Folk/Country', 'Jazz', 'Global'], dtype=object)

In [9]:
# One-hot encode the columns that are still textual, leaving the free-text and
# identifier columns ("text", "title", "artist") as they are.
category_columns = list(reviews_author.select_dtypes("object").columns)
for free_text_column in ("text", "title", "artist"):
    # list.remove raises ValueError if one of these columns is missing.
    category_columns.remove(free_text_column)

reviews_one_hot = pd.get_dummies(reviews_author, columns=category_columns)

reviews_one_hot.columns

Index(['artist', 'title', 'text', 'year', 'score', 'genre_Electronic',
       'genre_Experimental', 'genre_Folk/Country', 'genre_Global',
       'genre_Jazz', 'genre_Metal', 'genre_Pop/R&B', 'genre_Rap', 'genre_Rock',
       'author_Associate', 'author_Contributor', 'author_Editor',
       'author_Staff'],
      dtype='object')

## Análise de sentimento

* Aplicar diferentes algoritmos de análise de sentimento no conteúdo do texto
* Utilizar o resultado dessas análises como atributos para prever a nota

### Natural Language Toolkit

In [10]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the 'vader_lexicon' package through the NLTK downloader.
nltk.download('vader_lexicon')

# Analyzer instance used below to score each review text.
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/arthur/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [11]:
reviews_nltk = reviews_one_hot.copy()

# VADER "compound" score of every review text, in row order.
compound = [
    sid.polarity_scores(review_text)["compound"]
    for review_text in reviews_nltk["text"].values
]

reviews_nltk["nltk"] = compound
reviews_nltk[["nltk", "score"]].head()

Unnamed: 0,nltk,score
0,0.9556,8.4
1,0.999,7.7
2,0.9973,6.8
3,0.9838,7.8
4,0.9914,7.4


### TextBlob

In [12]:
from textblob import TextBlob

# Fetch the 'punkt' NLTK package; presumably needed by TextBlob — TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/arthur/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
reviews_textblob = reviews_nltk.copy()

# TextBlob sentiment polarity of every review text, in row order.
polarity = [
    TextBlob(review_text).sentiment.polarity
    for review_text in reviews_textblob["text"].values
]

reviews_textblob["text_blob"] = polarity
reviews_textblob[["text_blob", "score"]].head()

Unnamed: 0,text_blob,score
0,0.104145,8.4
1,0.140743,7.7
2,0.182267,6.8
3,0.068571,7.8
4,0.143653,7.4


### Flair

In [14]:
import flair

# Load the 'en-sentiment' text classifier used below to label each review.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

2021-10-24 18:08:23,528 loading file /home/arthur/.flair/models/sentiment-en-mix-distillbert_4.pt


In [15]:
reviews_flair = reviews_textblob.copy()
confidences = list()

for text in reviews_flair["text"].values:
    # Classify the whole review; only the first predicted label is used.
    review_sentence = flair.data.Sentence(text)
    flair_sentiment.predict(review_sentence)
    top_label = review_sentence.labels[0].to_dict()

    # The confidence is signed by the label: any label other than POSITIVE
    # is interpreted as a negative value.
    confidence = top_label["confidence"]
    if top_label["value"] != "POSITIVE":
        confidence = confidence * -1

    confidences.append(confidence)

reviews_flair["flair"] = confidences
reviews_flair[["flair", "score"]].head()

Unnamed: 0,flair,score
0,0.959007,8.4
1,0.895744,7.7
2,0.896525,6.8
3,0.991742,7.8
4,0.999932,7.4


In [16]:
# Persist the frame, including the "nltk", "text_blob" and "flair" columns,
# as a semicolon-delimited CSV without the index.
reviews_flair.to_csv("reviews-nlp.csv", index=False, sep=";")

In [17]:
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Reload the semicolon-delimited file holding the engineered and sentiment features.
processed_reviews = pd.read_csv("reviews-nlp.csv", sep=";")

# Preview the first rows of the reloaded frame.
processed_reviews.head()

Unnamed: 0,artist,title,text,year,score,genre_Electronic,genre_Experimental,genre_Folk/Country,genre_Global,genre_Jazz,...,genre_Pop/R&B,genre_Rap,genre_Rock,author_Associate,author_Contributor,author_Editor,author_Staff,nltk,text_blob,flair
0,Xenia Rubinos,Una Rosa,Best new music The singer-songwriter’s latest...,2021,8.4,0,0,0,0,0,...,0,0,1,0,1,0,0,0.9556,0.104145,0.959007
1,Lana Del Rey,Blue Banisters,Lana Del Rey’s second album of the year is a ...,2021,7.7,0,0,0,0,0,...,1,0,0,1,0,0,0,0.999,0.140743,0.895744
2,Black Marble,Fast Idol,Chris Stewart continues his decade-long quest...,2021,6.8,0,0,0,0,0,...,0,0,1,0,1,0,0,0.9973,0.182267,0.896525
3,UNIIQU3,Heartbeats,The breakout star of the regional subgenre kn...,2021,7.8,1,0,0,0,0,...,0,0,0,0,1,0,0,0.9838,0.068571,0.991742
4,Dos Santos,City of Mirrors,Approaching the rich Latinx musical tradition...,2021,7.4,0,0,0,0,0,...,0,0,1,0,1,0,0,0.9914,0.143653,0.999932


In [21]:
# Features are every column except the target ("score") and the free-text /
# identifier columns.
non_feature_columns = ["text", "score", "artist", "title"]
X = processed_reviews.drop(columns=non_feature_columns)
y = processed_reviews["score"]

# 70% training and 30% test; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
X_train.head()

Unnamed: 0,year,genre_Electronic,genre_Experimental,genre_Folk/Country,genre_Global,genre_Jazz,genre_Metal,genre_Pop/R&B,genre_Rap,genre_Rock,author_Associate,author_Contributor,author_Editor,author_Staff,nltk,text_blob,flair
3670,2018,1,0,0,0,0,0,0,0,0,0,1,0,0,0.9932,0.147222,0.99997
4888,2017,0,0,0,0,0,0,0,0,1,0,1,0,0,0.9828,0.06833,-0.998078
4643,2017,0,0,0,0,0,0,1,0,0,0,1,0,0,0.9976,0.184371,0.998743
437,2021,1,0,0,0,0,0,0,0,0,0,1,0,0,-0.8473,0.025312,-0.541597
518,2021,0,0,0,0,0,0,0,0,1,0,1,0,0,0.9919,0.083724,0.996192


In [22]:
# Fit a decision tree on the training split and predict the held-out scores.
tree_model = DecisionTreeRegressor(random_state=42)
tree_model.fit(X_train, y_train)
y_pred = tree_model.predict(X_test)

# NOTE: despite its name, `mse` holds the ROOT mean squared error (the name is
# kept so existing references keep working). The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error is
# deprecated and no longer accepted by recent scikit-learn releases.
mse = mean_squared_error(y_test, y_pred) ** 0.5
mse

1.0575035126396901

In [23]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the training split and predict the held-out scores.
rfce_model = RandomForestRegressor(random_state=42, n_estimators = 100)
rfce_model.fit(X_train, y_train)

rfce_y_pred = rfce_model.predict(X_test)

# NOTE: despite its name, `rfce_mse` holds the ROOT mean squared error (the
# name is kept so existing references keep working). The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error is
# deprecated and no longer accepted by recent scikit-learn releases.
rfce_mse = mean_squared_error(y_test, rfce_y_pred) ** 0.5

rfce_mse

0.7675083729799561

In [None]:
# for text, score in zip(reviews_one_hot["text"].values, reviews_one_hot["score"].values):
#     print(sid.polarity_scores(text)["compound"], score)
    
    # comp = list()

    # for sentence in text.split('.'):
    #     comp.append(float(sid.polarity_scores(sentence)["compound"]))
    
    # print(f"{sum(comp) / len(comp):.3f} {score}\n")
    
    # break

In [None]:
# Text of the first review having a value equal to 1.6 in any column
# (raises IndexError when no row matches).
# `axis` is passed by keyword: the positional form `any(1)` is deprecated and
# rejected by recent pandas releases.
text = reviews[reviews.eq(1.6).any(axis=1)]["text"].values[0]

# for sentence in text.split('.'):
#     print(sentence)
#     print(sid.polarity_scores(sentence))
#     print(TextBlob(sentence).sentiment)
#     print()

# Classify the whole review with flair and display the predicted labels.
s = flair.data.Sentence(text)
flair_sentiment.predict(s)
total_sentiment = s.labels
total_sentiment

In [None]:
# Text of the first review having a value equal to 10.0 in any column
# (raises IndexError when no row matches).
# `axis` is passed by keyword: the positional form `any(1)` is deprecated and
# rejected by recent pandas releases.
text = reviews[reviews.eq(10.0).any(axis=1)]["text"].values[0]

# for sentence in text.split('.'):
#     print(sentence)
#     print(sid.polarity_scores(sentence))
#     print(TextBlob(sentence).sentiment)
#     print()

# Classify the whole review with flair and display the predicted labels.
s = flair.data.Sentence(text)
flair_sentiment.predict(s)
total_sentiment = s.labels
total_sentiment