In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

        
%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from sklearn.metrics import *
import warnings
import string
import time
import nltk
from sklearn.ensemble import RandomForestRegressor


/kaggle/input/spotify-app-reviews-2022/reviews.csv


In [2]:
data = pd.read_csv("../input/spotify-app-reviews-2022/reviews.csv")
data

Unnamed: 0,Time_submitted,Review,Rating,Total_thumbsup,Reply
0,2022-07-09 15:00:00,"Great music service, the audio is high quality...",5,2,
1,2022-07-09 14:21:22,Please ignore previous negative rating. This a...,5,1,
2,2022-07-09 13:27:32,"This pop-up ""Get the best Spotify experience o...",4,0,
3,2022-07-09 13:26:45,Really buggy and terrible to use as of recently,1,1,
4,2022-07-09 13:20:49,Dear Spotify why do I get songs that I didn't ...,1,1,
...,...,...,...,...,...
61589,2022-01-01 03:01:29,Even though it was communicated that lyrics fe...,1,6,
61590,2022-01-01 02:13:40,"Use to be sooo good back when I had it, and wh...",1,0,
61591,2022-01-01 01:02:29,This app would be good if not for it taking ov...,2,10,
61592,2022-01-01 00:49:23,The app is good hard to navigate and won't jus...,2,1,


In [3]:
data.Rating.value_counts()

5    22095
1    17653
4     7842
2     7118
3     6886
Name: Rating, dtype: int64

In [4]:
data.Review.str.split().apply(lambda x: len(x)).sort_values()

24628      2
30670      2
7083       3
11001      3
5731       3
        ... 
40973    185
26055    219
46004    222
20489    279
40566    699
Name: Review, Length: 61594, dtype: int64

In [5]:
# Sample text
sample_text = data.Review[5]
sample_text

'The player controls sometimes disappear for no reason. App restart forgets what I was playing but fixes the issue.'

In [6]:
# tokenaizer
word_tokenaizer = nltk.WordPunctTokenizer()

sample_text = sample_text.lower()

sample_text_lst = word_tokenaizer.tokenize(sample_text)

sample_text_lst

['the',
 'player',
 'controls',
 'sometimes',
 'disappear',
 'for',
 'no',
 'reason',
 '.',
 'app',
 'restart',
 'forgets',
 'what',
 'i',
 'was',
 'playing',
 'but',
 'fixes',
 'the',
 'issue',
 '.']

In [7]:
# stop words
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
print(len(stop_words))
stop_words[:10]

179


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [8]:
import re

# пунктуация. 
article_word_lst = [word for word in sample_text_lst if re.match(r'[a-zA-Z]+', word)]
article_word_lst

['the',
 'player',
 'controls',
 'sometimes',
 'disappear',
 'for',
 'no',
 'reason',
 'app',
 'restart',
 'forgets',
 'what',
 'i',
 'was',
 'playing',
 'but',
 'fixes',
 'the',
 'issue']

In [9]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


True

In [10]:
from nltk.stem.wordnet import WordNetLemmatizer

wnl = WordNetLemmatizer()

for word in article_word_lst:
    print(wnl.lemmatize(word))

the
player
control
sometimes
disappear
for
no
reason
app
restart
forgets
what
i
wa
playing
but
fix
the
issue


In [11]:
# full pipeline

def text_processing(text):
    
    # нижний регистр
    text = text.lower()
    
    # токенизация
    word_list = word_tokenaizer.tokenize(text)
    
    #убираем, всё кроме букв и цифр.
    word_list = [word for word in word_list if re.match(r'[a-zA-Z]+', word)]
    
    #убираем стоп слова
    word_list = [word for word in word_list if word not in stop_words]
    
    # леммезация
    word_list = [wnl.lemmatize(word) for word in word_list if word not in stop_words]
    
    out_text = ''
    for word in word_list:
        out_text += word + " " 
    
    return out_text

print(text_processing(data.Review[7]))

still extremely slow changing storage external sd card convinced done purpose spotify know issue done nothing solve time changed sd card faster read write speed samsung brand please add like song never appear search playlist 


In [12]:
data["lemmed_text"] = data.Review.apply(lambda a: text_processing(a))

**Split**

In [13]:
from sklearn.model_selection import train_test_split
X = data["lemmed_text"]
y = data["Rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)

(49275,) (12319,)


# tf-idf + Linear Model

In [14]:
#Вычисляем tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(max_features = 2000, norm=None)
vect.fit(X_train)

vect.get_feature_names()[:10]



['aap',
 'ability',
 'able',
 'absolute',
 'absolutely',
 'absurd',
 'acc',
 'accept',
 'acceptable',
 'access']

In [15]:
train_texts_vect = vect.fit_transform(X_train)
test_texts_vect = vect.transform(X_test)

In [16]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import GridSearchCV
from sklearn import linear_model

model = linear_model.Lasso(alpha=0.1)

parameters = {'alpha': [0.0005, 0.001, 0.003, 0.005]}

lmr = GridSearchCV(model, parameters, cv=3, scoring='neg_root_mean_squared_error')

lmr.fit(train_texts_vect, y_train)

print(lmr.best_params_, lmr.best_score_)

{'alpha': 0.003} -1.174693964488306


In [17]:
pred_train = lmr.predict(train_texts_vect)
pred_test = lmr.predict(test_texts_vect)


print("Train RMSE:", mean_squared_error(pred_train, y_train, squared=False), "Train MAE:", mean_absolute_error(pred_train, y_train))
print("Test RMSE:", mean_squared_error(pred_test, y_test, squared=False), "Test MAE:", mean_absolute_error(pred_test, y_test))

Train RMSE: 1.151951673623144 Train MAE: 0.9526137679749629
Test RMSE: 1.1695083566347775 Test MAE: 0.9717219550305707


In [18]:
word_score = pd.DataFrame(vect.get_feature_names(), columns=['word'])
word_score['coef'] = lmr.best_estimator_.coef_
word_score.sort_values("coef", ascending=False)



Unnamed: 0,word,coef
174,best,0.224871
1025,love,0.214433
743,great,0.183107
69,amazing,0.154244
138,awesome,0.148019
...,...,...
1760,terrible,-0.109746
983,lisa,-0.113654
1814,trash,-0.115710
1091,misinformation,-0.152887


# Word2Vec

In [19]:
from gensim.models import Word2Vec
#model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)

In [20]:
import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-100')

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


**Word2vec + RandomForest**

In [21]:
def buildWordVector(tokens, size, w2v_model):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    
    for word in tokens:
        #print(word)
        try:
            vec += w2v_model[word].reshape((1, size)) #* tfidf[word]  vect
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [22]:
train_vecs_w2v = np.concatenate([buildWordVector(z, 100, glove_vectors) for z in [text.split(" ") for text in X_train.values]])

test_vecs_w2v = np.concatenate([buildWordVector(z, 100, glove_vectors) for z in [text.split(" ") for text in X_test.values]])

In [23]:
'''
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
parameters = {'n_estimators': [15,30,50]}
rfg = GridSearchCV(model, parameters, cv=2, scoring='neg_root_mean_squared_error', verbose=3)
rfg.fit(train_vecs_w2v, y_train)
print(rfg.best_params_, rfg.best_score_)
'''

"\nfrom sklearn.ensemble import RandomForestRegressor\nmodel = RandomForestRegressor()\nparameters = {'n_estimators': [15,30,50]}\nrfg = GridSearchCV(model, parameters, cv=2, scoring='neg_root_mean_squared_error', verbose=3)\nrfg.fit(train_vecs_w2v, y_train)\nprint(rfg.best_params_, rfg.best_score_)\n"

In [24]:
rfg = RandomForestRegressor(n_estimators = 50)

rfg.fit(train_vecs_w2v, y_train)

pred_train = rfg.predict(train_vecs_w2v)
pred_test = rfg.predict(test_vecs_w2v)


print("Train RMSE:", mean_squared_error(pred_train, y_train, squared=False), "Train MAE:", mean_absolute_error(pred_train, y_train))
print("Test RMSE:", mean_squared_error(pred_test, y_test, squared=False), "Test MAE:", mean_absolute_error(pred_test, y_test))

Train RMSE: 0.4745515548799043 Train MAE: 0.37326727190723585
Test RMSE: 1.2228003477562537 Test MAE: 0.9945488899325886


**Word2vec + tf-idf + RandomForest**

In [25]:
def buildWordVectorTFIDF(tokens, size, w2v_model):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    
    for word in tokens:
        #print(word)
        try:
            vec += w2v_model[word].reshape((1, size)) * vect.vocabulary_[word]  
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [26]:
train_vecs_w2v_tfidf = np.concatenate([buildWordVectorTFIDF(z, 100, glove_vectors) for z in [text.split(" ") for text in X_train.values]])

test_vecs_w2v_tfidf = np.concatenate([buildWordVectorTFIDF(z, 100, glove_vectors) for z in [text.split(" ") for text in X_test.values]])

In [27]:
rfg = RandomForestRegressor(n_estimators = 50)

rfg.fit(train_vecs_w2v_tfidf, y_train)

pred_train = rfg.predict(train_vecs_w2v_tfidf)
pred_test = rfg.predict(test_vecs_w2v_tfidf)


print("Train RMSE:", mean_squared_error(pred_train, y_train, squared=False), "Train MAE:", mean_absolute_error(pred_train, y_train))
print("Test RMSE:", mean_squared_error(pred_test, y_test, squared=False), "Test MAE:", mean_absolute_error(pred_test, y_test))

Train RMSE: 0.4930068216201712 Train MAE: 0.3939987463938248
Test RMSE: 1.2681400374348575 Test MAE: 1.0476850704680447
