In [24]:
import os 
import json 
import pandas as pd 
import nltk 
import numpy as np 
import string

In [25]:
sample_size = 10000
data = [json.loads(f) for f in open("Digital_Music_5.json", 'r')]
type(data) 
data[1:3] 

[{'reviewerID': 'AZPWAXJG9OJXV',
  'asin': '5555991584',
  'reviewerName': 'bethtexas',
  'helpful': [0, 0],
  'reviewText': "A clasically-styled and introverted album, Memory of Trees is a masterpiece of subtlety.  Many of the songs have an endearing shyness to them - soft piano and a lovely, quiet voice.  But within every introvert is an inferno, and Enya lets that fire explode on a couple of songs that absolutely burst with an expected raw power.If you've never heard Enya before, you might want to start with one of her more popularized works, like Watermark, just to play it safe.  But if you're already a fan, then your collection is not complete without this beautiful work of musical art.",
  'overall': 5.0,
  'summary': 'Enya at her most elegant',
  'unixReviewTime': 991526400,
  'reviewTime': '06 3, 2001'},
 {'reviewerID': 'A38IRL0X2T4DPF',
  'asin': '5555991584',
  'reviewerName': 'bob turnley',
  'helpful': [2, 2],
  'reviewText': "I never thought Enya would reach the sublime he

In [26]:
df = pd.DataFrame(data)

print(df.shape)
df.overall.value_counts(normalize = True)

(64706, 9)


overall
5.0    0.549872
4.0    0.255556
3.0    0.104921
2.0    0.046518
1.0    0.043134
Name: proportion, dtype: float64

In [27]:
df['Weights'] = np.where(df['overall'] < 5, .75, .25)
df['Weights'].unique()

array([0.25, 0.75])

In [28]:
sample = df.sample(n = sample_size, random_state = 10, weights = 'Weights')

sample = sample.dropna(how = 'any', subset = ['reviewText', 'overall'])

In [29]:
sample.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Weights
50925,A1QFQ9SQZVGUK4,B000BGR18W,Brandon L. Harlow,"[4, 23]",Oh why even bother wasting the energy to type....,1.0,well,1133481600,"12 2, 2005",0.75
1566,A2JAKHFYI88ZX0,B000000ORH,"Jake Z ""holden84""","[0, 0]","HEART OF STONE is a fine Cher album, although ...",4.0,80s Cher,1086480000,"06 6, 2004",0.75
43170,A3IPO6P4LHZ0NS,B0000AKCLJ,Joe,"[0, 1]",I just finished listening to this c.d. and I h...,4.0,RAVE ON!!!!!!!!,1067212800,"10 27, 2003",0.75
49625,ANBT9T1QKC662,B0009ML2BU,M. Manzino,"[2, 2]",I really love this CD! it is one of the most d...,5.0,A very different offering,1156982400,"08 31, 2006",0.25
35077,AKFDAV8I4FLUJ,B00005IBYZ,"""jcino""","[0, 0]",After a long time of success they broke up for...,5.0,Punk at it's best and it's saving rock and roll,994982400,"07 13, 2001",0.25


In [30]:
sample.dtypes

reviewerID         object
asin               object
reviewerName       object
helpful            object
reviewText         object
overall           float64
summary            object
unixReviewTime      int64
reviewTime         object
Weights           float64
dtype: object

In [31]:
import re
import string
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    
    text = re.sub(r"&[a-z]+;", " ", text)
    
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    
    text = text.lower()
    
    text = " ".join([word for word in text.split() if word not in stop_words])
    
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

sample['cleaned_text'] = sample['reviewText'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  text = BeautifulSoup(text, "html.parser").get_text()


In [32]:
import spacy
nlp = spacy.load('en_core_web_sm')

sample['tokenized_text'] = [
    [token.text for token in doc] for doc in nlp.pipe(sample['cleaned_text'], batch_size=50, disable=["ner"])
]

In [33]:
sample['filtered_text'] = sample['tokenized_text'].apply(
    lambda tokens: [token for token in tokens if token.lower() not in nlp.Defaults.stop_words]
)

In [34]:
sample['lemmatized_text'] = sample['filtered_text'].apply(
    lambda tokens: [token.lemma_ for token in nlp(' '.join(tokens))]
)

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

sample['lemmatized_text_str'] = sample['lemmatized_text'].apply(lambda x: ' '.join(x))

tfidf_vectorizer = TfidfVectorizer(
    max_features=2000,
    max_df=0.75,
    min_df=1,
    ngram_range=(1, 2)
)

tfidf_matrix = tfidf_vectorizer.fit_transform(sample['lemmatized_text_str'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print(tfidf_df.shape)
tfidf_df.head()


(10000, 2000)


Unnamed: 0,ability,able,absolute,absolutely,accessible,accompany,achieve,acoustic,acoustic guitar,act,...,year later,year old,yellow,yes,yo,york,young,young buck,youth,zeppelin
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.152947,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.16331,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
from gensim.models import Word2Vec

word2vec_model = Word2Vec(
    sentences=sample['lemmatized_text'],
    vector_size=50,  
    window=7,         
    min_count=5,      
    workers=4         
)

def vectorize_text(text, model):
    vectors = [model.wv[word] for word in text if word in model.wv]
    
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    
    return np.mean(vectors, axis=0)

sample['word2vec_vector'] = sample['lemmatized_text'].apply(lambda x: vectorize_text(x, word2vec_model))

word2vec_df = pd.DataFrame(sample['word2vec_vector'].to_list(), index=sample.index)

print(word2vec_df.shape)
word2vec_df.head()


(10000, 50)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
50925,0.044123,-0.441534,-0.015054,0.321689,0.487549,-0.829639,0.587707,0.795043,-0.338871,-0.723508,...,0.712102,-1.383675,0.045149,-0.007908,1.287121,-0.442357,-0.518548,-0.492448,0.725891,-0.174881
1566,-0.475525,0.16904,0.007491,-0.249216,0.204232,0.203176,0.341003,0.53628,-0.33395,-0.438506,...,0.924538,-0.535002,0.026267,0.317218,0.872393,0.068234,-0.521551,-0.711135,0.415662,0.466742
43170,-0.253319,-0.030798,-0.188804,-0.333921,0.098237,-0.197093,0.319096,0.630244,-0.311729,-0.458982,...,0.881523,-0.315246,0.121155,0.191215,0.878401,-0.122368,-0.300516,-0.536557,0.118423,0.31434
49625,-0.102674,0.114792,-0.056194,-0.298529,0.2325,-0.103922,0.40859,0.720221,-0.325645,-0.316213,...,0.751687,-0.203158,-0.007745,0.217109,0.772784,-0.008614,-0.367184,-0.447807,0.046948,0.306255
35077,-0.256562,0.059517,0.091589,-0.458112,-0.214532,-0.064476,0.20388,0.726129,-0.05565,-0.53281,...,1.057915,-0.304365,0.02136,0.134737,0.821108,-0.387055,-0.573233,-0.780739,0.086665,0.463626


In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline

# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer()),
#     ('clf', LogisticRegression(solver='saga', max_iter=500))
# ])

# param_grid = {
#     'tfidf__max_features': [500, 1000, 2000],       
#     'tfidf__ngram_range': [(1, 1), (1, 2)],         
#     'tfidf__max_df': [0.75, 0.85, 1.0],              
#     'tfidf__min_df': [1, 2, 5],                    
#     'clf__C': [0.1, 1, 10]                         
# }

# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')  
# grid_search.fit(sample['lemmatized_text_str'], sample['overall'])

# print("best param:", grid_search.best_params_)
# print("best score:", grid_search.best_score_)

In [None]:
# from gensim.models import Word2Vec
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# import numpy as np

# def vectorize_text(text, model):
#     vectors = [model.wv[word] for word in text if word in model.wv]
#     if vectors:
#         return np.mean(vectors, axis=0)
#     else:
#         return np.zeros(model.vector_size)

# X_train, X_test, y_train, y_test = train_test_split(
#     sample['lemmatized_text'], sample['overall'], test_size=0.2, random_state=42
# )

# best_score = 0
# best_params = {}

# for vector_size in [50, 100, 200]:
#     for window in [3, 5, 7]:
#         for min_count in [1, 2, 5]:
#             print(f"param: vector_size={vector_size}, window={window}, min_count={min_count}")
            
#             word2vec_model = Word2Vec(
#                 sentences=X_train,
#                 vector_size=vector_size,
#                 window=window,
#                 min_count=min_count,
#                 workers=4
#             )
            
#             X_train_vectors = np.array([vectorize_text(text, word2vec_model) for text in X_train])
#             X_test_vectors = np.array([vectorize_text(text, word2vec_model) for text in X_test])
            
#             clf = RandomForestClassifier()
#             clf.fit(X_train_vectors, y_train)
#             y_pred = clf.predict(X_test_vectors)

#             score = accuracy_score(y_test, y_pred)
#             print(f"score: {score}")
            
#             if score > best_score:
#                 best_score = score
#                 best_params = {'vector_size': vector_size, 'window': window, 'min_count': min_count}

# print("best param:", best_params)
# print("best score:", best_score)


In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(
#     sample['lemmatized_text_str'], sample['overall'], test_size=0.2, random_state=42
# )

# tfidf_vectorizer = TfidfVectorizer(
#     max_features=2000,
#     ngram_range=(1, 2),
#     max_df=0.75,
#     min_df=1
# )
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# X_test_tfidf = tfidf_vectorizer.transform(X_test)

# logistic_clf = LogisticRegression(C=1, solver='saga', max_iter=500)
# logistic_clf.fit(X_train_tfidf, y_train)

# y_pred_tfidf = logistic_clf.predict(X_test_tfidf)
# accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)

# print("ac lr tfidf:", accuracy_tfidf)


In [None]:
# from gensim.models import Word2Vec
# from sklearn.ensemble import RandomForestClassifier
# import numpy as np

# word2vec_model = Word2Vec(
#     sentences=sample['lemmatized_text'],
#     vector_size=50,
#     window=7,
#     min_count=5
# )

# def vectorize_text(text, model):
#     vectors = [model.wv[word] for word in text if word in model.wv]
#     return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# X_train_word2vec = np.array([vectorize_text(text, word2vec_model) for text in X_train])
# X_test_word2vec = np.array([vectorize_text(text, word2vec_model) for text in X_test])

# random_forest_clf = RandomForestClassifier()
# random_forest_clf.fit(X_train_word2vec, y_train)

# y_pred_word2vec = random_forest_clf.predict(X_test_word2vec)
# accuracy_word2vec = accuracy_score(y_test, y_pred_word2vec)

# print("ac rf w2v:", accuracy_word2vec)


In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import StackingClassifier
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import train_test_split
# import numpy as np
# from gensim.models import Word2Vec
# from sklearn.feature_extraction.text import TfidfVectorizer

# X_train, X_test, y_train, y_test = train_test_split(
#     sample['lemmatized_text_str'], sample['overall'], test_size=0.2, random_state=42
# )

# tfidf_vectorizer = TfidfVectorizer(
#     max_features=2000,
#     ngram_range=(1, 2),
#     max_df=0.75,
#     min_df=1
# )
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# X_test_tfidf = tfidf_vectorizer.transform(X_test)

# logistic_clf = LogisticRegression(C=1, solver='saga', max_iter=500)

# word2vec_model = Word2Vec(
#     sentences=sample['lemmatized_text'],
#     vector_size=50,
#     window=7,
#     min_count=5
# )

# def vectorize_text(text, model):
#     vectors = [model.wv[word] for word in text if word in model.wv]
#     return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# X_train_word2vec = np.array([vectorize_text(text, word2vec_model) for text in X_train])
# X_test_word2vec = np.array([vectorize_text(text, word2vec_model) for text in X_test])

# random_forest_clf = RandomForestClassifier()

# stacking_clf = StackingClassifier(
#     estimators=[
#         ('logistic', logistic_clf),
#         ('random_forest', random_forest_clf)
#     ],
#     final_estimator=LogisticRegression()
# )

# stacking_clf.fit(np.hstack([X_train_tfidf.toarray(), X_train_word2vec]), y_train)

# y_pred_stacking = stacking_clf.predict(np.hstack([X_test_tfidf.toarray(), X_test_word2vec]))
# accuracy_stacking = accuracy_score(y_test, y_pred_stacking)

# print("ac stack:", accuracy_stacking)


In [None]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_absolute_error, mean_squared_error
# import numpy as np

# X_train, X_test, y_train, y_test = train_test_split(
#     sample['lemmatized_text_str'], sample['overall'], test_size=0.2, random_state=42
# )

# tfidf_vectorizer = TfidfVectorizer(
#     max_features=2000,
#     ngram_range=(1, 2),
#     max_df=0.75,
#     min_df=1
# )
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# X_test_tfidf = tfidf_vectorizer.transform(X_test)

# word2vec_model = Word2Vec(
#     sentences=sample['lemmatized_text'],
#     vector_size=50,
#     window=7,
#     min_count=5
# )

# def vectorize_text(text, model):
#     vectors = [model.wv[word] for word in text if word in model.wv]
#     return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

# X_train_word2vec = np.array([vectorize_text(text, word2vec_model) for text in X_train])
# X_test_word2vec = np.array([vectorize_text(text, word2vec_model) for text in X_test])

# X_train_combined = np.hstack([X_train_tfidf.toarray(), X_train_word2vec])
# X_test_combined = np.hstack([X_test_tfidf.toarray(), X_test_word2vec])

# regressor = RandomForestRegressor(n_estimators=100, random_state=42)
# regressor.fit(X_train_combined, y_train)

# y_pred = regressor.predict(X_test_combined)

# mae = mean_absolute_error(y_test, y_pred)
# mse = mean_squared_error(y_test, y_pred)
# print("MAE:", mae)
# print("MSE:", mse)

# y_pred_rounded = np.round(y_pred)
# accuracy = (y_pred_rounded == y_test).mean()
# print("ac:", accuracy)


MAE: 0.755145
MSE: 0.97393195
Точность после округления: 0.413


In [None]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.metrics import mean_absolute_error, mean_squared_error
# import numpy as np

# X_train_small, _, y_train_small, _ = train_test_split(X_train_combined, y_train, test_size=0.8, random_state=42)

# param_grid = {
#     'n_estimators': [100, 150, 200],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
# }

# rf_regressor = RandomForestRegressor(random_state=42)
# random_search = RandomizedSearchCV(
#     estimator=rf_regressor,
#     param_distributions=param_grid,
#     n_iter=10, 
#     scoring='neg_mean_absolute_error',
#     cv=3,
#     random_state=42,
#     n_jobs=-1,
#     verbose=2
# )

# random_search.fit(X_train_small, y_train_small)
# print("best param:", random_search.best_params_)
# print("mae:", -random_search.best_score_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Лучшие параметры: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 10}
Лучшее качество (MAE) на уменьшенном наборе: 0.7841623217226


In [None]:

# best_rf_regressor = random_search.best_estimator_
# best_rf_regressor.fit(X_train_combined, y_train)

# y_pred = best_rf_regressor.predict(X_test_combined)


# mae = mean_absolute_error(y_test, y_pred)
# mse = mean_squared_error(y_test, y_pred)
# rmse = np.sqrt(mse) 

# print("mae:", mae)
# print("mse:", mse)
# print("rmse:", rmse)

# y_pred_rounded = np.round(y_pred)
# accuracy = (y_pred_rounded == y_test).mean()
# print("ac:", accuracy)

MAE с оптимизированным RandomForest на полном наборе данных: 0.7927167105273655
MSE с оптимизированным RandomForest на полном наборе данных: 1.0475253521553831
RMSE с оптимизированным RandomForest на полном наборе данных: 1.023486859786379
Точность с округленными предсказаниями: 0.408
