In [0]:
# Databricks restarts the kernal after every pip install call, so we have to install everything at once
%pip install pymongo[srv] wordcloud vaderSentiment torch transformers

In [0]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import pymongo

import pyspark
from pyspark.sql import Row
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType, StructType, StructField

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import transformers
from transformers import pipeline

import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [0]:
# Connection settings are read from the environment so that no credentials are
# stored in the notebook. Only the password has no default and must be provided.
# NOTE(review): the password that used to be committed in this cell should be rotated.
username = os.environ.get("MONGO_USERNAME", "admin")
password = os.environ.get("MONGO_PASSWORD")
if not password:
    raise RuntimeError(
        "Set the MONGO_PASSWORD environment variable before running this cell."
    )
database = os.environ.get("MONGO_DATABASE", "reviewdb")
collection_name = os.environ.get("MONGO_COLLECTION", "amazon")
ip_address = os.environ.get("MONGO_HOST", "msds697-goal-diggers.5f8jw.mongodb.net")

# User name and password are percent-escaped so that reserved URI characters
# (':', '@', '/', ...) in either value cannot corrupt the connection string.
client = pymongo.MongoClient(
    f"mongodb+srv://{quote_plus(username)}:{quote_plus(password)}@{ip_address}"
)
db = client[database]
amazon_collection = db[collection_name]

# Fetch one document as a quick connectivity / schema check (shown as the cell output)
amazon_collection.find_one()

In [0]:
# Akul
# Flatten the product documents into one DataFrame row per review.

# Upper bound on the number of review rows loaded into pandas
MAX_REVIEWS = 95000

projection = {
    "ASIN": 1,
    "reviews": 1,
}

# The stage list is called `review_pipeline` so that it does not shadow the
# `pipeline` function imported from transformers.
# `$limit` is applied right after `$unwind`, so the server returns only the
# first MAX_REVIEWS unwound reviews instead of the whole collection being
# downloaded and truncated afterwards.
review_pipeline = [
    {"$unwind": "$reviews"},
    {"$limit": MAX_REVIEWS},
    {"$project": projection},
]

results = amazon_collection.aggregate(review_pipeline)

# One record per unwound review: document id, product ASIN, review text, rating and title
data = [
    [
        doc['_id'],
        doc['ASIN'],
        doc['reviews']['review'],
        doc['reviews']['rating'],
        doc['reviews']['title'],
    ]
    for doc in results
]

df = pd.DataFrame(data, columns=['_id', 'ASIN', 'reviews', 'rating', 'title'])

In [0]:
# Ratings at or above this value are labelled as "good" reviews
threshold = 4

# Bag-of-words counts over each review body joined to its title by a space
vectorizer = CountVectorizer()
review_text = df['reviews'] + ' ' + df['title']

# fit_transform returns a sparse matrix; it is densified straight away
# NOTE(review): the dense document-term matrix can be very large - confirm it fits in memory
X = vectorizer.fit_transform(review_text).toarray()

# Binary target as a numpy array: 1 when the rating reaches the threshold, else 0
y = df['rating'].map(lambda rating: int(rating >= threshold)).values

# Tensor-backed dataset that pairs each feature row with its label
class ReviewDataset(Dataset):
    """In-memory dataset of (features, label) pairs held as float32 tensors."""

    def __init__(self, X, y):
        """Convert `X` and `y` to float32 tensors and store them as `self.X` / `self.y`."""
        as_float = dict(dtype=torch.float32)
        self.X = torch.tensor(X, **as_float)
        self.y = torch.tensor(y, **as_float)

    def __len__(self):
        """Return the number of samples, taken from the label tensor."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the `(features, label)` pair stored at position `idx`."""
        features, label = self.X[idx], self.y[idx]
        return features, label

# Wrap the features/labels and serve them in shuffled mini-batches of 32
dataset = ReviewDataset(X=X, y=y)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)

# Logistic regression expressed as a PyTorch module: one linear unit plus a sigmoid
class LogisticRegressionModel(nn.Module):
    """Single linear layer with one output, squashed through a sigmoid."""

    def __init__(self, input_size):
        """Create the linear layer mapping `input_size` features to one output."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=1)

    def forward(self, x):
        """Return sigmoid(linear(x)) for the input batch `x`."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

# Seed torch's global RNG so that the initial weights and the DataLoader's
# shuffling order are the same on every run of this cell.
torch.manual_seed(0)

model = LogisticRegressionModel(X.shape[1])

# Define the loss function and optimizer
# NOTE(review): L1 loss lines up with the MAE reported below, but binary
# cross-entropy is the usual loss for logistic regression - confirm this is intended.
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters())

# Train the model using PyTorch
model.train()
for epoch in range(5):
    running_loss = 0.0
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # labels come out of the loader as (batch,); add a trailing axis so
        # they line up with the (batch, 1) model output
        loss = criterion(outputs, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean of the per-batch losses for this epoch
    print('Epoch %d Loss: %.3f' % (epoch + 1, running_loss / len(dataloader)))

# Evaluate the model (on the same data it was trained on)
model.eval()
with torch.no_grad():
    # Reuse the float32 tensor already held by the dataset instead of
    # converting the dense X array a second time
    inputs = dataset.X
    outputs = model(inputs)
    predictions = outputs.squeeze().cpu().numpy()
    mae = np.mean(np.abs(predictions - y))
    print('MAE:', mae)

In [0]:
# Score a clearly negative review with the trained classifier
sample_text = "this item is a piece of junk, I hate it"

# Vectorise with the fitted vocabulary and convert to a float32 tensor
input_example = vectorizer.transform([sample_text]).toarray()
input_tensor = torch.tensor(input_example, dtype=torch.float32)

# Forward pass only - no gradients are needed for inference
with torch.no_grad():
    output_tensor = model(input_tensor)
prediction = output_tensor.item()

# Show the model output for this sample
print("Prediction:", prediction)

In [0]:
# Score a clearly positive review with the trained classifier
sample_text = "Great! I love it!"

# Vectorise with the fitted vocabulary and convert to a float32 tensor
input_example = vectorizer.transform([sample_text]).toarray()
input_tensor = torch.tensor(input_example, dtype=torch.float32)

# Forward pass only - no gradients are needed for inference
with torch.no_grad():
    output_tensor = model(input_tensor)
prediction = output_tensor.item()

# Show the model output for this sample
print("Prediction:", prediction)

In [0]:
# Patricia
# Smoke test of the eng_spacysentiment spaCy pipeline on a single sentence.
# Project page: https://spacy.io/universe/project/eng_spacysentiment
# The package is installed right before it is imported, so these two lines must stay in this order.
!pip install eng-spacysentiment
import eng_spacysentiment
# Load the pipeline once; later cells reuse `nlp`
nlp = eng_spacysentiment.load()
text = "Welcome to Arsenals official YouTube channel Watch as we take you closer and show you the personality of the club"
doc = nlp(text)
# doc.cats holds the per-category scores produced by the pipeline
print(doc.cats)
# The 'positive' score is used as the predicted rating
# (presumably in [0, 1], since it is compared against rating / 5 - TODO confirm)
rating_pred = doc.cats['positive']
print(rating_pred)
# Intended comparison against an Amazon rating, written here as pseudocode:
#   label = amazon_review / 5
#   error = rating_pred - label




In [0]:
# Patricia
# Spot check on one review: spaCy's 'positive' score versus the rating divided by 5
row = 8

text = df.loc[row, 'reviews']
print(text)

doc = nlp(text)
rating_pred = doc.cats['positive']
print(rating_pred)

# Rescale the rating by 1/5 so it can be compared with the score
label = df.loc[row, 'rating'] / 5
print(label)

# Signed difference between prediction and rescaled rating
error = rating_pred - label
print(error)

In [0]:
# Patricia
# Mean absolute error of spaCy's 'positive' score against rating / 5 over every row of df
df['rating_perc'] = df['rating'] / 5
# Run the spaCy pipeline on each review and keep the resulting Doc objects
df['review_nlp'] = df['reviews'].map(nlp)
df['spacy_prediction'] = df['review_nlp'].map(lambda parsed: parsed.cats['positive'])
df['spacy_error'] = (df['rating_perc'] - df['spacy_prediction']).abs()
mae = df['spacy_error'].mean()
mae # 0.31524060 

In [0]:
# Preview the first four rows whose rescaled rating is not exactly 1
df.loc[df['rating_perc'].ne(1)].head(4)

In [0]:
# VMK
# Build the Hugging Face sentiment-analysis pipeline used by the cells below.
# Alternatives previously left commented out here: the library's default
# "sentiment-analysis" model and "finiteautomata/bertweet-base-sentiment-analysis".
from transformers import pipeline

sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
)

In [0]:
# VMK

def m_bert(i):
    """Map a review to a sentiment score centred on 2.5 using `sentiment_pipeline`.

    The input is converted with `str()` and passed to the module-level
    `sentiment_pipeline`; the first result's `score` is divided by 0.4 (so a
    score of 1.0 contributes 2.5) and then added to the midpoint 2.5 for a
    'POSITIVE' label or subtracted from it for any other label.

    Args:
        i: The review to score; any object, it is stringified first.

    Returns:
        float: The rescaled score, or the neutral midpoint 2.5 when the
        pipeline call or the result lookup raises an ordinary exception.
    """
    try:
        result = sentiment_pipeline(str(i))[0]
        scaled = result['score'] / .4
        if result['label'] == 'POSITIVE':
            return 2.5 + scaled
        return 2.5 - scaled
    except Exception:
        # Deliberate best-effort fallback: a review that cannot be scored is
        # treated as neutral. Only `Exception` is caught so that
        # KeyboardInterrupt / SystemExit still stop a long-running apply.
        # NOTE(review): over-long inputs presumably end up here - consider
        # enabling truncation in the pipeline call instead; confirm.
        return 2.5

In [0]:
# VMK
# Score a 10-row sample with m_bert and measure the absolute error against the rating

df_s = df.head(10).copy()

df_s['m_prediction'] = df_s['reviews'].map(m_bert)
df_s['m_error'] = (df_s['rating'] - df_s['m_prediction']).abs()
mae = df_s['m_error'].mean()

# Show the first two scored rows transposed, one field per line
print(df_s.head(2).T)

mae #for the whole dataset: 0.795