In [1]:
import pandas as pd
import numpy as np
import polars as pl
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import gzip
import shutil
import pathlib
import os
import sqlalchemy
import sqlite3
import spacy
import re
import tqdm
from tqdm.notebook import tqdm, trange
import ipywidgets as widgets
from ipywidgets import IntProgress, HTML, VBox
from IPython.display import display
import time
import timeit

from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.functional import softmax

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid_obj = SentimentIntensityAnalyzer()

%matplotlib inline
alt.data_transformers.disable_max_rows()

nlp = spacy.load("en_core_web_sm")

In [2]:
def print_files_in_directory(directory_path):
    with os.scandir(directory_path) as entries:
        for entry in entries:
            if entry.is_file():
                print(entry.name)

def get_sentiment_score(text):
    sentiment_scores = sid_obj.polarity_scores(text)
    return sentiment_scores

In [3]:
cleaned_reviews_dir_path = r"C:\Users\asl4a\PycharmProjects\pythonProject\Cleaned Reviews"

cleaned_reviews_list = []

In [4]:
print_files_in_directory(cleaned_reviews_dir_path)

Cleaned_Reviews_1.csv
Cleaned_Reviews_2.csv
Cleaned_Reviews_3.csv
Cleaned_Reviews_4.csv
Cleaned_Reviews_6.csv
Cleaned_Reviews_7.csv
Cleaned_Reviews_8.csv
Cleaned_Reviews_9.csv


In [5]:
with os.scandir(cleaned_reviews_dir_path) as entries:
    for entry in entries:
        if entry.is_file():
            print(entry.path)
            reviews_data = pd.read_csv(entry.path)
            cleaned_reviews_list.append(reviews_data)

reviews_df = pd.concat(cleaned_reviews_list)

C:\Users\asl4a\PycharmProjects\pythonProject\Cleaned Reviews\Cleaned_Reviews_1.csv
C:\Users\asl4a\PycharmProjects\pythonProject\Cleaned Reviews\Cleaned_Reviews_2.csv
C:\Users\asl4a\PycharmProjects\pythonProject\Cleaned Reviews\Cleaned_Reviews_3.csv
C:\Users\asl4a\PycharmProjects\pythonProject\Cleaned Reviews\Cleaned_Reviews_4.csv
C:\Users\asl4a\PycharmProjects\pythonProject\Cleaned Reviews\Cleaned_Reviews_6.csv
C:\Users\asl4a\PycharmProjects\pythonProject\Cleaned Reviews\Cleaned_Reviews_7.csv
C:\Users\asl4a\PycharmProjects\pythonProject\Cleaned Reviews\Cleaned_Reviews_8.csv
C:\Users\asl4a\PycharmProjects\pythonProject\Cleaned Reviews\Cleaned_Reviews_9.csv


In [6]:
reviews_df['comments'] = reviews_df['comments'].astype(str)
reviews_df.head()

Unnamed: 0.1,Unnamed: 0,temp_index,listing_id,id,date,reviewer_id,reviewer_name,comments,cleaned_text,tokens
0,0,0,6422,1927,2009-04-30,14100,Melissa,I can't say enough about how wonderful it was ...,i cant say enough about how wonderful it was t...,"['not', 'wonderful', 'stay', 'highlight', 'sta..."
1,1,1,6422,3867,2009-06-11,17413,Raquel,Michelle and Collier's home is wonderful! They...,michelle and colliers home is wonderful they a...,"['michelle', 'collier', 'home', 'wonderful', '..."
2,2,2,6422,4159,2009-06-17,20253,Ulrike,I spent one night at Michele's home and felt j...,i spent one night at micheles home and felt ju...,"['spend', 'night', 'micheles', 'home', 'feel',..."
3,3,3,6422,5724,2009-07-18,22544,Phil,Michele and Collier are two of the loveliest p...,michele and collier are two of the loveliest p...,"['michele', 'collier', 'lovely', 'people', 'pl..."
4,4,4,6422,11891,2009-09-29,33409,Claire,We had the most lovely time staying with Miche...,we had the most lovely time staying with miche...,"['lovely', 'time', 'stay', 'michele', 'colly',..."


In [7]:
reviews_df.dtypes

Unnamed: 0        int64
temp_index        int64
listing_id        int64
id                int64
date             object
reviewer_id       int64
reviewer_name    object
comments         object
cleaned_text     object
tokens           object
dtype: object

In [8]:
len(reviews_df)

10668047

In [9]:
reviews_df.to_csv("Reviews_df.csv")

In [8]:
|if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
device = torch.device(dev)
#a = torch.zeros(4,3)
#a = a.to(device)

In [9]:
print(device)

cuda:0


In [10]:
start_time = time.time()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.to(device)

seed = 966
torch.manual_seed(seed)

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_sentiment_score(text):
    inputs = tokenizer(text, return_tensors='pt',truncation=True,padding=True)
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model(**inputs)

        # Apply softmax to get probabilities
        probs = softmax(outputs.logits, dim=-1)

        # The second index corresponds to the positive class in sentiment analysis
        positive_probability = probs[0,1].item()

        return positive_probability


# This hasn't been run yet
reviews_df['bert_sentiments'] = reviews_df['comments'][10000000:].apply(get_sentiment_score)

# Use for testing
#reviews_df['bert_sentiments'] = reviews_df['comments'][:10000].apply(get_sentiment_score)

#print(device)
end_time = time.time()
print(end_time - start_time)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


4557.297260046005


In [11]:
reviews_df2 = pd.DataFrame(reviews_df[['Unnamed: 0', 'bert_sentiments']])

In [12]:
reviews_df2.to_csv('reviews_df2_processed_16.csv')

In [13]:
len(reviews_df)

10668047

In [22]:
reviews_df

Unnamed: 0.1,Unnamed: 0,temp_index,listing_id,id,date,reviewer_id,reviewer_name,comments,cleaned_text,tokens,bert_sentiments
0,0,0,6422,1927,2009-04-30,14100,Melissa,I can't say enough about how wonderful it was ...,i cant say enough about how wonderful it was t...,"['not', 'wonderful', 'stay', 'highlight', 'sta...",
1,1,1,6422,3867,2009-06-11,17413,Raquel,Michelle and Collier's home is wonderful! They...,michelle and colliers home is wonderful they a...,"['michelle', 'collier', 'home', 'wonderful', '...",
2,2,2,6422,4159,2009-06-17,20253,Ulrike,I spent one night at Michele's home and felt j...,i spent one night at micheles home and felt ju...,"['spend', 'night', 'micheles', 'home', 'feel',...",
3,3,3,6422,5724,2009-07-18,22544,Phil,Michele and Collier are two of the loveliest p...,michele and collier are two of the loveliest p...,"['michele', 'collier', 'lovely', 'people', 'pl...",
4,4,4,6422,11891,2009-09-29,33409,Claire,We had the most lovely time staying with Miche...,we had the most lovely time staying with miche...,"['lovely', 'time', 'stay', 'michele', 'colly',...",
...,...,...,...,...,...,...,...,...,...,...,...
1668040,9668040,1459400,904459169206385872,906632098517736492,2023-06-04,233078624,Chien-Hao,A very smooth stay in this place. The host is ...,a very smooth stay in this place the host is v...,"['smooth', 'stay', 'place', 'host', 'responsiv...",
1668041,9668041,1459401,902262639204577592,905911996152427208,2023-06-03,93526779,Celeine,The unit was new and very clean. It was a litt...,the unit was new and very clean it was a littl...,"['unit', 'new', 'clean', 'little', 'tricky', '...",
1668042,9668042,1459402,902275771323672573,904510451196355984,2023-06-01,517637293,Maria,"Everything was very easy, quick to book, and t...",everything was very easy quick to book and the...,"['easy', 'quick', 'book', 'place', 'clean', 'e...",
1668043,9668043,1459403,902279125007633591,906577547506923550,2023-06-04,99532747,Andres,This was a great space for two people! Its sma...,this was a great space for two people its smal...,"['great', 'space', 'people', 'small', 'effecti...",


In [23]:
torch.cuda.is_available()

True

In [15]:
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
device = torch.device(dev)
a = torch.zeros(4,3)
a = a.to(device)

In [15]:
# device = torch.device('cuda')
# print(torch.cuda.is_available())

False


In [16]:
#torch.cuda.current_device()

0

In [13]:
# cuda_id = torch.cuda.current_device()
# torch.cuda.get_device_name(cuda_id)

'NVIDIA GeForce RTX 3070'

In [14]:
# torch.version.cuda

'12.1'

In [17]:
#print(device)

cuda:0


In [None]:
reviews_df.to_csv('reviews_df_processed_1.csv')

In [10]:
reviews_df['bert_sentiments']

0          0.595194
1          0.571474
2          0.556534
3          0.574392
4          0.587863
             ...   
1668040         NaN
1668041         NaN
1668042         NaN
1668043         NaN
1668044         NaN
Name: bert_sentiments, Length: 10668047, dtype: float64

In [11]:
len(reviews_df['comments'])

10668047