In [8]:
import pandas as pd
import numpy as np
import polars as pl
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import gzip
import shutil
import pathlib
import os
import sqlalchemy
import sqlite3
import spacy
import re
import spacy_cleaner
from spacy_cleaner import processing, Cleaner
import re
import tqdm
from tqdm.notebook import tqdm, trange
import ipywidgets as widgets
from ipywidgets import IntProgress, HTML, VBox
from IPython.display import display
import time
import timeit
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
import cupy as cp
import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.functional import softmax

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid_obj = SentimentIntensityAnalyzer()

%matplotlib inline
alt.data_transformers.disable_max_rows()


#spacy.prefer_gpu()
spacy.require_gpu(gpu_id=0)
nlp = spacy.load("en_core_web_sm")

# This loads a larger and more robust model. Use with caution though because it takes considerably longer to run
#nlp = spacy.load("en_core_web_trf")

#nlp = spacy.load('/path/to/en_core_web_sm')

In [2]:
def print_files_in_directory(directory_path):
    with os.scandir(directory_path) as entries:
        for entry in entries:
            if entry.is_file():
                print(entry.name)

def get_sentiment_score(text):
    sentiment_scores = sid_obj.polarity_scores(text)
    return sentiment_scores

def clean_text(text):
    doc = nlp(text)

    cleaned_tokens = [token.lemma_.lower().strip() for token in doc if not token.is_punct and not token.is_space]
    cleaned_tokens = [token for token in cleaned_tokens if not nlp.vocab[token].is_stop]
    cleaned_text = ' '.join(cleaned_tokens)

    return cleaned_text

def remove_numbers(text):
    cleaner = Cleaner(
        nlp,
        processing.remove_number_token
    )
    return cleaner.clean(text)

def remove_numbers_regex(text):
    # Pattern to remove numbers from text data
    pattern = r"\d+"

    return re.sub(pattern,"",text)

In [3]:
conn = sqlite3.connect(r"C:/Users/asl4a/AirBnB_Data.db")
cursor = conn.cursor()

In [8]:
cleaned_data_dir_path = r"F:\Data Science\Datasets\Cleaned Reviews"

cleaned_reviews_list = []

with os.scandir(cleaned_data_dir_path) as entries:
    for entry in entries:
        if entry.is_file():
            cleaned_reviews_list.append(pd.read_csv(entry.path))

cleaned_revs_df = pd.concat(cleaned_reviews_list)
cleaned_revs_df.head()

Unnamed: 0.1,Unnamed: 0,temp_index,listing_id,id,date,reviewer_id,reviewer_name,comments,cleaned_text,tokens
0,0,0,6422,1927,2009-04-30,14100,Melissa,I can't say enough about how wonderful it was ...,i cant say enough about how wonderful it was t...,"['not', 'wonderful', 'stay', 'highlight', 'sta..."
1,1,1,6422,3867,2009-06-11,17413,Raquel,Michelle and Collier's home is wonderful! They...,michelle and colliers home is wonderful they a...,"['michelle', 'collier', 'home', 'wonderful', '..."
2,2,2,6422,4159,2009-06-17,20253,Ulrike,I spent one night at Michele's home and felt j...,i spent one night at micheles home and felt ju...,"['spend', 'night', 'micheles', 'home', 'feel',..."
3,3,3,6422,5724,2009-07-18,22544,Phil,Michele and Collier are two of the loveliest p...,michele and collier are two of the loveliest p...,"['michele', 'collier', 'lovely', 'people', 'pl..."
4,4,4,6422,11891,2009-09-29,33409,Claire,We had the most lovely time staying with Miche...,we had the most lovely time staying with miche...,"['lovely', 'time', 'stay', 'michele', 'colly',..."


In [9]:
cleaned_revs_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10668047 entries, 0 to 1668044
Data columns (total 10 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   Unnamed: 0     int64 
 1   temp_index     int64 
 2   listing_id     int64 
 3   id             int64 
 4   date           object
 5   reviewer_id    int64 
 6   reviewer_name  object
 7   comments       object
 8   cleaned_text   object
 9   tokens         object
dtypes: int64(5), object(5)
memory usage: 895.3+ MB


In [10]:
print(len(cleaned_revs_df.drop_duplicates()))

9668066


In [7]:
sql = """
SELECT
    l.host_id,
    l.id,
    l.review_scores_rating,
    l.review_scores_accuracy,
    l.review_scores_cleanliness,
    l.review_scores_checkin,
    l.review_scores_communication,
    l.review_scores_location,
    l.review_scores_value
FROM
    listings l;
    """

listings_df = pd.read_sql(sql=sql,con=conn)

In [5]:
listings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228701 entries, 0 to 228700
Data columns (total 9 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   host_id                      228701 non-null  int64  
 1   id                           228701 non-null  int64  
 2   review_scores_rating         179233 non-null  float64
 3   review_scores_accuracy       178325 non-null  float64
 4   review_scores_cleanliness    178339 non-null  float64
 5   review_scores_checkin        178313 non-null  float64
 6   review_scores_communication  178333 non-null  float64
 7   review_scores_location       178306 non-null  float64
 8   review_scores_value          178305 non-null  float64
dtypes: float64(7), int64(2)
memory usage: 15.7 MB


In [11]:
df_revs = pd.concat([listings_df,cleaned_revs_df],keys=['listing_id','id'])
#df_revs = pd.merge(listings_df,cleaned_revs_df,right_on='listing_id',left_on='id)

In [12]:
df_revs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10896748 entries, ('listing_id', 0) to ('id', 1668044)
Data columns (total 18 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   host_id                      float64
 1   id                           int64  
 2   review_scores_rating         float64
 3   review_scores_accuracy       float64
 4   review_scores_cleanliness    float64
 5   review_scores_checkin        float64
 6   review_scores_communication  float64
 7   review_scores_location       float64
 8   review_scores_value          float64
 9   Unnamed: 0                   float64
 10  temp_index                   float64
 11  listing_id                   float64
 12  date                         object 
 13  reviewer_id                  float64
 14  reviewer_name                object 
 15  comments                     object 
 16  cleaned_text                 object 
 17  tokens                       object 
dtypes: float64(12), i

In [13]:
df_revs['cleaned_text'] = df_revs['cleaned_text'].astype(str)

In [50]:
#df_revs = df_revs.dropna(how='any',axis=0)

In [14]:
# We have to rename the key in the listings dataset because when pandas merge function looks to see if there are keys with matching names in each dataset and was throwing errors when I tried to match on two different named keys even though they were the same values
# Obviously this isn't the same as with raw SQL joins
listings_df = listings_df.rename(columns={'id': 'listing_id'})

In [15]:
df_revs = pd.merge(listings_df,cleaned_revs_df,how='left',on=['listing_id','listing_id'])

In [16]:
df_revs.isnull().sum() / len(df_revs) * 100


host_id                        0.000000
listing_id                     0.000000
review_scores_rating           0.461560
review_scores_accuracy         0.472841
review_scores_cleanliness      0.472673
review_scores_checkin          0.472990
review_scores_communication    0.472747
review_scores_location         0.473102
review_scores_value            0.473111
Unnamed: 0                     0.461560
temp_index                     0.461560
id                             0.461560
date                           0.461560
reviewer_id                    0.461560
reviewer_name                  0.461709
comments                       0.487153
cleaned_text                   0.909656
tokens                         0.461560
dtype: float64

In [17]:
df_revs = df_revs.dropna(how='any',axis=0)
#df_revs = df_revs.replace(np.nan, '')

In [18]:
df_revs

Unnamed: 0.1,host_id,listing_id,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,Unnamed: 0,temp_index,id,date,reviewer_id,reviewer_name,comments,cleaned_text,tokens
0,12172,6422,4.95,4.94,4.96,4.97,4.96,4.92,4.98,0.0,0.0,1.927000e+03,2009-04-30,14100.0,Melissa,I can't say enough about how wonderful it was ...,i cant say enough about how wonderful it was t...,"['not', 'wonderful', 'stay', 'highlight', 'sta..."
1,12172,6422,4.95,4.94,4.96,4.97,4.96,4.92,4.98,1.0,1.0,3.867000e+03,2009-06-11,17413.0,Raquel,Michelle and Collier's home is wonderful! They...,michelle and colliers home is wonderful they a...,"['michelle', 'collier', 'home', 'wonderful', '..."
2,12172,6422,4.95,4.94,4.96,4.97,4.96,4.92,4.98,2.0,2.0,4.159000e+03,2009-06-17,20253.0,Ulrike,I spent one night at Michele's home and felt j...,i spent one night at micheles home and felt ju...,"['spend', 'night', 'micheles', 'home', 'feel',..."
3,12172,6422,4.95,4.94,4.96,4.97,4.96,4.92,4.98,3.0,3.0,5.724000e+03,2009-07-18,22544.0,Phil,Michele and Collier are two of the loveliest p...,michele and collier are two of the loveliest p...,"['michele', 'collier', 'lovely', 'people', 'pl..."
4,12172,6422,4.95,4.94,4.96,4.97,4.96,4.92,4.98,4.0,4.0,1.189100e+04,2009-09-29,33409.0,Claire,We had the most lovely time staying with Miche...,we had the most lovely time staying with miche...,"['lovely', 'time', 'stay', 'michele', 'colly',..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10717282,31058129,903053207440059523,5.00,5.00,5.00,5.00,5.00,5.00,5.00,9668038.0,1459398.0,9.059651e+17,2023-06-03,266498612.0,Paul,"Jordan is a great host, is super communicative...",jordan is a great host is super communicative ...,"['jordan', 'great', 'host', 'super', 'communic..."
10717283,31058129,903053207440059523,5.00,5.00,5.00,5.00,5.00,5.00,5.00,9668039.0,1459399.0,9.073184e+17,2023-06-05,754306.0,Katerine,I had an absolutely wonderful stay at Jordan’s...,i had an absolutely wonderful stay at jordans ...,"['absolutely', 'wonderful', 'stay', 'jordans',..."
10717321,134981485,904459169206385872,5.00,5.00,5.00,5.00,5.00,5.00,5.00,9668040.0,1459400.0,9.066321e+17,2023-06-04,233078624.0,Chien-Hao,A very smooth stay in this place. The host is ...,a very smooth stay in this place the host is v...,"['smooth', 'stay', 'place', 'host', 'responsiv..."
10717359,364080225,903836186965131941,5.00,5.00,5.00,5.00,5.00,5.00,5.00,9668036.0,1459396.0,9.066552e+17,2023-06-04,73264277.0,Catherine,Nice private room with private bathroom. Easy ...,nice private room with private bathroom easy t...,"['nice', 'private', 'room', 'private', 'bathro..."


In [119]:
import xgboost

#regressor = XGBRegressor(device='cuda',tree_method='hist')
#regressor = XGBRegressor(objective='reg:squarederror')
regressor = XGBRegressor()
beg_slice = 0
end_slice = 1000

In [120]:
start_time = time.time()

vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1,4))

X_train, X_test, y_train, y_test = train_test_split(df_revs['cleaned_text'][beg_slice:end_slice],df_revs['review_scores_rating'][beg_slice:end_slice],test_size=0.2,random_state=42)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

#regressor = RandomForestRegressor()

regressor.fit(X_train_tfidf, y_train)

y_pred = regressor.predict(X_test_tfidf)

mse = mean_squared_error(y_test,y_pred)

end_time = time.time()
print(end_time - start_time)

1.697803020477295


In [121]:
mse

8.088689251055205e-06

In [118]:
feature_names = vectorizer.get_feature_names_out()

features_df = pd.DataFrame({'Feature': feature_names,'Feature_Weights': regressor.feature_importances_})
features_df.sort_values(by='Feature_Weights',ascending=False)[:30]

Unnamed: 0,Feature,Feature_Weights
554625,bobby,0.023559
1798424,kate,0.017451
547390,blake,0.016789
547788,blakes,0.010945
2019894,mike,0.010062
1799017,kates,0.009679
1989276,mary,0.00795
957873,donald,0.007246
870726,daphne,0.007093
885182,debra,0.006685


In [130]:


xgb_model = XGBRegressor(device='cuda',tree_method='hist')

tfidf_vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1,4))

X = df_revs['cleaned_text']
y = df_revs['review_scores_rating']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=42)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

X_test_gpu = cp.array(X_test_tfidf.toarray())
X_val_gpu = cp.array(X_val_tfidf.toarray())
y_test_gpu = cp.array(y_test)
y_val_gpu = cp.array(y_val)


batch_size = 10000

num_batches = int(np.ceil(X_train_tfidf.shape[0] / batch_size))

start_idx = 0
end_idx = min(batch_size, X_train_tfidf.shape[0])

X_train_batch_gpu = cp.array(X_train_tfidf[start_idx:end_idx].toarray())
y_train_batch_gpu = cp.array(y_train[start_idx:end_idx])
xgb_model.fit(X_train_batch_gpu, y_train_batch_gpu,eval_set=[(X_val_gpu,y_val_gpu)],eval_metric='rmse',early_stopping_rounds=10,verbose=True)

print("Initial model fitting complete...")

for i in range(1,num_batches):
    print("Beginning to iterate over batches")
    start_idx = i * batch_size
    end_idx = min((i+1) * batch_size, X_train_tfidf.shape[0])

    X_train_batch_gpu = cp.array(X_train_tfidf[start_idx:end_idx].toarray())
    y_train_batch_gpu = cp.array(y_train[start_idx:end_idx])

    xgb_model.fit(X_train_batch_gpu,y_train_batch_gpu,xgb_model.get_booster().best_iteration,eval_set=[(X_val_gpu,y_val_gpu)],eval_metric='rmse',early_stopping_rounds=10,verbose=True)

    y_pred_gpu = xgb_model.predict(X_test_gpu)

    rmse = np.sqrt(mean_squared_error(y_test_gpu.get(), y_pred_gpu))
    print(f"Batch {i+1}/{num_batches} - RMSE: {rmse}")

y_pred_gpu = xgb_model.predict(X_test_gpu)
final_rmse = np.sqrt(mean_squared_error(y_test_gpu.get(), y_pred_gpu))

KeyboardInterrupt: 

In [None]:
xgb_model = XGBRegressor(device='cuda',tree_method='hist')

tfidf_vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1,4))

X = df_revs['cleaned_text']
y = df_revs['review_scores_rating']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=42)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

X_test_gpu = cp.array(X_test_tfidf.toarray())
X_val_gpu = cp.array(X_val_tfidf.toarray())
y_test_gpu = cp.array(y_test)
y_val_gpu = cp.array(y_val)


batch_size = 10000

num_batches = int(np.ceil(X_train_tfidf.shape[0] / batch_size))

start_idx = 0
end_idx = min(batch_size, X_train_tfidf.shape[0])

X_train_batch_gpu = cp.array(X_train_tfidf[start_idx:end_idx].toarray())
y_train_batch_gpu = cp.array(y_train[start_idx:end_idx])
xgb_model.fit(X_train_batch_gpu, y_train_batch_gpu,eval_set=[(X_val_gpu,y_val_gpu)],eval_metric='rmse',early_stopping_rounds=10,verbose=True)

print("Initial model fitting complete...")

for i in range(1,num_batches):
    print("Beginning to iterate over batches")
    start_idx = i * batch_size
    end_idx = min((i+1) * batch_size, X_train_tfidf.shape[0])

    X_train_batch_gpu = cp.array(X_train_tfidf[start_idx:end_idx].toarray())
    y_train_batch_gpu = cp.array(y_train[start_idx:end_idx])

    xgb_model.fit(X_train_batch_gpu,y_train_batch_gpu,xgb_model.get_booster().best_iteration,eval_set=[(X_val_gpu,y_val_gpu)],eval_metric='rmse',early_stopping_rounds=10,verbose=True)

    y_pred_gpu = xgb_model.predict(X_test_gpu)

    rmse = np.sqrt(mean_squared_error(y_test_gpu.get(), y_pred_gpu))
    print(f"Batch {i+1}/{num_batches} - RMSE: {rmse}")

y_pred_gpu = xgb_model.predict(X_test_gpu)
final_rmse = np.sqrt(mean_squared_error(y_test_gpu.get(), y_pred_gpu))

In [9]:
def read_data_in_batches(file_path,read_batch_size,file_type):
    if file_type == 'csv':
        reader = pd.read_csv(file_path,chunksize=read_batch_size)
        for batch_df in reader:
            yield batch_df
    if file_type == 'parquet':
        pd.read_parquet(file_path,chunksize=read_batch_size)
        for batch_df in reader:
            yield batch_df
    else:
        "No file type specified. Please specify either csv or parquet"


In [14]:
parquet_file_path = r"F:\Data Science\Datasets\AirBnB Reviews Regression Inputs\combined_processed_reviews.parquet"
csv_file_path = r"F:\Data Science\Datasets\AirBnB Reviews Regression Inputs\combined_processed_reviews.csv"

file_type = 'csv'
if file_type == 'parquet':
    file_path = r"F:\Data Science\Datasets\AirBnB Reviews Regression Inputs\combined_processed_reviews.parquet"
if file_type == 'csv':
    file_path = r"F:\Data Science\Datasets\AirBnB Reviews Regression Inputs\combined_processed_reviews.csv"

read_batch_size = 5000
batch_size = 1000

xgb_model = XGBRegressor(device='cuda',tree_method='hist')

tfidf_vectorizer = TfidfVectorizer(analyzer='word',ngram_range=(1,4))


for b, batch_df in enumerate(read_data_in_batches(file_path=file_path, read_batch_size=read_batch_size, file_type=file_type)):
    X = batch_df['cleaned_text']
    y = batch_df['review_scores_rating']

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=42)

    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    X_val_tfidf = tfidf_vectorizer.transform(X_val)

    X_test_gpu = cp.array(X_test_tfidf.toarray())
    X_val_gpu = cp.array(X_val_tfidf.toarray())
    y_test_gpu = cp.array(y_test)
    y_val_gpu = cp.array(y_val)


    num_batches = int(np.ceil(X_train_tfidf.shape[0] / batch_size))

    start_idx = 0
    end_idx = min(batch_size, X_train_tfidf.shape[0])

    X_train_batch_gpu = cp.array(X_train_tfidf[start_idx:end_idx].toarray())
    y_train_batch_gpu = cp.array(y_train[start_idx:end_idx])

    if b == 0:
        xgb_model.fit(X_train_batch_gpu, y_train_batch_gpu,eval_set=[(X_val_gpu,y_val_gpu)],eval_metric='rmse',early_stopping_rounds=10,verbose=True)
    else:
        pass

    for i in range(1,num_batches):
        print("Beginning to iterate over batches")
        start_idx = i * batch_size
        end_idx = min((i+1) * batch_size, X_train_tfidf.shape[0])

        X_train_batch_gpu = cp.array(X_train_tfidf[start_idx:end_idx].toarray())
        y_train_batch_gpu = cp.array(y_train[start_idx:end_idx])

        xgb_model.fit(X_train_batch_gpu,y_train_batch_gpu,xgb_model.get_booster().best_iteration,eval_set=[(X_val_gpu,y_val_gpu)],eval_metric='rmse',early_stopping_rounds=10,verbose=True)

        y_pred_gpu = xgb_model.predict(X_test_gpu)

        rmse = np.sqrt(mean_squared_error(y_test_gpu.get(), y_pred_gpu))
        print(f"Batch {i+1}/{num_batches} - RMSE: {rmse}")

    print('MODEL EVAL FOR THIS BATCH')
    y_pred_gpu = xgb_model.predict(X_test_gpu)
    final_rmse = np.sqrt(mean_squared_error(y_test_gpu.get(), y_pred_gpu))
    print(final_rmse)

if file_type == 'csv':
    df_revs = pd.read_csv(file_path)
if file_type == 'parquet':
    df_revs = pd.read_parquet(file_path)

X = df_revs['cleaned_text']
y = df_revs['review_scores_rating']

X_test_tfidf = tfidf_vectorizer.transform(X)
X_test_gpu = cp.array(X_test_tfidf.toarray())
y_test_gpu = cp.array(y)

y_pred_gpu = xgb_model.predict(X_test_gpu)
final_rmse = np.sqrt(mean_squared_error(y_test_gpu.get(), y_pred_gpu))
print(f"Final RMSE: {final_rmse}")

OutOfMemoryError: Out of memory allocating 2,044,680,192 bytes (allocated so far: 14,960,867,840 bytes).