In [1]:
import pandas as pd
import numpy as np
import polars as pl
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import gzip
import shutil
import pathlib
import os
import sqlalchemy
import sqlite3
import spacy
import re
import spacy_cleaner
from spacy_cleaner import processing, Cleaner
import re
import tqdm
from tqdm.notebook import tqdm, trange
import ipywidgets as widgets
from ipywidgets import IntProgress, HTML, VBox
from IPython.display import display
import time
import timeit
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor


from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.functional import softmax

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sid_obj = SentimentIntensityAnalyzer()

%matplotlib inline
alt.data_transformers.disable_max_rows()


#spacy.prefer_gpu()
spacy.require_gpu(gpu_id=0)
nlp = spacy.load("en_core_web_sm")

# This loads a larger and more robust model. Use with caution though because it takes considerably longer to run
#nlp = spacy.load("en_core_web_trf")

#nlp = spacy.load('/path/to/en_core_web_sm')

--------------------------------------------------------------------------------

  CuPy may not function correctly because multiple CuPy packages are installed
  in your environment:

    cupy-cuda11x, cupy-cuda12x

  Follow these steps to resolve this issue:

    1. For all packages listed above, run the following command to remove all
       existing CuPy installations:

         $ pip uninstall <package_name>

      If you previously installed CuPy via conda, also run the following:

         $ conda uninstall cupy

    2. Install the appropriate CuPy package.
       Refer to the Installation Guide for detailed instructions.

         https://docs.cupy.dev/en/stable/install.html

--------------------------------------------------------------------------------



In [2]:
def print_files_in_directory(directory_path):
    with os.scandir(directory_path) as entries:
        for entry in entries:
            if entry.is_file():
                print(entry.name)

def get_sentiment_score(text):
    sentiment_scores = sid_obj.polarity_scores(text)
    return sentiment_scores

def clean_text(text):
    doc = nlp(text)

    cleaned_tokens = [token.lemma_.lower().strip() for token in doc if not token.is_punct and not token.is_space]
    cleaned_tokens = [token for token in cleaned_tokens if not nlp.vocab[token].is_stop]
    cleaned_text = ' '.join(cleaned_tokens)

    return cleaned_text

def remove_numbers(text):
    cleaner = Cleaner(
        nlp,
        processing.remove_number_token
    )
    return cleaner.clean(text)

def remove_numbers_regex(text):
    # Pattern to remove numbers from text data
    pattern = r"\d+"

    return re.sub(pattern,"",text)

In [3]:
conn = sqlite3.connect(r"C:/Users/asl4a/AirBnB_Data.db")
cursor = conn.cursor()

In [4]:
sql = """
SELECT
    r.id,
    r.temp_index,
    r.comments,
    l.id,
    l.review_scores_rating,
    l.review_scores_accuracy,
    l.review_scores_cleanliness,
    l.review_scores_checkin,
    l.review_scores_communication,
    l.review_scores_location,
    l.review_scores_value
FROM
    listings l
JOIN reviews r ON l.id = r.id;"""

df_revs = pd.read_sql(sql,con=conn)

In [5]:
df_revs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 843 entries, 0 to 842
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           843 non-null    int64  
 1   temp_index                   843 non-null    int64  
 2   comments                     843 non-null    object 
 3   id                           843 non-null    int64  
 4   review_scores_rating         745 non-null    float64
 5   review_scores_accuracy       739 non-null    float64
 6   review_scores_cleanliness    739 non-null    float64
 7   review_scores_checkin        739 non-null    float64
 8   review_scores_communication  739 non-null    float64
 9   review_scores_location       739 non-null    float64
 10  review_scores_value          739 non-null    float64
dtypes: float64(7), int64(3), object(1)
memory usage: 72.6+ KB


In [7]:
df_revs['cleaned_text'] = df_revs['comments'].apply(remove_numbers_regex)
df_revs['comments'] = df_revs['comments'].astype(str)

In [8]:
beg_slice = 0


total_iterations = len(df_revs['cleaned_text'][beg_slice:])
progress_bar = tqdm(total=total_iterations,desc='Processing text data...')

cleaned_text_list = []

for i in df_revs['cleaned_text'][beg_slice:]:
    cleaned_text_list.append(clean_text(i))
    progress_bar.update(1)

progress_bar.close()

Processing text data...:   0%|          | 0/843 [00:00<?, ?it/s]

In [9]:
df_revs['processed_text'] = pd.Series(cleaned_text_list)

In [12]:
df_revs = df_revs.dropna(how='any',axis=0)

In [14]:
df_revs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 739 entries, 0 to 842
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           739 non-null    int64  
 1   temp_index                   739 non-null    int64  
 2   comments                     739 non-null    object 
 3   id                           739 non-null    int64  
 4   review_scores_rating         739 non-null    float64
 5   review_scores_accuracy       739 non-null    float64
 6   review_scores_cleanliness    739 non-null    float64
 7   review_scores_checkin        739 non-null    float64
 8   review_scores_communication  739 non-null    float64
 9   review_scores_location       739 non-null    float64
 10  review_scores_value          739 non-null    float64
 11  cleaned_text                 739 non-null    object 
 12  processed_text               739 non-null    object 
dtypes: float64(7), int64(3), 

In [18]:
from sklearn.metrics import mean_squared_error

vectorizer = TfidfVectorizer()

X_train, X_test, y_train, y_test = train_test_split(df_revs['processed_text'],df_revs['review_scores_rating'],test_size=0.2,random_state=42)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

regressor = RandomForestRegressor()

regressor.fit(X_train_tfidf, y_train)

y_pred = regressor.predict(X_test_tfidf)

mse = mean_squared_error(y_test,y_pred)

In [19]:
mse

0.19417794675675684

In [27]:
regressor.feature_importances_

array([0.00017321, 0.        , 0.00088891, ..., 0.        , 0.        ,
       0.        ])

In [31]:
feature_names = vectorizer.get_feature_names_out()

features_df = pd.DataFrame({'Feature': feature_names,'Feature_Weights': regressor.feature_importances_})

In [36]:
features_df.sort_values(by='Feature_Weights',ascending=False)[:30]

Unnamed: 0,Feature,Feature_Weights
502,bustling,0.04111
3345,th,0.039701
3105,soak,0.03538
28,accord,0.032165
3229,street,0.024725
3202,staying,0.023151
2020,locality,0.019576
1841,joes,0.016946
1585,herald,0.016578
2404,open,0.016106
