# Yelp Review Geolocation - Naive Bayes Classifier

This serves as a baseline for the other models, ensuring that the data can be used to predict locations to some degree of success. Instead of using our own features like we did in class, we elected to use the Scikit-learn library and its associated functions. This way, these results could be built upon rather than focused on for a larger portion of the project.

## Libraries

In [156]:
# Installations (if needed)

# pip install pandas
# pip install scikit-learn

In [158]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [160]:
print("Pandas version: {}".format(pd.__version__))
print("Scikit-learn version: {}".format(sklearn.__version__))

Pandas version: 2.2.2
Scikit-learn version: 1.4.2


## Data

In [163]:
df_region_4 = pd.read_csv("balanced_reviews40000cased.csv")

# df_region_4.head

In [164]:
df_region_2 = pd.read_csv("balanced_reviews40000cased.csv")

# df_region_2.head

In [169]:
df_state = pd.read_csv("balancedRegionReviews.csv")

# df_state.head

## Implementation

### Region-based

#### 4 regions

**Hyperparameters:**
- test_size = 0.2
- random_state = 40
- stop_words = "english"
- max_features = 20000

In [175]:
# Encode region labels
df_region_4["label_id"] = df_region_4["region"].astype("category").cat.codes
label_names = df_region_4["region"].astype("category").cat.categories

# df_region_4.head

In [177]:
A_train, A_test, b_train, b_test = train_test_split(df_region_4["text"], df_region_4["label_id"], test_size=0.2, random_state=40)

# TF-IDF w/stop word removal and feature limit
vectorizer = TfidfVectorizer(stop_words="english", max_features=20000)
A_train_vec = vectorizer.fit_transform(A_train)
A_test_vec = vectorizer.transform(A_test)

# Train model
clf = MultinomialNB()
clf.fit(A_train_vec, b_train)

# Evaluate model
b_pred = clf.predict(A_test_vec)
print(classification_report(b_test, b_pred, target_names=label_names))

              precision    recall  f1-score   support

     Midwest       0.43      0.51      0.47      1963
   Northeast       0.53      0.48      0.51      2007
       South       0.54      0.47      0.50      2013
        West       0.49      0.51      0.50      2017

    accuracy                           0.49      8000
   macro avg       0.50      0.49      0.49      8000
weighted avg       0.50      0.49      0.49      8000



In [178]:
# Confusion matrix
confusion_matrix(b_test, b_pred)

array([[ 998,  292,  289,  384],
       [ 439,  973,  257,  338],
       [ 447,  269,  948,  349],
       [ 422,  309,  267, 1019]], dtype=int64)

#### 2 regions

Hyperparameters:

- test_size = 0.2
- random_state = 40
- stop_words = "english"
- max_features = 20000

In [183]:
# Encode region labels
df_region_2["label_id"] = df_region["merged_region"].astype("category").cat.codes
label_names = df_region["merged_region"].astype("category").cat.categories

# df_region_2.head

In [185]:
C_train, C_test, d_train, d_test = train_test_split(df_region_2["text"], df_region_2["label_id"], test_size=0.2, random_state=40)

# TF-IDF with stop word removal and feature limit
vectorizer = TfidfVectorizer(stop_words="english", max_features=20000)
C_train_vec = vectorizer.fit_transform(X_train)
C_test_vec = vectorizer.transform(X_test)

# Train Naive Bayes model
clf = MultinomialNB()
clf.fit(C_train_vec, d_train)

# Evaluate
d_pred = clf.predict(C_test_vec)
print(classification_report(d_test, d_pred, target_names=label_names))

              precision    recall  f1-score   support

        East       0.50      0.49      0.49      4020
        West       0.50      0.51      0.50      3980

    accuracy                           0.50      8000
   macro avg       0.50      0.50      0.50      8000
weighted avg       0.50      0.50      0.50      8000



In [209]:
# Confusion matrix
confusion_matrix(d_test, d_pred)

array([[1959, 2061],
       [1956, 2024]], dtype=int64)

### State-based

Hyperparameters:

- test_size = 0.2
- random_state = 40
- stop_words = "english"
- max_features = 20000

In [195]:
# Encode state labels
df_state["label_id"] = df_state["state"].astype("category").cat.codes
label_names = df_state["state"].astype("category").cat.categories

# df_state.head

In [203]:
X_train, X_test, y_train, y_test = train_test_split(df_state["text"], df_state["label_id"], test_size=0.2, random_state=40)

# TF-IDF with stop word removal and feature limit
vectorizer = TfidfVectorizer(stop_words="english", max_features=20000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Naive Bayes model
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

# Evaluate
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred, target_names=label_names, zero_division=0))

              precision    recall  f1-score   support

          AZ       0.88      0.02      0.04       670
          CA       1.00      0.04      0.07       508
          DE       0.00      0.00      0.00        66
          FL       0.72      0.06      0.10       932
          ID       0.00      0.00      0.00       222
          IL       0.00      0.00      0.00       100
          IN       0.80      0.08      0.14       923
          LA       0.90      0.12      0.22       573
          MO       0.76      0.10      0.18       940
          NJ       0.00      0.00      0.00       288
          NV       0.95      0.03      0.06       617
          PA       0.22      1.00      0.36      1653
          TN       1.00      0.01      0.02       508

    accuracy                           0.25      8000
   macro avg       0.56      0.11      0.09      8000
weighted avg       0.65      0.25      0.15      8000



In [201]:
# Confusion matrix
confusion_matrix(y_test, y_pred)

array([[  16,    0,    0,    4,    0,    0,    1,    0,    2,    0,    0,
         596,    0],
       [   2,   23,    0,    3,    0,    0,    0,    1,    2,    0,    0,
         499,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          82,    0],
       [   0,    0,    0,   63,    0,    0,    1,    3,    4,    0,    0,
         839,    0],
       [   0,    0,    0,    0,    0,    0,    3,    0,    0,    0,    0,
         217,    0],
       [   1,    0,    0,    1,    0,    0,    0,    0,    3,    0,    0,
          83,    0],
       [   1,    0,    0,    2,    0,    0,   63,    1,   11,    0,    0,
         909,    0],
       [   0,    0,    0,    0,    0,    0,    0,   72,    4,    0,    0,
         505,    0],
       [   0,    0,    0,    4,    0,    0,    2,    2,  103,    0,    0,
         861,    0],
       [   0,    0,    0,    1,    0,    0,    0,    0,    1,    0,    0,
         271,    0],
       [   1,    1,    0,    5,    0,    0,    0, 