In [1]:
import requests
import pandas as pd
import time
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score


In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
headers = {
    "X-RapidAPI-Key": "5661a6fd14mshf84b2420bbee2b0p178ea5jsnf730fdf5e8ef",
    "X-RapidAPI-Host": "local-business-data.p.rapidapi.com"
}

params = {
    "query": "coffee shop",
    "lat": "40.730610",
    "lng": "-73.935242",
    "radius": "1000",
    "limit": "5",
    "language": "en",
    "zoom": "14"
}

url = "https://local-business-data.p.rapidapi.com/search-in-area"
response = requests.get(url, headers=headers, params=params)

if response.status_code == 200:
    data = response.json()
    location_df = pd.DataFrame(data.get("data", []))
else:
    print("Failed to fetch location data")


In [4]:
all_reviews_df = pd.DataFrame()

if not location_df.empty and 'business_id' in location_df.columns:
    for business_id in location_df['business_id']:
        r = requests.get(
            "https://local-business-data.p.rapidapi.com/business-reviews",
            headers=headers,
            params={"business_id": business_id}
        )
        if r.status_code == 200:
            temp = pd.DataFrame(r.json().get("data", []))
            all_reviews_df = pd.concat([all_reviews_df, temp], ignore_index=True)
        time.sleep(1)


  all_reviews_df = pd.concat([all_reviews_df, temp], ignore_index=True)
  all_reviews_df = pd.concat([all_reviews_df, temp], ignore_index=True)


In [5]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = text.lower()
        return ' '.join(word for word in text.split() if word not in stop_words)
    return ""

all_reviews_df['cleaned_review_text'] = all_reviews_df['review_text'].apply(clean_text)


In [10]:
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    tokens = nltk.word_tokenize(text, language='english')
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)

all_reviews_df['lemmatized_review_text'] = all_reviews_df['cleaned_review_text'].apply(lemmatize)

In [7]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
tfidf = TfidfVectorizer(max_features=1000)
X = tfidf.fit_transform(all_reviews_df['lemmatized_review_text'])


In [12]:
from io import StringIO
import json

tags_data = """name,review,tags
Coffee Bar,"Great coffee and quiet ambience","['good-coffee', 'quiet']"
Star Cafe,"Very affordable and dog-friendly","['affordable', 'pet-friendly']"
FastBrew,"Crowded but friendly staff","['crowded', 'friendly']"
"""

tags_df = pd.read_csv(StringIO(tags_data))
tags_df['tags'] = tags_df['tags'].apply(lambda x: json.loads(x.replace("'", '"')))

reviews_sample = all_reviews_df.head(tags_df.shape[0]).copy()
reviews_sample['tags'] = tags_df['tags']


In [13]:
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(reviews_sample['tags'])


In [15]:
X_demo = tfidf.transform(reviews_sample['lemmatized_review_text'])

model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))
model.fit(X_demo, y)
