In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Read the CSV files with error handling
try:
    # NOTE(review): quoting=3 is csv.QUOTE_NONE, which disables quote handling,
    # so commas inside quoted fields are treated as delimiters and rows that end
    # up with extra fields are silently dropped by on_bad_lines='skip'.
    # Verify this matches how final_reviews.csv was written.
    final_reviews = pd.read_csv("/content/final_reviews.csv", quoting=3, on_bad_lines='skip')
    business_final = pd.read_csv("/content/business_final.csv")
except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    print(f"Error reading CSV: {e}")
    # exit() does not stop a running notebook cell (the statements after it
    # keep executing); raising SystemExit halts both a cell and a plain script.
    raise SystemExit(1) from e

# Debug: show a few raw business_id values, then the distinct-id count, per table
for label, frame in (
    ("Sample business_id from final_reviews:", final_reviews),
    ("Sample business_id from business_final:", business_final),
):
    print(label, frame['business_id'].head().tolist())
for label, frame in (
    ("Unique business_id in final_reviews:", final_reviews),
    ("Unique business_id in business_final:", business_final),
):
    print(label, frame['business_id'].nunique())

# Normalise the join key in both tables: cast to str and trim surrounding whitespace
for frame in (final_reviews, business_final):
    frame['business_id'] = frame['business_id'].astype(str).str.strip()

# Keep only review rows whose id is present and is neither the stringified
# NaN ('nan') nor the known junk token "'strange'"
review_ids = final_reviews['business_id']
is_usable_id = review_ids.notna() & ~review_ids.isin(['nan', "'strange'"])
final_reviews = final_reviews[is_usable_id]

# Merge DataFrames: inner join keeps only reviews whose business_id exists in both tables
df = pd.merge(final_reviews, business_final, on='business_id', how="inner")
if df.empty:
    common_ids = set(final_reviews['business_id']) & set(business_final['business_id'])
    print("Error: Merged DataFrame is empty. No matching business_ids found.")
    print("Common business_ids:", common_ids)
    print("If set is empty, check data integrity in final_reviews.csv and business_final.csv.")
    # exit() does not stop a running notebook cell, so the later steps used to
    # run on the empty frame; raising SystemExit actually halts execution.
    raise SystemExit(1)

print("Merged DataFrame shape:", df.shape)
print(df.head())

# Compute super_score = stars_x + polarity * compound
required_cols = ['stars_x', 'polarity', 'compound']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    print("Error: Missing columns for super_score:", missing_cols)
    # exit() does not stop a running notebook cell; SystemExit does.
    raise SystemExit(1)

df['super_score'] = df['stars_x'] + (df['polarity'] * df['compound'])
print("super_score computed successfully.")

# Bucket super_score into three ordered demand classes.
# With include_lowest=True the intervals are [0, 3], (3, 4.5] and (4.5, 6];
# scores outside 0..6 (or NaN) get no category.
labels = ['Low', 'Medium', 'High']
bins = [0, 3, 4.5, 6]  # Adjust as needed
demand_category = pd.cut(
    df['super_score'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
df['demand_category'] = demand_category

# Step 1: Text preprocessing for CNN
# maxlen is bound before any validation so later steps never see an undefined
# name (it used to be assigned only inside the success branch).
maxlen = 50
if 'cleaned_text' not in df.columns or df['cleaned_text'].isna().all():
    print("Error: 'cleaned_text' column missing or all NaN.")
    # exit() does not stop a running notebook cell; SystemExit does.
    raise SystemExit(1)

# Missing reviews become empty strings; astype(str) guards against stray
# non-string cells, which the tokenizer cannot lowercase/split.
texts = df['cleaned_text'].fillna('').astype(str)
tokenizer = Tokenizer(num_words=5000)  # keep only the 5000 most frequent words
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X_text = pad_sequences(sequences, maxlen=maxlen)  # pad/truncate every review to maxlen ids
if X_text.size == 0:
    print("Error: X_text is empty after preprocessing. Check cleaned_text data.")
    raise SystemExit(1)

# Step 2: Build CNN for text feature extraction
def build_cnn_text(vocab_size=5000, embedding_dim=64, sequence_length=None):
    """Build a 1-D convolutional network mapping token-id sequences to 32 features.

    The input shape is declared with an explicit ``Input`` layer rather than the
    ``input_length`` argument of ``Embedding``, which recent Keras releases
    deprecate.

    Args:
        vocab_size: Size of the embedding vocabulary; should match the
            tokenizer's ``num_words``.
        embedding_dim: Width of each token embedding vector.
        sequence_length: Length of every padded input sequence. Defaults to the
            module-level ``maxlen`` so a zero-argument call keeps working.

    Returns:
        An uncompiled, untrained ``Sequential`` model. Its weights are randomly
        initialised, so the features it emits are random projections until the
        model is trained.
    """
    if sequence_length is None:
        sequence_length = maxlen
    return models.Sequential([
        layers.Input(shape=(sequence_length,)),
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        layers.Conv1D(128, 5, activation='relu'),
        layers.MaxPooling1D(2),
        layers.Conv1D(64, 5, activation='relu'),
        layers.GlobalMaxPooling1D(),  # collapse the time axis to one vector per review
        layers.Dense(32, activation='relu'),
    ])

# The CNN is never trained, so its output depends entirely on the random weight
# initialisation. Seed Python, NumPy and TensorFlow first so the extracted
# features (and therefore the reported accuracies) are the same on every run,
# in line with the random_state=42 used for the split and the random forest.
tf.keras.utils.set_random_seed(42)
cnn = build_cnn_text()
X_text_features = cnn.predict(X_text, batch_size=32)  # forward pass in batches of 32 reviews

# Step 3: Combine text features with numerical features
# super_score is deliberately NOT a feature: demand_category (the target) is
# produced by binning super_score, so feeding it to the classifiers leaks the
# label and makes the reported accuracy meaningless.
required_cols = ['latitude', 'longitude', 'review_count']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    print(f"Error: Missing columns: {missing_cols}")
    # exit() does not stop a running notebook cell; SystemExit does.
    raise SystemExit(1)

X_num = df[required_cols].to_numpy()
# One row per review: CNN text features followed by the numeric columns
X_combined = np.hstack((X_text_features, X_num))

# Step 4: Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
y = df['demand_category']
split_parts = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Step 5: Fit each classifier on the training split and score it on the held-out split
classifiers = {
    "SVM": SVC(kernel='rbf', probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "RF": RandomForestClassifier(n_estimators=100, random_state=42),
}

# results maps classifier name -> its test-set predictions and accuracy
results = {}
for name in classifiers:
    clf = classifiers[name]
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {"predictions": y_pred, "accuracy": accuracy}

# Step 6: Generate business insights
def generate_business_insights(preds, classifier_name, accuracy, test_data):
    """Summarise one classifier's demand predictions as human-readable text.

    Args:
        preds: Predicted demand labels, one per held-out row, in the same order
            as ``test_data``.
        classifier_name: Display name of the classifier.
        accuracy: Test-set accuracy to quote in the summary.
        test_data: A pandas Series/DataFrame (or a bare ``pd.Index``) whose
            index holds the labels of the held-out rows in the module-level
            ``df``, e.g. the ``y_test`` returned by ``train_test_split``.

    Returns:
        A tuple ``(insights, recommendation, top_high)`` where ``top_high`` is a
        list of up to five distinct business_ids predicted as ``'High'``, in
        order of first appearance.

    Raises:
        TypeError: If ``test_data`` carries no pandas index (for example a
            NumPy feature matrix), because the rows can then not be mapped back
            to business_ids.
    """
    if isinstance(test_data, (pd.Series, pd.DataFrame)):
        row_labels = test_data.index
    elif isinstance(test_data, pd.Index):
        row_labels = test_data
    else:
        raise TypeError(
            "test_data must be a pandas Series, DataFrame or Index carrying the "
            f"row labels of the test samples (e.g. y_test), got {type(test_data).__name__}"
        )

    # The index holds labels, not positions, so look rows up with .loc
    business_ids = df.loc[row_labels, 'business_id']
    pred_df = pd.DataFrame({'business_id': business_ids.to_numpy(), 'prediction': preds})

    # Share of each predicted class, in percent
    demand_dist = pred_df['prediction'].value_counts(normalize=True) * 100
    high_pct = demand_dist.get('High', 0.0)
    low_pct = demand_dist.get('Low', 0.0)
    insights = f"Using {classifier_name} (Accuracy: {accuracy:.2f}): {high_pct:.2f}% of restaurants predicted as High demand."
    recommendation = (
        "Focus on promoting High demand restaurants to attract more customers. "
        f"Low demand ({low_pct:.2f}%) may need menu updates or marketing boosts."
    )
    # A business can appear once per review; de-duplicate so the list names
    # up to five different restaurants instead of repeating one.
    high_ids = pred_df.loc[pred_df['prediction'] == 'High', 'business_id']
    top_high = high_ids.drop_duplicates().head(5).tolist()
    return insights, recommendation, top_high

# Output results
for name, result in results.items():
    # Pass y_test, not X_test: X_test is a plain NumPy array with no .index,
    # whereas y_test is a pandas Series that still carries the row labels needed
    # to map each prediction back to its business_id.
    insights, rec, top_high = generate_business_insights(result["predictions"], name, result["accuracy"], y_test)
    print(f"\nClassifier: {name}")
    print("Insights:", insights)
    print("Recommendation:", rec)
    print("Top 5 High Demand Restaurants:", top_high)

Sample business_id from final_reviews: [" 'strange'", nan, nan, nan, nan]
Sample business_id from business_final: ['MTSW4McQd7CbVtyjqoe9mw', 'MUTTqe8uqyMdBl186RmNeA', 'ROeacJQwBeh05Rqg7F6TCg', 'aPNXGTDkf-4bjhyMBQxqpQ', 'ppFCk9aQkM338Rgwpl2F5A']
Unique business_id in final_reviews: 3878
Unique business_id in business_final: 3912
Error: Merged DataFrame is empty. No matching business_ids found.
Common business_ids: set()
If set is empty, check data integrity in final_reviews.csv and business_final.csv.
super_score computed successfully.
Error: 'cleaned_text' column missing or all NaN.


NameError: name 'maxlen' is not defined