In [1]:
!pip install pyspark



In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql import functions as F
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import col, StringType
from pyspark.ml.feature import StringIndexer, VectorIndexer

In [4]:
# Create a Spark session
spark = SparkSession.builder.appName("rec").getOrCreate()


In [7]:
# Define schema for business_df
business_schema = StructType([
    StructField("business_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("address", StringType(), True),
    StructField("city", StringType(), True),
    StructField("state", StringType(), True),
    StructField("postal_code", StringType(), True),
    StructField("latitude", FloatType(), True),
    StructField("longitude", FloatType(), True),
    StructField("stars", FloatType(), True),
    StructField("review_count", IntegerType(), True),
    StructField("is_open", IntegerType(), True),
    StructField("attributes", StringType(), True),
    StructField("categories", StringType(), True),
    StructField("hours", StringType(), True)
])

# Load the Yelp datasets with explicit schema or schema inference hints
file_path = "/content/drive/MyDrive/yelp_business1.csv"
business_df = spark.read.csv(file_path, header=True, schema=business_schema)
business_df.show()

+--------------------+--------------------+--------------------+--------------+-----+-----------+---------+-----------+-----+------------+-------+----------+--------------------+-----+
|         business_id|                name|             address|          city|state|postal_code| latitude|  longitude|stars|review_count|is_open|attributes|          categories|hours|
+--------------------+--------------------+--------------------+--------------+-----+-----------+---------+-----------+-----+------------+-------+----------+--------------------+-----+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...| Santa Barbara|   CA|      93101|34.426678|  -119.7112|  5.0|           7|      0|      True|Doctors, Traditio...| NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|        Affton|   MO|      63123|38.551125|  -90.33569|  3.0|          15|      1|      NULL|Shipping Centers,...| NULL|
|tUFrWirKiKi_TAnsV...|              Target|5255 E Broadway Blvd|        Tuc

In [11]:
filtered_businesses = business_df.filter((F.col("review_count") >= 20) & (F.col("stars") >= 2))

In [12]:
filtered_businesses = filtered_businesses.withColumnRenamed("stars", "filtered_business_stars")

In [13]:
review_df = spark.read.csv("/content/drive/MyDrive/ProcessedCSV/final_review.csv", header=True, inferSchema=True)
columns_to_drop = review_df.columns
review_df = review_df.filter(F.col(columns_to_drop[0]) != review_df.first()[columns_to_drop[0]])
for col in columns_to_drop[1:]:
    review_df = review_df.filter(F.col(col) != review_df.first()[col])
review_df.show()

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------+-------------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|      date|               time|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------+-------------------+
|5Vy-AFz0Ek1FF6s1a...|tbI8YO0O51Mrm_yIi...|4QjOsH83KEcjen94P...|    4|     8|    5|   9|This review is fo...|2016-03-07|2024-11-03 22:17:05|
|HsthKXBHUDCDOUH1z...|rytAuNSEgYf_xqkKZ...|j-qtdD55OLfSqfsWu...|    4|     1|    1|   0|Somehow we got ca...|2015-08-31|2024-11-03 20:11:08|
|zg5iA1-3eoozePgAS...|Q8V8JWWQNrpvD2bXC...|9cybDNxJ7cyNVkmwm...|    5|     0|    1|   0|Huge fan of this ...|2013-05-12|2024-11-03 01:44:00|
|FYj6Tx_Ft4VvkGFgD...|2lFni8ituUYceQJPx...|Iw8uqNPxviwcgxtru...|    1|     7|    5|   3|Things you should...|2010-12-05|2024-11-03 09:18:11|
|vmaErJz1v5tr

In [16]:
import pyspark.sql.functions as F

filtered_reviews = review_df.filter((F.col("stars") >= 2) )

In [17]:
# Perform the join if both DataFrames contain data
if filtered_reviews.count() > 0 and filtered_businesses.count() > 0:
    joined_df = filtered_reviews.join(filtered_businesses, on="business_id", how="inner")
    joined_df.show(10)
else:
    print("One or both DataFrames are empty, cannot perform join.")

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------+-------------------+--------------------+--------------------+------------+-----+-----------+---------+-----------+-----------------------+------------+-------+----------+--------------------+-----+
|         business_id|           review_id|             user_id|stars|useful|funny|cool|                text|      date|               time|                name|             address|        city|state|postal_code| latitude|  longitude|filtered_business_stars|review_count|is_open|attributes|          categories|hours|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------+-------------------+--------------------+--------------------+------------+-----+-----------+---------+-----------+-----------------------+------------+-------+----------+--------------------+-----+
|--9osgUCSDUWUkoTL...|ItZuxtEiSOMDd0cRJ...|

In [18]:
filtered_reviews = filtered_reviews.withColumnRenamed("stars", "filtered_review_stars")

In [21]:
joined_df1 = filtered_reviews.join(broadcast(filtered_businesses), on="business_id", how="inner")

In [26]:
final_df = joined_df1.select(
    col("user_id").cast(StringType()),
    col("business_id").cast(StringType()),
    col("filtered_review_stars").cast(FloatType())  # Choose which column to use
)

In [29]:
# Create StringIndexer objects for user_id and business_id
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_indexed")
business_indexer = StringIndexer(inputCol="business_id", outputCol="business_id_indexed")

# Fit the indexers and transform the DataFrame
indexed_df = user_indexer.fit(final_df).transform(final_df)
indexed_df = business_indexer.fit(indexed_df).transform(indexed_df)

In [30]:
from pyspark.sql.functions import collect_list

# Extract the unique business IDs and their corresponding indexed values
business_id_mapping_df = indexed_df.select('business_id', 'business_id_indexed').distinct()

# Convert the DataFrame to a list of tuples
business_id_mapping_list = business_id_mapping_df.collect()

# Create a dictionary from the list of tuples
business_id_mapping_dict = {row['business_id']: row['business_id_indexed'] for row in business_id_mapping_list}

# Print the dictionary
print(business_id_mapping_dict)

{'NHb6QTrJOnyIj-Zd0fYzFg': 198.0, 'n5TNfoXg1i8wX5R3iY4WUA': 7514.0, 'PHqeN9MKNA_TfIUfmpBeRw': 9853.0, 'AOCwwM2f1X4-Ge4Ilv-X4g': 45963.0, '1rkmbnMjrYBM-hOckvnNFw': 15347.0, 'CUWituEeIX3xA0xU7X82Ow': 7109.0, 'XHxAIQR-QAz6j1RMaz_J1A': 19254.0, '6bpM6wCmdp6qZGTQbiR3-A': 30193.0, 'vI-2mPYOgfix5LcaqDOy_g': 11747.0, '7qL4Ech8XKYNvZb_JuP-Sw': 9572.0, 'lnEeMas32ddqNn06jOUGgQ': 42426.0, 'lPperLiGwJVEr9n8LfGZ5g': 13034.0, 'mmnneqm5rY6MNwF--uScYw': 34603.0, '2hIA-ubXzEoz4N5xQU1qHw': 17788.0, '_l2fthwLTqgLNM6budicQA': 33226.0, 'kyRUZEZThKotuabJQfuXYA': 3645.0, 'NnW-Ju9vAh4WRrWaNVPHBQ': 47936.0, 'IvAa7PvfOYl0xRML5W3ebg': 604.0, '8Vu3Iagcby29cZIijIk4Ng': 5409.0, 'VVMAvAVRpoVk6bd4avvYGw': 32668.0, 'vQQpyrhgqF5El4a_hNDdaA': 24142.0, 'JAaiz9eoJ3BSyQx-pVuX_Q': 15974.0, 'JJSxd_WYEP4qYaHPG_tQ7w': 15977.0, 'uCR5u51sWF2E9OByxo2Olw': 43518.0, 'gECyO9S9P3wQEgVZIjR49g': 33904.0, 'VU_IdfezeFylMEeBuJGzkQ': 49037.0, 'kokdzLxHKyxfNv5SEz0klw': 42316.0, 'bAAN8_3lDA0baaYQ2bbK1w': 41049.0, 'CNDFASo4BpgATitxqUfkLA': 377

In [47]:


def find_business_key(business_id_mapping_dict, value):
  """
  Finds the business key corresponding to a value in the dictionary.

  Args:
    business_id_mapping_dict: The dictionary mapping business IDs to indexed values.
    value: The value to search for.

  Returns:
    The business key (business_id) corresponding to the value, or None if not found.
  """
  for business_id, indexed_value in business_id_mapping_dict.items():
    if indexed_value == value:
      return business_id
  return None


# Example usage:
value_to_find = 2590.0  # Replace with the value to find
business_id = find_business_key(business_id_mapping_dict, value_to_find)
if business_id:
  print(f"The business ID for value {value_to_find} is: {business_id}")
else:
  print(f"No business ID found for value {value_to_find}") #

The business ID for value 2590.0 is: W57360g1e7xMGerm7wPW0Q


In [31]:
# writing the json

import json
# from google.colab import drive
# drive.mount('/content/drive')


# Specify the file path
file_path = "/content/drive/MyDrive/Models/businessmapping.json"

# Write the dictionary to the JSON file
with open(file_path, 'w') as f:
  json.dump(business_id_mapping_dict, f)

print(f"Dictionary written to {file_path}")

Dictionary written to /content/drive/MyDrive/Models/businessmapping.json


In [34]:
# loading the saved model

import pickle

# Load the saved model from the specified path
with open('/content/drive/MyDrive/Models/random_forest_model2.pkl', 'rb') as f:
  loaded_model = pickle.load(f)


In [38]:
import os
import pandas as pd

file_path = '/content/drive/MyDrive/ProcessedCSV/features_df.csv'

# Get a list of all part files in the directory
part_files = [os.path.join(file_path, f) for f in os.listdir(file_path) if f.endswith('.csv')]

# Concatenate all part files into a single DataFrame
df = pd.concat([pd.read_csv(f) for f in part_files], ignore_index=True)


print(df.head())  # Display the first few rows

              business_id                                     name  \
0  NQhyMw8SOU1HB-V9X52CTg  Champion Chevrolet - Service Department   
1  2y_CdkxEOJEJGyJApfCYpA      Rode's Fireside Restaurant & Tavern   
2  8PNKnlnJg6snf-HUgFaNIw                             U Dirty Dawg   
3  sk2lZI4zmuGAccd3DLCnBw                Boyd Hill Nature Preserve   
4  83IeQHroXEctmMpK19TJKw           The Richel D'Ambra Spa & Salon   

                   address              city state postal_code   latitude  \
0           800 Kietzke Ln              Reno    NV       89502  39.518223   
1            533 Kings Hwy      Woolwich Twp    NJ       08085  39.753252   
2      2435 S State Rd 135         Greenwood    IN       46143  39.579628   
3  1101 Country Club Way S  Saint Petersburg    FL       33705  27.725123   
4    10 Avenue of the Arts      Philadelphia    PA       19102  39.951298   

    longitude  business_stars  review_count  is_open  \
0 -119.787714             3.0            45        1   
1  -

In [48]:
feature_columns = ["business_stars", "review_count", "avg_sentiment", "avg_review_stars", "total_reviews", "total_checkin_count"]


def predict_success_for_business_ids(business_ids, model, features_df, feature_columns):
    # Filter the dataset for the provided business IDs
    input_data = features_df[features_df["business_id"].isin(business_ids)]
    X_input = input_data[feature_columns]

    # Predict success probability
    success_probabilities = model.predict(X_input)

    # Create a DataFrame to display results
    result_df = pd.DataFrame({
        "business_id": input_data["business_id"],
        "success_probability": success_probabilities*10000
    })

    return result_df

# Example usage
business_ids_to_predict = ['IZAMcBLQ365GsrgonoM79w']
success_probabilities = predict_success_for_business_ids(business_ids_to_predict, loaded_model, df, feature_columns)

print("Predicted Success Probabilities:")
print(success_probabilities)

Predicted Success Probabilities:
                  business_id  success_probability
1946   W57360g1e7xMGerm7wPW0Q           189.433630
9016   R46XVcmUzy8qeerHyZQtEg           127.708652
11495  IZAMcBLQ365GsrgonoM79w            34.489153
21795  MnjCw3oaM0Hq39bjONDU0Q           102.687166
32206  GAXuzDNbzfZbtZYxqrdHnw           134.529491


In [49]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Define feature columns
feature_columns = ["business_stars", "review_count", "avg_sentiment", "avg_review_stars", "total_reviews", "total_checkin_count"]

def predict_success_for_business_ids(business_ids, model, features_df, feature_columns):
    # Filter the dataset for the provided business IDs
    input_data = features_df[features_df["business_id"].isin(business_ids)]
    X_input = input_data[feature_columns]

    # Predict success values (not necessarily probabilities yet)
    success_values = model.predict(X_input).reshape(-1, 1)

    # Use Min-Max Scaler to transform values into a probability range (0-100%)
    scaler = MinMaxScaler(feature_range=(0, 100))
    success_probabilities = scaler.fit_transform(success_values).flatten()

    # Create a DataFrame to display results
    result_df = pd.DataFrame({
        "business_id": input_data["business_id"].values,
        "success_probability": success_probabilities
    })

    return result_df

# Example usage
business_ids_to_predict = ['MnjCw3oaM0Hq39bjONDU0Q','R46XVcmUzy8qeerHyZQtEg', 'GAXuzDNbzfZbtZYxqrdHnw', 'IZAMcBLQ365GsrgonoM79w', 'W57360g1e7xMGerm7wPW0Q']
success_probabilities = predict_success_for_business_ids(business_ids_to_predict, loaded_model, df, feature_columns)

print("Predicted Success Probabilities:")
print(success_probabilities)


Predicted Success Probabilities:
              business_id  success_probability
0  W57360g1e7xMGerm7wPW0Q           100.000000
1  R46XVcmUzy8qeerHyZQtEg            60.163164
2  IZAMcBLQ365GsrgonoM79w             0.000000
3  MnjCw3oaM0Hq39bjONDU0Q            44.014485
4  GAXuzDNbzfZbtZYxqrdHnw            64.565282
