<a href="https://colab.research.google.com/github/animesh-banik/DataScienceProject_Databriks/blob/Google_Colub/DataAnalysis_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# _______________________________Zomato Data Analysis Using Spark

In [5]:
from pyspark.sql.types import *
import pandas as pd
from pyspark.sql import SparkSession

In [7]:
spark = SparkSession.builder.appName("DataEngineeringExample").getOrCreate()

# Create DataFrame using Pandas first, then convert to Spark
df = pd.read_csv('/content/sample_data/Zomato-data.txt')

In [8]:
# _____________________________________Define schema for Spark DataFrame
schema = StructType([
        StructField("name", StringType(), True),
        StructField("online_order", StringType(), True),
        StructField("book_table", StringType(), True),
        StructField("rate", StringType(), True),
        StructField("votes", IntegerType(), True),
        StructField("approx_cost", IntegerType(), True),
        StructField("restaurant_type", StringType(), True)
    ])

print(f"Original Dataset Shape: {df.count()} rows, {len(df.columns)} columns")

Original Dataset Shape: name                           22
online_order                   22
book_table                     22
rate                           22
votes                          22
approx_cost(for two people)    22
listed_in(type)                22
dtype: int64 rows, 7 columns


In [9]:

# ____________________________________Convert to Spark DataFrame
df_spark = spark.createDataFrame(df, schema)
print(df_spark.show(10))

print(f"Original Dataset Shape: {df.count()} rows, {len(df.columns)} columns")

+--------------------+------------+----------+-----+-----+-----------+---------------+
|                name|online_order|book_table| rate|votes|approx_cost|restaurant_type|
+--------------------+------------+----------+-----+-----+-----------+---------------+
|               Jalsa|         Yes|       Yes|4.1/5|  775|        800|         Buffet|
|      Spice Elephant|         Yes|        No|4.1/5|  787|        800|         Buffet|
|     San Churro Cafe|         Yes|        No|3.8/5|  918|        800|         Buffet|
|Addhuri Udupi Bho...|          No|        No|3.7/5|   88|        300|         Buffet|
|       Grand Village|          No|        No|3.8/5|  166|        600|         Buffet|
|     Timepass Dinner|         Yes|        No|3.8/5|  286|        600|         Buffet|
|Rosewood Internat...|          No|        No|3.6/5|    8|        800|         Buffet|
|              Onesta|         Yes|       Yes|4.6/5| 2556|        600|          Cafes|
|      Penthouse Cafe|         Yes|        

In [12]:
# ____________________________________________Data preprocessing
from pyspark.sql.functions import regexp_extract, col, when
df_clean = (
    df_spark.withColumn("rate", col("rate").cast("string"))
      .withColumn("rating_numeric", regexp_extract(col("rate"), r"(\d+\.\d+)", 1).cast("double"))
      .withColumn("online_order_binary", when(col("online_order") == "Yes", 1).otherwise(0))
      .withColumn("book_table_binary", when(col("book_table") == "Yes", 1).otherwise(0))
      .withColumnRenamed("approx_cost(for two people)", "cost_for_two")
      .withColumnRenamed("listed_in(type)", "restaurant_type")
      .filter(col("rating_numeric").isNotNull())
      .dropDuplicates()
)
df_clean.select("name","online_order_binary",
                "book_table_binary","rating_numeric",
                "votes","approx_cost","restaurant_type").show(10)

+--------------------+-------------------+-----------------+--------------+-----+-----------+---------------+
|                name|online_order_binary|book_table_binary|rating_numeric|votes|approx_cost|restaurant_type|
+--------------------+-------------------+-----------------+--------------+-----+-----------+---------------+
|     Timepass Dinner|                  1|                0|           3.8|  286|        600|         Buffet|
|Rosewood Internat...|                  0|                0|           3.6|    8|        800|         Buffet|
|               Jalsa|                  1|                1|           4.1|  775|        800|         Buffet|
|     San Churro Cafe|                  1|                0|           3.8|  918|        800|         Buffet|
|Addhuri Udupi Bho...|                  0|                0|           3.7|   88|        300|         Buffet|
|      Penthouse Cafe|                  1|                0|           4.0|  324|        700|          other|
|       Gr

In [None]:
#_______________________________________SPARK SQL ANALYTICS______________________


    df_clean.createOrReplaceTempView("restaurants")