In [1]:
from pyspark.sql import SparkSession
import pandas as pd

In [2]:
spark = (SparkSession.builder.appName("Day 8 question 2").master("local[1]").enableHiveSupport().getOrCreate())

# Data Preparation

<h6>Analysis yelp data of American Business Reviews</h6>

<b>Context.</b>

This dataset is a subset of Yelps businesses, reviews and user data. It was originally put together for the Yelp Dataset Challenge which is a chance for students to conduct research or analysis on Yelps data and share their discoveries. In the most recent dataset you will find information about businesses across 8 metropolitan areas in the USA and Canada.

<b>Content: </b>
This dataset contains five JSON files and the user agreement.

<b>Dataset: </b>
yelp_academic_dataset_business.json
<br>

In [3]:
spark.sql("CREATE DATABASE IF NOT EXISTS day8")
spark.sql("USE day8")

DataFrame[]

In [5]:
spark.conf.set("spark.sql.caseSensitive", "true")

In [6]:
spark.sql("CREATE TEMPORARY VIEW business USING json OPTIONS" + 
      " (path 'C:/Users/Lenovo/Documents/yelp_academic_dataset_business.json')")

DataFrame[]

In [7]:
spark.sql("SELECT distinct city from business").show()

+--------------------+
|                city|
+--------------------+
|               Ratho|
|               Tempe|
|        Fountain Hls|
|          Harrisburg|
|      Île des Soeurs|
|         Wilkinsburg|
|     North Las Vegas|
|         C Las Vegas|
|               Mesa |
|        Saint-Hubert|
|       Lawrenceville|
|             Phoenix|
|        West Mifflin|
|               Savoy|
|        Mckees Rocks|
| Pointe-Aux-Trembles|
|Bonnyrigg and Las...|
|     South Las Vegas|
|            Rosemère|
|           Pittsburg|
+--------------------+
only showing top 20 rows



### `1.	How many types of businesses are there?`

In [8]:
spark.sql("select count(distinct name) as Total_Business from business").show()

+--------------+
|Total_Business|
+--------------+
|         45694|
+--------------+



### `2.	What are the three cities with the largest number of businesses?`

In [9]:
spark.sql("""select city as City, count(distinct business_id) as Largest_No_Business
from business 
group by city
order by count(distinct business_id) desc
""").show(3, truncate=False)

+---------+-------------------+
|City     |Largest_No_Business|
+---------+-------------------+
|Las Vegas|13601              |
|Phoenix  |8410               |
|Charlotte|4224               |
+---------+-------------------+
only showing top 3 rows



### `3.	What are the types of businesses in each city?`

In [10]:
spark.sql("""select name as Business_Type, city as City
from business
group by name, city
order by city asc""").show(truncate=False)

+-------------------------------------+--------------+
|Business_Type                        |City          |
+-------------------------------------+--------------+
|Ahwatukee Foothills Storage          |1023 E Frye Rd|
|McDonald's                           |Ahwatukee     |
|Kathy's Alterations                  |Ahwatukee     |
|Hertz Rent A Car                     |Ahwatukee     |
|U-Haul                               |Ahwatukee     |
|Active Kids Pediatrics               |Ahwatukee     |
|Healthy and Clean Living Environments|Ahwatukee     |
|Hi-Health                            |Ahwatukee     |
|Cupz N' Crepes                       |Ahwatukee     |
|My Wine Cellar                       |Ahwatukee     |
|Desert Dog Pet Care                  |Ahwatukee     |
|West End Overlook                    |Allegheny     |
|Wild Rice                            |Allentown     |
|Quality Hotel                        |Anjou         |
|Zibo Anjou                           |Anjou         |
|Boston Pi

### `4.	What are the top three business types in each city?`

In [11]:
spark.sql("""select name as Business_Type, city as City, rank as Top_3
from (
select *, dense_rank() over (partition by city order by name desc) as rank 
from business)
where rank <= 3""").show(truncate=False)

+------------------------------------------------------+---------------+-----+
|Business_Type                                         |City           |Top_3|
+------------------------------------------------------+---------------+-----+
|Bridge Inn Ratho                                      |Ratho          |1    |
|World of Nails                                        |Fountain Hls   |1    |
|Second Look Books                                     |Harrisburg     |1    |
|Richie's Nypd                                         |Harrisburg     |2    |
|QuikTrip                                              |Harrisburg     |3    |
|uBreakiFix                                            |Tempe          |1    |
|lumenbrite - Adobe Authorized Training in Phoenix Area|Tempe          |2    |
|iSolution Repairs, LLC                                |Tempe          |3    |
|Strøm                                                 |Île des Soeurs |1    |
|Lowe's Home Improvement                            

### `5.	What are the three business categories with the most consumer comments?`

In [12]:
spark.sql("""select categories as Business_Categories, count(review_count) as Customer_Comments
from business
group by categories
order by count(review_count) desc
""").show(3, truncate=False)

+----------------------------------------------------+-----------------+
|Business_Categories                                 |Customer_Comments|
+----------------------------------------------------+-----------------+
|[Mexican, Restaurants]                              |1481             |
|[Pizza, Restaurants]                                |1068             |
|[Hotels & Travel, Event Planning & Services, Hotels]|982              |
+----------------------------------------------------+-----------------+
only showing top 3 rows



### `6.	What are the three most popular business categories in each city?`

In [18]:
pd.set_option('display.max_colwidth', 100)
spark.sql("""select categories as Top_3_Categories, city as City, rank as Rank
from (
select *, dense_rank() over (partition by city order by categories desc) as rank 
from business)
where rank <= 3""").toPandas()

Unnamed: 0,Top_3_Categories,City,Rank
0,"[Bars, Bed & Breakfast, Pubs, Nightlife, Hotels & Travel, Gastropubs, Restaurants]",Ratho,1
1,"[Beauty & Spas, Nail Salons]",Fountain Hls,1
2,"[Seafood, Southern, Chicken Wings, Restaurants]",Harrisburg,1
3,"[Seafood, Restaurants]",Harrisburg,2
4,"[Restaurants, Pizza, Italian]",Harrisburg,3
...,...,...,...
848,"[Landmarks & Historical Buildings, Public Services & Government]",Rankin,1
849,"[Banks & Credit Unions, Financial Services]",Rankin,2
850,"[Women's Clothing, Men's Clothing, Fashion, Shopping, Children's Clothing]",Homestead,1
851,"[Women's Clothing, Department Stores, Fashion, Shopping, Men's Clothing]",Homestead,2


### `7.	What is the total number of comments on each business type in each city?`

In [14]:
spark.sql("""select name as Business_Type, city as City, sum(review_count) as Total_Comment
from business
group by name, city
order by sum(review_count) desc""").toPandas()

Unnamed: 0,Business_Type,City,Total_Comment
0,Hash House A Go Go,Las Vegas,4796
1,Mon Ami Gabi,Las Vegas,4578
2,Earl of Sandwich,Las Vegas,4144
3,Wicked Spoon,Las Vegas,3828
4,The Buffet,Las Vegas,3328
...,...,...,...
52902,Patty's Ultimate Sports Massage & Deep Tissue Therapy,Tempe,3
52903,Aracruz International Granite,Phoenix,3
52904,Black Box,Edinburgh,3
52905,Shepherd Eye Center,Henderson,3


### `8.	What is the average score of each business type?`

In [15]:
spark.sql("""select name as Business_Type, avg(stars) as Average_Score
from business
group by name
order by avg(stars) desc""").toPandas()

Unnamed: 0,Business_Type,Average_Score
0,Bricks Studio,5.0
1,Maricopa Mountain Plumbing,5.0
2,Chiropractic Health Innovations DC,5.0
3,West USA Realty,5.0
4,BOX,5.0
...,...,...
45689,Maid 4 It All,1.0
45690,Ticketnetwork,1.0
45691,TruGreen,1.0
45692,Hy-Bar Windows & Doors,1.0


### `9.	What is the average score of each business type in each city?'`

In [16]:
spark.sql("""select name, city, avg(stars) as Average_Score
from business
group by name, city
order by avg(stars)""").toPandas()

Unnamed: 0,name,city,Average_Score
0,O48 Realty,Las Vegas,1.0
1,Master Mobile Mechanics,Las Vegas,1.0
2,Boba Deli,Tempe,1.0
3,Senzala,Montréal,1.0
4,Children's Medical Group Ltd,Peoria,1.0
...,...,...,...
52902,Sin City Kiss in Compendium,Las Vegas,5.0
52903,JesterZ Improv Comedy,Mesa,5.0
52904,Patty's Ultimate Sports Massage & Deep Tissue Therapy,Tempe,5.0
52905,Aracruz International Granite,Phoenix,5.0


### `10.	Which of the three business types with the highest average score in each city?`

In [17]:
spark.sql("""select name as Business_Type, city as City, round(avg(stars) over (partition by city),2) as Average_Score, rank as Rank
from (
select *, row_number() over (partition by city order by name desc) as rank 
from business)
where rank <= 3""").show(truncate=False)

+------------------------------------------------------+---------------+-------------+----+
|Business_Type                                         |City           |Average_Score|Rank|
+------------------------------------------------------+---------------+-------------+----+
|Bridge Inn Ratho                                      |Ratho          |3.5          |1   |
|World of Nails                                        |Fountain Hls   |3.5          |1   |
|Second Look Books                                     |Harrisburg     |4.33         |1   |
|Richie's Nypd                                         |Harrisburg     |4.33         |2   |
|QuikTrip                                              |Harrisburg     |4.33         |3   |
|uBreakiFix                                            |Tempe          |4.67         |1   |
|lumenbrite - Adobe Authorized Training in Phoenix Area|Tempe          |4.67         |2   |
|iSolution Repairs, LLC                                |Tempe          |4.67    