In [52]:
import pyspark
import pyspark.sql.functions as F
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.stat import Correlation

#### Requêtes SQL 

In [53]:
# Print the installed PySpark version; the Spark connector package chosen in
# the configuration cell has to be compatible with it.
print(pyspark.__version__)

3.2.1


In [54]:
# Spark configuration for this notebook:
#   - spark.jars.packages downloads the MongoDB Spark connector at start-up;
#   - master "local" runs Spark inside a single local process;
#   - memory limits for the driver and the executors.
# Spark property names are dot-separated ("spark.executor.memory"); a key
# written with any other separator is not recognised as a Spark property.
conf = (
    pyspark.SparkConf()
    .set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .setMaster("local")
    .setAppName("My App")
    .setAll([
        ("spark.driver.memory", "40g"),
        ("spark.executor.memory", "50g"),
    ])
)

In [55]:
# Connect to Spark. getOrCreate() returns the already-running context when
# this cell is executed again, whereas calling SparkContext(conf=conf) a
# second time in the same kernel raises a ValueError.
# NOTE: if a context already exists, `conf` is not re-applied to it; restart
# the kernel after changing the configuration.
sc = SparkContext.getOrCreate(conf=conf)

In [56]:
# SQL entry point used by the following cells for `.read` and `.sql(...)`.
# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of
# SparkSession (already imported at the top) — consider migrating.
sqlC = SQLContext(sc)



In [57]:
# MongoDB connection-string prefix: host, port and database name followed by
# a trailing dot, so that a collection name can be appended directly
# (e.g. mongo_ip + "restaurants").
# NOTE(review): despite its name this is a URI prefix, not an IP address.
mongo_ip = "mongodb://localhost:27017/restaurantsdb."

In [58]:
# Load the "restaurants" collection from MongoDB into a Spark DataFrame
# through the MongoDB Spark connector data source.
restaurants = (
    sqlC.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", mongo_ip + "restaurants")
    .load()
)

DataFrame[_id: struct<oid:string>, address: struct<building:string,coord:array<double>,street:string,zipcode:string>, borough: string, cuisine: string, grades: array<struct<date:timestamp,grade:string,score:int>>, name: string, restaurant_id: string]

In [59]:
# Register the DataFrame as the temp view "restaurants" so it can be queried
# with Spark SQL.
# NOTE(review): later cells rebind the Python name `restaurants` to query
# results, so re-running this cell out of order would register a different
# frame under the view name — run it only right after the load cell.
restaurants.createOrReplaceTempView("restaurants")

In [60]:
# Select every column and row of the "restaurants" temp view via Spark SQL.
restaurants = sqlC.sql("SELECT * FROM restaurants")

In [61]:
# Print the first rows of the query result (show() prints at most 20 rows and
# truncates long cell values by default).
restaurants.show()

+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+-------------+
|                 _id|             address|      borough|             cuisine|              grades|                name|restaurant_id|
+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+-------------+
|{61f15a93e0cb0c13...|{1007, [-73.85607...|        Bronx|              Bakery|[{2014-03-03 01:0...|Morris Park Bake ...|     30075445|
|{61f15a93e0cb0c13...|{469, [-73.961704...|     Brooklyn|          Hamburgers|[{2014-12-30 01:0...|             Wendy'S|     30112340|
|{61f15a93e0cb0c13...|{351, [-73.985135...|    Manhattan|               Irish|[{2014-09-06 02:0...|Dj Reynolds Pub A...|     30191841|
|{61f15a93e0cb0c13...|{2780, [-73.98241...|     Brooklyn|           American |[{2014-06-10 02:0...|     Riviera Caterer|     40356018|
|{61f15a93e0cb0c13...|{97-22, [-73.8601...|       Queen

In [62]:
# Number of rows in the table.
# The result gets its own name so that the `restaurants` DataFrame is not
# overwritten by a one-row aggregate.
restaurant_count = sqlC.sql("SELECT COUNT(*) FROM restaurants")
restaurant_count.show()

+--------+
|count(1)|
+--------+
|    3772|
+--------+



In [63]:
# Number of restaurants per zip code.
# The result gets its own name so that the `restaurants` DataFrame is not
# overwritten by an aggregate.
restaurants_by_zipcode = sqlC.sql("SELECT address.zipcode, COUNT(restaurant_id) FROM restaurants GROUP BY address.zipcode")
restaurants_by_zipcode.show()

+-------+--------------------+
|zipcode|count(restaurant_id)|
+-------+--------------------+
|  11205|                  10|
|  11236|                  11|
|  10309|                  13|
|  11106|                  27|
|  11218|                  16|
|  10452|                  11|
|  11428|                   5|
|  11237|                   9|
|  11379|                  11|
|  11364|                  10|
|  11249|                   8|
|  10012|                  94|
|  11001|                   2|
|  11385|                  34|
|  11238|                  12|
|  10039|                   4|
|  11427|                   5|
|  11367|                  10|
|  10010|                  27|
|  10038|                  18|
+-------+--------------------+
only showing top 20 rows



In [64]:
# Number of restaurants per cuisine type.
# The result gets its own name so that the `restaurants` DataFrame is not
# overwritten by an aggregate.
restaurants_by_cuisine = sqlC.sql("SELECT cuisine, COUNT(restaurant_id) FROM restaurants GROUP BY cuisine")
restaurants_by_cuisine.show()

+----------------+--------------------+
|         cuisine|count(restaurant_id)|
+----------------+--------------------+
|Pancakes/Waffles|                   7|
|Chinese/Japanese|                   1|
|         Mexican|                  73|
|   Jewish/Kosher|                  60|
|          Bakery|                 127|
|         Turkish|                  11|
|        Armenian|                   1|
|         Hotdogs|                   4|
|       Ethiopian|                   3|
|            Thai|                  14|
|          Indian|                  43|
|         Chinese|                 115|
|      Indonesian|                   2|
|       Soul Food|                   6|
|     Continental|                   8|
|           Steak|                  21|
|         African|                   4|
|CafÃ©/Coffee/Tea|                   1|
|          Donuts|                  43|
|           Tapas|                   4|
+----------------+--------------------+
only showing top 20 rows



In [65]:
# The best-rated restaurant
# NOTE(review): the wildcard import below shadows Python builtins such as
# `sum`, `min` and `max`, and this cell does not use any name from it (the
# module is already available as `F`) — consider removing it once later
# cells have been checked.
from pyspark.sql.functions import *

# Add a `sum` column: the SQL higher-order function `aggregate` folds the
# array grades.score into a running total starting from 0.
# NOTE(review): this lists every restaurant with its total score; it does not
# sort or select a single "best" restaurant — confirm the intended ranking.
restaurants = sqlC.sql("select *,aggregate(grades.score,0,(x,y) -> x + y) as sum from restaurants")
restaurants.show()

+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+-------------+---+
|                 _id|             address|      borough|             cuisine|              grades|                name|restaurant_id|sum|
+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+-------------+---+
|{61f15a93e0cb0c13...|{1007, [-73.85607...|        Bronx|              Bakery|[{2014-03-03 01:0...|Morris Park Bake ...|     30075445| 41|
|{61f15a93e0cb0c13...|{469, [-73.961704...|     Brooklyn|          Hamburgers|[{2014-12-30 01:0...|             Wendy'S|     30112340| 55|
|{61f15a93e0cb0c13...|{351, [-73.985135...|    Manhattan|               Irish|[{2014-09-06 02:0...|Dj Reynolds Pub A...|     30191841| 37|
|{61f15a93e0cb0c13...|{2780, [-73.98241...|     Brooklyn|           American |[{2014-06-10 02:0...|     Riviera Caterer|     40356018| 36|
|{61f15a93e0cb0c13...|{97-2

### Linear Regression with PySpark

In [66]:
# Reload the raw "restaurants" collection from MongoDB under a separate name
# for the regression section.
restaurants_df = (
    sqlC.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", mongo_ip + "restaurants")
    .load()
)

In [67]:
# Preview the first rows of the freshly loaded frame.
restaurants_df.show()

+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+-------------+
|                 _id|             address|      borough|             cuisine|              grades|                name|restaurant_id|
+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+-------------+
|{61f15a93e0cb0c13...|{1007, [-73.85607...|        Bronx|              Bakery|[{2014-03-03 01:0...|Morris Park Bake ...|     30075445|
|{61f15a93e0cb0c13...|{469, [-73.961704...|     Brooklyn|          Hamburgers|[{2014-12-30 01:0...|             Wendy'S|     30112340|
|{61f15a93e0cb0c13...|{351, [-73.985135...|    Manhattan|               Irish|[{2014-09-06 02:0...|Dj Reynolds Pub A...|     30191841|
|{61f15a93e0cb0c13...|{2780, [-73.98241...|     Brooklyn|           American |[{2014-06-10 02:0...|     Riviera Caterer|     40356018|
|{61f15a93e0cb0c13...|{97-22, [-73.8601...|       Queen

In [68]:
# Print the column tree, including the nested `address` struct and the
# `grades` array of structs.
restaurants_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- building: string (nullable = true)
 |    |-- coord: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |    |-- street: string (nullable = true)
 |    |-- zipcode: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- cuisine: string (nullable = true)
 |-- grades: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- date: timestamp (nullable = true)
 |    |    |-- grade: string (nullable = true)
 |    |    |-- score: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- restaurant_id: string (nullable = true)



In [69]:
# Summary statistics (count, mean, stddev, min, max). Only the top-level
# string columns appear in the result; the nested `address`/`grades` columns
# are left out.
# `restaurant_id` is a string column, so its mean/stddev come from treating
# the identifiers as numbers and carry no real meaning.
restaurants_df.describe().show()

+-------+-------------+--------------------+--------------------+-------------------+
|summary|      borough|             cuisine|                name|      restaurant_id|
+-------+-------------+--------------------+--------------------+-------------------+
|  count|         3772|                3772|                3772|               3772|
|   mean|         null|                null|                null|4.056896984093319E7|
| stddev|         null|                null|                null| 343183.48925093777|
|    min|        Bronx|              Afghan|(Lewis Drug Store...|           30075445|
|    max|Staten Island|Vietnamese/Cambod...|      Zum Stammtisch|           40900694|
+-------+-------------+--------------------+--------------------+-------------------+



In [70]:
# 