In [10]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T


# Start (or reuse) the Spark session for this notebook, setting
# spark.driver.memory to 8g.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists, in which case the memory setting may not be applied — confirm on a
# fresh kernel.
spark = (
    SparkSession.builder
    .appName("Recipes ML Model - Are you a dessert?")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Bare expression so the notebook displays the session.
spark

In [15]:
# Load the recipes CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
# NOTE(review): the schema printed below infers `rating` and `calories` as
# strings, which suggests some rows are mis-split — worth checking before
# modelling.
food = spark.read.csv(
    "epi_r.csv",
    header=True,
    inferSchema=True,
)

In [17]:
# Print the inferred column names and types.
food.printSchema()

root
 |-- title: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- calories: string (nullable = true)
 |-- protein: double (nullable = true)
 |-- fat: double (nullable = true)
 |-- sodium: double (nullable = true)
 |-- #cakeweek: double (nullable = true)
 |-- #wasteless: double (nullable = true)
 |-- 22-minute meals: double (nullable = true)
 |-- 3-ingredient recipes: double (nullable = true)
 |-- 30 days of groceries: double (nullable = true)
 |-- advance prep required: double (nullable = true)
 |-- alabama: double (nullable = true)
 |-- alaska: double (nullable = true)
 |-- alcoholic: double (nullable = true)
 |-- almond: double (nullable = true)
 |-- amaretto: double (nullable = true)
 |-- anchovy: double (nullable = true)
 |-- anise: double (nullable = true)
 |-- anniversary: double (nullable = true)
 |-- anthony bourdain: double (nullable = true)
 |-- aperitif: double (nullable = true)
 |-- appetizer: double (nullable = true)
 |-- apple: double (nullable = true)


In [18]:
# Number of rows loaded.
food.count()

20057

In [20]:
# Number of columns in the frame.
len(food.columns)

680

In [40]:
def sanitize_coloumn_name(name):
    """Return ``name`` reduced to letters, digits and underscores.

    Spaces, dashes and slashes each become an underscore and ``&`` becomes
    ``and``; any remaining character that is neither a letter, a digit nor
    an underscore is dropped.
    """
    # The replacement texts contain none of the replaced characters, so one
    # translation pass gives the same result as chained replacements.
    substitutions = str.maketrans({" ": "_", "-": "_", "/": "_", "&": "and"})
    translated = name.translate(substitutions)

    return "".join(
        filter(
            lambda char: char == "_" or char.isalpha() or char.isdigit(),
            translated,
        )
    )

# Rename every column to its sanitized form, keeping the column order.
food = food.toDF(*map(sanitize_coloumn_name, food.columns))



In [41]:
# Print the schema again to check the column names after sanitizing.
food.printSchema()

root
 |-- title: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- calories: string (nullable = true)
 |-- protein: double (nullable = true)
 |-- fat: double (nullable = true)
 |-- sodium: double (nullable = true)
 |-- cakeweek: double (nullable = true)
 |-- wasteless: double (nullable = true)
 |-- 22_minute_meals: double (nullable = true)
 |-- 3_ingredient_recipes: double (nullable = true)
 |-- 30_days_of_groceries: double (nullable = true)
 |-- advance_prep_required: double (nullable = true)
 |-- alabama: double (nullable = true)
 |-- alaska: double (nullable = true)
 |-- alcoholic: double (nullable = true)
 |-- almond: double (nullable = true)
 |-- amaretto: double (nullable = true)
 |-- anchovy: double (nullable = true)
 |-- anise: double (nullable = true)
 |-- anniversary: double (nullable = true)
 |-- anthony_bourdain: double (nullable = true)
 |-- aperitif: double (nullable = true)
 |-- appetizer: double (nullable = true)
 |-- apple: double (nullable = true)
 |

In [46]:
# Show summary statistics one column at a time: each iteration computes and
# prints a separate summary table for column `x`.
for x in food.columns:
    food.select(x).summary().show()

+-------+--------------------+
|summary|               title|
+-------+--------------------+
|  count|               20057|
|   mean|                NULL|
| stddev|                NULL|
|    min|                B...|
|    25%|                NULL|
|    50%|                NULL|
|    75%|                NULL|
|    max|Zuppa di Cavolo N...|
+-------+--------------------+

+-------+--------------------+
|summary|              rating|
+-------+--------------------+
|  count|               20052|
|   mean|   3.714460295291301|
| stddev|  1.3409187660508957|
|    min| Aged Balsamic Vi...|
|    25%|                3.75|
|    50%|               4.375|
|    75%|               4.375|
|    max|                 5.0|
+-------+--------------------+

+-------+------------------+
|summary|          calories|
+-------+------------------+
|  count|             15936|
|   mean|6323.2701063763025|
| stddev|359057.30637558916|
|    min|       and Lemon "|
|    25%|             198.0|
|    50%|             

In [54]:
# Check whether `turkey` yields exactly two distinct collected values,
# i.e. whether it looks like a binary column.
food.select(F.size(F.collect_set('turkey')) == 2).show()

+-------------------------------+
|(size(collect_set(turkey)) = 2)|
+-------------------------------+
|                           true|
+-------------------------------+



In [65]:
# One boolean per column, in a single aggregation: true when the column's
# collected set of values has exactly two elements.
binary = food.agg(
    *(
        (F.size(F.collect_set(column_name)) == 2).alias(column_name)
        for column_name in food.columns
    )
)

In [68]:
# Spot-check the flags for one text column and one indicator column.
binary.select('title','turkey').show()

+-----+------+
|title|turkey|
+-----+------+
|false|  true|
+-----+------+



In [71]:
# Bring the single-row result to the driver as a pandas DataFrame.
# NOTE(review): this rebinds `binary` from a Spark DataFrame to a pandas one,
# so re-running this cell on its own would presumably fail — confirm.
binary=binary.toPandas()

In [74]:
import pandas as pd
# Raise pandas' display row limit to 1000 so long results are not truncated.
pd.set_option("display.max_rows",1000)

In [75]:
# Reshape the single-row frame into a Series keyed by (column name, row index)
# so each column's flag prints on its own line.
binary.unstack()

title                     0    False
rating                    0    False
calories                  0    False
protein                   0    False
fat                       0    False
sodium                    0    False
cakeweek                  0    False
wasteless                 0    False
22_minute_meals           0     True
3_ingredient_recipes      0     True
30_days_of_groceries      0     True
advance_prep_required     0     True
alabama                   0     True
alaska                    0     True
alcoholic                 0     True
almond                    0     True
amaretto                  0     True
anchovy                   0     True
anise                     0     True
anniversary               0     True
anthony_bourdain          0     True
aperitif                  0     True
appetizer                 0     True
apple                     0     True
apple_juice               0     True
apricot                   0     True
arizona                   0     True
a