In [2]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession

In [3]:
# Relative path to the pizza-sales order-details CSV that is loaded further down.
pizza_dataset_path = "../datasets/pizza_sales/order_details.csv"

In [4]:
# Obtain a session for the app named 'Learning Spark'; getOrCreate() returns
# an already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName('Learning Spark')
    .getOrCreate()
)
spark  # bare expression so the notebook renders the session as the cell output

In [9]:
# Load the order-details CSV: the first row supplies the column names and
# Spark infers each column's type from the file contents.
data = spark.read.csv(
    pizza_dataset_path,
    header=True,
    inferSchema=True,
)

In [10]:
# Print each column's name, inferred type and nullability.
data.printSchema()

root
 |-- order_details_id: integer (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- pizza_id: string (nullable = true)
 |-- quantity: integer (nullable = true)



In [12]:
# Column names of the DataFrame, as a plain Python list.
data.columns

['order_details_id', 'order_id', 'pizza_id', 'quantity']

In [14]:
# Fetch the first 3 rows as a list of Row objects.
data.head(3)

[Row(order_details_id=1, order_id=1, pizza_id='hawaiian_m', quantity=1),
 Row(order_details_id=2, order_id=2, pizza_id='classic_dlx_m', quantity=1),
 Row(order_details_id=3, order_id=2, pizza_id='five_cheese_l', quantity=1)]

# How to select columns

In [20]:
# Project just these two columns (in this order) and print the first rows.
data.select('pizza_id', 'order_details_id').show()


+--------------+----------------+
|      pizza_id|order_details_id|
+--------------+----------------+
|    hawaiian_m|               1|
| classic_dlx_m|               2|
| five_cheese_l|               3|
|   ital_supr_l|               4|
|    mexicana_m|               5|
|    thai_ckn_l|               6|
|   ital_supr_m|               7|
|  prsc_argla_l|               8|
|   ital_supr_m|               9|
|   ital_supr_m|              10|
|     bbq_ckn_s|              11|
|   the_greek_s|              12|
|spinach_supr_s|              13|
|spinach_supr_s|              14|
| classic_dlx_s|              15|
|green_garden_s|              16|
| ital_cpcllo_l|              17|
|   ital_supr_l|              18|
|   ital_supr_s|              19|
|    mexicana_s|              20|
+--------------+----------------+
only showing top 20 rows



In [23]:
# (column name, type name) pairs for every column.
data.dtypes

[('order_details_id', 'int'),
 ('order_id', 'int'),
 ('pizza_id', 'string'),
 ('quantity', 'int')]

In [25]:
# Summary statistics per column: count, mean, stddev, min and max.
data.describe().show()

+-------+------------------+------------------+------------+------------------+
|summary|  order_details_id|          order_id|    pizza_id|          quantity|
+-------+------------------+------------------+------------+------------------+
|  count|             48620|             48620|       48620|             48620|
|   mean|           24310.5|10701.479761415056|        NULL|1.0196215549156726|
| stddev|14035.529380824935|6180.1197703776215|        NULL|0.1430770093247217|
|    min|                 1|                 1|   bbq_ckn_l|                 1|
|    max|             48620|             21350|veggie_veg_s|                 4|
+-------+------------------+------------------+------------+------------------+



### Adding columns to a DataFrame

In [49]:
# Build a new DataFrame with one extra column holding quantity + 2; `data`
# itself is left unchanged.
# NOTE(review): the column name is misspelled and contains a space; it is kept
# as-is because the drop cell below refers to it by this exact string.
temp_data = data.withColumn(
    'new colomn',
    data.quantity + 2,
)
temp_data.show()

+----------------+--------+--------------+--------+----------+
|order_details_id|order_id|      pizza_id|quantity|new colomn|
+----------------+--------+--------------+--------+----------+
|               1|       1|    hawaiian_m|       1|         3|
|               2|       2| classic_dlx_m|       1|         3|
|               3|       2| five_cheese_l|       1|         3|
|               4|       2|   ital_supr_l|       1|         3|
|               5|       2|    mexicana_m|       1|         3|
|               6|       2|    thai_ckn_l|       1|         3|
|               7|       3|   ital_supr_m|       1|         3|
|               8|       3|  prsc_argla_l|       1|         3|
|               9|       4|   ital_supr_m|       1|         3|
|              10|       5|   ital_supr_m|       1|         3|
|              11|       6|     bbq_ckn_s|       1|         3|
|              12|       6|   the_greek_s|       1|         3|
|              13|       7|spinach_supr_s|       1|    

### Dropping columns from a DataFrame

In [50]:
# Remove the demo column added above; drop() returns a new DataFrame, so the
# result is re-bound to temp_data.
temp_data = temp_data.drop('new colomn')
temp_data.show()

+----------------+--------+--------------+--------+
|order_details_id|order_id|      pizza_id|quantity|
+----------------+--------+--------------+--------+
|               1|       1|    hawaiian_m|       1|
|               2|       2| classic_dlx_m|       1|
|               3|       2| five_cheese_l|       1|
|               4|       2|   ital_supr_l|       1|
|               5|       2|    mexicana_m|       1|
|               6|       2|    thai_ckn_l|       1|
|               7|       3|   ital_supr_m|       1|
|               8|       3|  prsc_argla_l|       1|
|               9|       4|   ital_supr_m|       1|
|              10|       5|   ital_supr_m|       1|
|              11|       6|     bbq_ckn_s|       1|
|              12|       6|   the_greek_s|       1|
|              13|       7|spinach_supr_s|       1|
|              14|       8|spinach_supr_s|       1|
|              15|       9| classic_dlx_s|       1|
|              16|       9|green_garden_s|       1|
|           

### Renaming columns in a DataFrame

In [53]:
# Rename 'pizza_id' to 'new_pizza_id'; the other columns are untouched and the
# renamed DataFrame is re-bound to temp_data.
temp_data = temp_data.withColumnRenamed(
    existing='pizza_id',
    new='new_pizza_id',
)
temp_data.show()

+----------------+--------+--------------+--------+
|order_details_id|order_id|  new_pizza_id|quantity|
+----------------+--------+--------------+--------+
|               1|       1|    hawaiian_m|       1|
|               2|       2| classic_dlx_m|       1|
|               3|       2| five_cheese_l|       1|
|               4|       2|   ital_supr_l|       1|
|               5|       2|    mexicana_m|       1|
|               6|       2|    thai_ckn_l|       1|
|               7|       3|   ital_supr_m|       1|
|               8|       3|  prsc_argla_l|       1|
|               9|       4|   ital_supr_m|       1|
|              10|       5|   ital_supr_m|       1|
|              11|       6|     bbq_ckn_s|       1|
|              12|       6|   the_greek_s|       1|
|              13|       7|spinach_supr_s|       1|
|              14|       8|spinach_supr_s|       1|
|              15|       9| classic_dlx_s|       1|
|              16|       9|green_garden_s|       1|
|           