In [43]:
import pandas as pd
import seaborn as sns

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, count, expr, when

In [44]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('sparksql')
    .getOrCreate()
)

# Load seaborn's 'tips' dataset and hand it to Spark as a DataFrame.
df = spark.createDataFrame(sns.load_dataset('tips'))

# Preview the first five rows.
df.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [45]:
# Register the DataFrame as the temporary view 'tips_table' so spark.sql() queries can reference it.
df.createOrReplaceTempView(name='tips_table')

In [47]:
# Plain DataFrame-API filter: keep the rows whose tip is greater than 2
# and print the first three rows (show(3) limits rows, not columns).
df.filter(col('tip') > 2).show(3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [48]:
# The same filter, written as a SQL query against the 'tips_table' view.
(
    spark
    .sql('SELECT * FROM tips_table WHERE tip > 2;')
    .show(n=3)
)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [49]:
# Group-by with the DataFrame API: average total bill and average tip per day.
df.groupBy('day').agg(
    avg('total_bill').alias('avg_total_bill'),
    avg('tip').alias('avg_tip'),
).show()

+----+------------------+------------------+
| day|    avg_total_bill|           avg_tip|
+----+------------------+------------------+
| Sun|21.409999999999997|3.2551315789473683|
| Sat|20.441379310344825| 2.993103448275862|
|Thur|17.682741935483868|2.7714516129032254|
| Fri| 17.15157894736842| 2.734736842105263|
+----+------------------+------------------+



In [None]:
# The same per-day averages, computed with SQL.
(
    spark
    .sql('''
          SELECT day, 
          avg(total_bill) as avg_total_bill,
          avg(tip) as avg_tip
          FROM tips_table
          group by day;
          ''')
    .show()
)

+----+------------------+------------------+
| day|    avg_total_bill|           avg_tip|
+----+------------------+------------------+
| Sun|21.409999999999997|3.2551315789473683|
| Sat|20.441379310344825| 2.993103448275862|
|Thur|17.682741935483868|2.7714516129032254|
| Fri| 17.15157894736842| 2.734736842105263|
+----+------------------+------------------+



In [51]:
# Five largest bills: sort by total_bill in descending order and keep the first five rows.
(
    df
    .orderBy(col('total_bill').desc())
    .limit(5)
    .show()
)

+----------+----+----+------+---+------+----+
|total_bill| tip| sex|smoker|day|  time|size|
+----------+----+----+------+---+------+----+
|     50.81|10.0|Male|   Yes|Sat|Dinner|   3|
|     48.33| 9.0|Male|    No|Sat|Dinner|   4|
|     48.27|6.73|Male|    No|Sat|Dinner|   4|
|     48.17| 5.0|Male|    No|Sun|Dinner|   6|
|     45.35| 3.5|Male|   Yes|Sun|Dinner|   3|
+----------+----+----+------+---+------+----+



In [52]:
# Five largest bills via SQL; only total_bill, tip and sex are projected.
(
    spark
    .sql('''
          SELECT total_bill, tip, sex
          FROM tips_table
          ORDER BY total_bill DESC
          LIMIT 5;
          ''')
    .show()
)

+----------+----+----+
|total_bill| tip| sex|
+----------+----+----+
|     50.81|10.0|Male|
|     48.33| 9.0|Male|
|     48.27|6.73|Male|
|     48.17| 5.0|Male|
|     45.35| 3.5|Male|
+----------+----+----+



In [54]:
# Add a new column: the tip divided by the total bill, written as a SQL expression string.
df.withColumn('tip_ratio', expr('tip / total_bill')).show(4)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|          tip_ratio|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
+----------+----+------+------+---+------+----+-------------------+
only showing top 4 rows



In [55]:
# Same derived column in SQL: every original column plus tip / total_bill as tip_ratio.
(
    spark
    .sql('''
          SELECT *,
          tip / total_bill as tip_ratio
          FROM tips_table;
          ''')
    .show(n=4)
)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|          tip_ratio|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
+----------+----+------+------+---+------+----+-------------------+
only showing top 4 rows



In [56]:
# Instead of calling show() right away, the result of spark.sql() can be
# assigned to a variable and used as a DataFrame.
df_tips_ratio = spark.sql('''
          SELECT *,
          tip / total_bill as tip_ratio
          FROM tips_table;
          ''')

# Peek at two rows of the result.
df_tips_ratio.show(n=2)

# Print the column names and types, including the new tip_ratio column.
df_tips_ratio.printSchema()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|          tip_ratio|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
+----------+----+------+------+---+------+----+-------------------+
only showing top 2 rows

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)
 |-- tip_ratio: double (nullable = true)



In [61]:
# Label every row by tip size: 'alta' (high) when tip > 3, otherwise 'baja' (low).
df.withColumn(
    'tip_category',
    when(col('tip') > 3, 'alta').otherwise('baja'),
).show(4)

+----------+----+------+------+---+------+----+------------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_category|
+----------+----+------+------+---+------+----+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        baja|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        baja|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        alta|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        alta|
+----------+----+------+------+---+------+----+------------+
only showing top 4 rows



In [62]:
# Same labelling in SQL, using CASE WHEN ... ELSE ... END.
(
    spark
    .sql('''
          SELECT *,
          CASE WHEN tip > 3 THEN 'alta' ELSE 'baja' END as tip_category 
          FROM tips_table;
          ''')
    .show(n=4)
)

+----------+----+------+------+---+------+----+------------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_category|
+----------+----+------+------+---+------+----+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        baja|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        baja|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        alta|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        alta|
+----------+----+------+------+---+------+----+------------+
only showing top 4 rows



In [63]:
# Group by two columns: number of rows for each (day, time) combination.
df.groupBy('day', 'time').agg(count('*').alias('count_bookings')).show()

+----+------+--------------+
| day|  time|count_bookings|
+----+------+--------------+
| Sat|Dinner|            87|
| Sun|Dinner|            76|
|Thur| Lunch|            61|
| Fri|Dinner|            12|
| Fri| Lunch|             7|
|Thur|Dinner|             1|
+----+------+--------------+



In [65]:
# Same (day, time) row count in SQL. The alias is 'count_bookings' so the
# result column has the same name as in the DataFrame-API version.
spark.sql('''
          SELECT day, time, count(*) as count_bookings
          FROM tips_table
          GROUP BY day, time;
          ''').show()

+----+------+-------------+
| day|  time|count_booking|
+----+------+-------------+
| Sat|Dinner|           87|
| Sun|Dinner|           76|
|Thur| Lunch|           61|
| Fri|Dinner|           12|
| Fri| Lunch|            7|
|Thur|Dinner|            1|
+----+------+-------------+

