In [102]:
import pandas as pd
import numpy as np

from seaborn import load_dataset
from pydataset import data
from vega_datasets import data as vega

from pyspark.sql import functions as fn
from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when

import pyspark

# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark

In [103]:
# Single-column pandas frame of ten language names.
pandas_df = pd.DataFrame({
    'language': [
        'Python', 'HTML', 'SQL', 'JavaScript', 'Virtual Basic',
        'C++', 'PHP', 'PERL', 'C#', 'CSS',
    ],
})

In [104]:
# Convert the pandas frame into a Spark DataFrame using the active session.
df = spark.createDataFrame(pandas_df)

In [105]:
# Print the DataFrame's rows as a text table.
df.show()

+-------------+
|     language|
+-------------+
|       Python|
|         HTML|
|          SQL|
|   JavaScript|
|Virtual Basic|
|          C++|
|          PHP|
|         PERL|
|           C#|
|          CSS|
+-------------+



In [106]:
# Inspect the schema: column names, types and nullability.
df.schema

StructType(List(StructField(language,StringType,true)))

In [107]:
# Number of rows in the DataFrame.
df.count()

10

In [108]:
# Number of columns in the DataFrame.
len(df.columns)

1

In [109]:
# 2. Load the mpg dataset into a Spark DataFrame

In [110]:
# Rebind df to the 'mpg' dataset loaded through pydataset's data().
df = spark.createDataFrame(data('mpg'))

In [111]:
# Preview the first five rows.
df.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [112]:
# Build one sentence per vehicle with year > 2000, e.g.
# "The 2008 audi a4 has a 4 cylinder engine."
# Uses the `concat` function imported by name in the imports cell, so this
# cell does not depend on any module alias being present in the kernel.
vehicle_sentence = concat(
    lit('The '), 'year',
    lit(' '), 'manufacturer',
    lit(' '), 'model',
    lit(' has a '), 'cyl',
    lit(' cylinder engine.'),
)
df.filter(df.year > 2000).select(vehicle_sentence).show(5)

+------------------------------------------------------------------------------+
|concat(The , year,  , manufacturer,  , model,  has a , cyl,  cylinder engine.)|
+------------------------------------------------------------------------------+
|                                                          The 2008 audi a4 ...|
|                                                          The 2008 audi a4 ...|
|                                                          The 2008 audi a4 ...|
|                                                          The 2008 audi a4 ...|
|                                                          The 2008 audi a4 ...|
+------------------------------------------------------------------------------+
only showing top 5 rows



In [113]:
# Collapse the detailed transmission code to 'auto' when it contains the
# substring 'auto', and to 'manual' otherwise.
trans_simple = (
    when(df.trans.contains('auto'), 'auto')
    .otherwise('manual')
    .alias('trans')
)
df.select('manufacturer', 'model', 'year', trans_simple).show(5)

+------------+-----+----+------+
|manufacturer|model|year| trans|
+------------+-----+----+------+
|        audi|   a4|1999|  auto|
|        audi|   a4|1999|manual|
|        audi|   a4|2008|manual|
|        audi|   a4|2008|  auto|
|        audi|   a4|1999|  auto|
+------------+-----+----+------+
only showing top 5 rows



In [114]:
# Rebind df to the 'tips' dataset loaded through pydataset's data().
df = spark.createDataFrame(data('tips'))

In [115]:
# Share of rows whose smoker flag is "Yes", formatted as a percentage
# with two decimals.
smoker_rows = df.filter(df.smoker == "Yes").count()
all_rows = df.select("smoker").count()
f'{smoker_rows / all_rows:.2%}'

'38.11%'

In [116]:
# Add the tip as a percentage of the total bill.
# withColumn appends the column on first run and replaces it on later runs,
# so re-executing this cell never produces a duplicate 'tip_pct' column
# (which `select('*', ...)` would).
df = df.withColumn('tip_pct', df.tip / df.total_bill * 100)
df.show(1)

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|           tip_pct|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|5.9446733372572105|
+----------+----+------+------+---+------+----+------------------+
only showing top 1 row



In [117]:
# Average tip percentage for each (smoker, sex) combination.
tip_pct_by_group = df.groupby('smoker', 'sex').agg(mean('tip_pct'))
tip_pct_by_group.show()

+------+------+------------------+
|smoker|   sex|      avg(tip_pct)|
+------+------+------------------+
|    No|Female| 15.69209707691836|
|    No|  Male| 16.06687151291298|
|   Yes|  Male|15.277117520248513|
|   Yes|Female| 18.21503526994103|
+------+------+------------------+



In [118]:
# Rebind df to the 'seattle_weather' dataset loaded through vega_datasets.
df = spark.createDataFrame(vega('seattle_weather'))

In [119]:
# Rescale both temperature columns with x * 1.8 + 32 (the Celsius-to-Fahrenheit
# formula; assumes the source values are Celsius - TODO confirm).
# NOTE: this rebinds df, so running the cell again without reloading the
# dataset applies the conversion a second time.
temp_max_scaled = ((df.temp_max * 1.8) + 32).alias('temp_max')
temp_min_scaled = ((df.temp_min * 1.8) + 32).alias('temp_min')
df = df.select(
    'date',
    'precipitation',
    temp_max_scaled,
    temp_min_scaled,
    'wind',
    'weather',
)

In [120]:
# Average precipitation per month (date truncated to the first of the month),
# sorted so the month with the highest average comes first.
month_start = fn.date_trunc('month', fn.col('date')).alias('month')
avg_precip_by_month = (
    df.groupby(month_start)
    .agg(mean('precipitation').alias('avg_pcp'))
    .sort(fn.col('avg_pcp').desc())
)
avg_precip_by_month.show(1)

+-------------------+-----------------+
|              month|          avg_pcp|
+-------------------+-----------------+
|2015-12-01 00:00:00|9.177419354838714|
+-------------------+-----------------+
only showing top 1 row



In [121]:
# Average wind per year (date truncated to January 1st of its year),
# sorted so the year with the highest average comes first.
year_start = fn.date_trunc('year', fn.col('date')).alias('year')
avg_wind_by_year = (
    df.groupby(year_start)
    .agg(mean('wind').alias('avg_wind'))
    .sort(fn.col('avg_wind').desc())
)
avg_wind_by_year.show(1)

+-------------------+-----------------+
|               year|         avg_wind|
+-------------------+-----------------+
|2012-01-01 00:00:00|3.400819672131148|
+-------------------+-----------------+
only showing top 1 row



In [122]:
# Count how often each weather type occurs in January, across all years.
# Filter on the month extracted from the date rather than pattern-matching
# the date's text form, which depends on an implicit cast to string.
is_january = fn.month(fn.col('date')) == 1
df.where(is_january).groupby('weather').count().show()

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|drizzle|   10|
|   rain|   35|
|    sun|   33|
|   snow|    8|
+-------+-----+



In [124]:
# Average max/min temperature on sunny days in June 2013 and June 2014.
# Year and month are taken from the date itself instead of pattern-matching
# its text form, which depends on an implicit cast to string.
in_june = fn.month(fn.col('date')) == 6
in_2013_or_2014 = fn.year(fn.col('date')).isin(2013, 2014)
mask = (df.weather == 'sun') & in_june & in_2013_or_2014
df.where(mask).groupby('weather').mean('temp_max', 'temp_min').show()

+-------+-----------------+-----------------+
|weather|    avg(temp_max)|    avg(temp_min)|
+-------+-----------------+-----------------+
|    sun|73.31918367346938|54.27959183673469|
+-------+-----------------+-----------------+

