In [1]:
import pandas as pd
import numpy as np

from seaborn import load_dataset
from pydataset import data
from vega_datasets import data as vega

from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when, col, date_trunc

import pyspark; spark = pyspark.sql.SparkSession.builder.getOrCreate(); spark

In [2]:
pandas_df = pd.DataFrame(data=['HTML', 'SQL', 'JavaScript', 'Python',\
                               'C++', 'Java', 'C#', 'CSS'], columns=['language'])

In [3]:
df = spark.createDataFrame(pandas_df)

In [4]:
df.show()

+----------+
|  language|
+----------+
|      HTML|
|       SQL|
|JavaScript|
|    Python|
|       C++|
|      Java|
|        C#|
|       CSS|
+----------+



In [5]:
df.schema

StructType(List(StructField(language,StringType,true)))

In [6]:
df.count()

8

In [7]:
len(df.columns)

1

In [8]:
#2. load mpg to dataframe

In [9]:
df = spark.createDataFrame(data('mpg'))

In [10]:
df.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [11]:
df.filter(df.year > 2000).select(concat(lit('the '),'year',lit(' '),'manufacturer',lit(' '),'model',lit(' has a '),'cyl',lit(' cylinder engine.'))).show(5)

+------------------------------------------------------------------------------+
|concat(the , year,  , manufacturer,  , model,  has a , cyl,  cylinder engine.)|
+------------------------------------------------------------------------------+
|                                                          the 2008 audi a4 ...|
|                                                          the 2008 audi a4 ...|
|                                                          the 2008 audi a4 ...|
|                                                          the 2008 audi a4 ...|
|                                                          the 2008 audi a4 ...|
+------------------------------------------------------------------------------+
only showing top 5 rows



In [12]:
df.select('manufacturer', 'model', 'year', (when( df.trans.contains('auto'), 'auto').otherwise('manual')).alias('trans')).show(5)

+------------+-----+----+------+
|manufacturer|model|year| trans|
+------------+-----+----+------+
|        audi|   a4|1999|  auto|
|        audi|   a4|1999|manual|
|        audi|   a4|2008|manual|
|        audi|   a4|2008|  auto|
|        audi|   a4|1999|  auto|
+------------+-----+----+------+
only showing top 5 rows



In [13]:
df = spark.createDataFrame(data('tips'))

In [14]:
'{:.2f} % are smokers'.format((df.filter(df.smoker == "Yes").count() / df.count() * 100))


'38.11 % are smokers'

In [15]:
df = df.select('*',(df.tip / df.total_bill * 100).alias('tip_pct'))
df.show(1)

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|           tip_pct|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|5.9446733372572105|
+----------+----+------+------+---+------+----+------------------+
only showing top 1 row



In [16]:
df.groupby('sex','smoker').agg(mean('tip_pct')).show()

+------+------+------------------+
|   sex|smoker|      avg(tip_pct)|
+------+------+------------------+
|  Male|    No| 16.06687151291298|
|  Male|   Yes|15.277117520248513|
|Female|    No| 15.69209707691836|
|Female|   Yes| 18.21503526994103|
+------+------+------------------+



In [17]:
df = spark.createDataFrame(vega('seattle_weather'))

In [18]:
df = df.select('date', 'precipitation',
               ((df.temp_max * 1.8) + 32).alias('temp_max'),
               ((df.temp_min * 1.8) + 32).alias('temp_min'),
               'wind', 'weather')

In [20]:
df.groupby(date_trunc('month', col('date')).alias('month'))\
            .agg(mean('precipitation').alias('avg_pcp'))\
            .sort(col('avg_pcp').desc()).show(1)

+-------------------+-----------------+
|              month|          avg_pcp|
+-------------------+-----------------+
|2015-12-01 00:00:00|9.177419354838714|
+-------------------+-----------------+
only showing top 1 row



In [21]:
df.groupby(date_trunc('year', col('date')).alias('year'))\
            .agg(mean('wind').alias('avg_wind'))\
            .sort(col('avg_wind').desc()).show(1)

+-------------------+-----------------+
|               year|         avg_wind|
+-------------------+-----------------+
|2012-01-01 00:00:00|3.400819672131148|
+-------------------+-----------------+
only showing top 1 row



In [34]:
df.where(df.date.contains('-01-')).groupby('weather').count().show()

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|drizzle|   10|
|   rain|   35|
|    sun|   33|
|   snow|    8|
+-------+-----+



In [32]:
where = (df.weather == 'sun' ) & ((df.date > "2012") | (df.date < "2015"))
df.where(where).groupby('weather').mean('temp_max', 'temp_min').show()

+-------+----------------+------------------+
|weather|   avg(temp_max)|     avg(temp_min)|
+-------+----------------+------------------+
|    sun|66.8529411764706|48.695882352941176|
+-------+----------------+------------------+

