O’Reilly - JOSE

## Creating a DataFrame

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("create").getOrCreate()
# Create dataframe from tuples
df = spark.createDataFrame([(0,'a'),(1,'b')],['id','letter'])
print(type(df))
df.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+---+------+
| id|letter|
+---+------+
|  0|     a|
|  1|     b|
+---+------+



### Data Input

In [2]:
df = spark.read.json('people.json')
df.show()

+----+-------+------+
| age|   name|weight|
+----+-------+------+
|null|Michael|   100|
|  30|   Andy|    40|
|  19| Justin|  null|
|  16|    Ali|    50|
|  40| Rocker|    60|
+----+-------+------+



In [4]:
newdf = spark.read.csv('Name_Age.csv',inferSchema=True,header=True)
newdf

DataFrame[Name: string, Age: int]

In [5]:
newdf.show()

+------+---+
|  Name|Age|
+------+---+
|  John| 43|
| Cindy| 25|
|Lauren| 32|
+------+---+



### Data Output

In [7]:
df.write.csv("new_data_output.csv",header=True)
spark.read.csv("new_data_output.csv",header=True,inferSchema=True).show()

+----+-------+------+
| age|   name|weight|
+----+-------+------+
|null|Michael|   100|
|  30|   Andy|    40|
|  19| Justin|  null|
|  16|    Ali|    50|
|  40| Rocker|    60|
+----+-------+------+



**Statistical Properties**

In [8]:
print(type(df))
df.printSchema()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
 |-- weight: long (nullable = true)



In [9]:
df.columns

['age', 'name', 'weight']

In [12]:
df.describe().show()

+-------+------------------+------+------------------+
|summary|               age|  name|            weight|
+-------+------------------+------+------------------+
|  count|                 4|     5|                 4|
|   mean|             26.25|  null|              62.5|
| stddev|10.965856099730654|  null|26.299556396765833|
|    min|                16|   Ali|                40|
|    max|                40|Rocker|               100|
+-------+------------------+------+------------------+



**Selecting Columns and Rows**

In [13]:
type(df['age'])

pyspark.sql.column.Column

In [14]:
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
|  16|
|  40|
+----+



In [16]:
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [17]:
df.head(2)

[Row(age=None, name='Michael', weight=100),
 Row(age=30, name='Andy', weight=40)]

In [18]:
df.select(['age','name']).show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
|  16|    Ali|
|  40| Rocker|
+----+-------+



**Creating and Renaming Columns**

In [19]:
df.withColumn('newage',df['age']).show()

+----+-------+------+------+
| age|   name|weight|newage|
+----+-------+------+------+
|null|Michael|   100|  null|
|  30|   Andy|    40|    30|
|  19| Justin|  null|    19|
|  16|    Ali|    50|    16|
|  40| Rocker|    60|    40|
+----+-------+------+------+



In [21]:
df.withColumnRenamed('age','supernewage').show()
# Not assigining operations to dataframe

+-----------+-------+------+
|supernewage|   name|weight|
+-----------+-------+------+
|       null|Michael|   100|
|         30|   Andy|    40|
|         19| Justin|  null|
|         16|    Ali|    50|
|         40| Rocker|    60|
+-----------+-------+------+



In [22]:
df.withColumn('doubleage',df['age']*2).show()

+----+-------+------+---------+
| age|   name|weight|doubleage|
+----+-------+------+---------+
|null|Michael|   100|     null|
|  30|   Andy|    40|       60|
|  19| Justin|  null|       38|
|  16|    Ali|    50|       32|
|  40| Rocker|    60|       80|
+----+-------+------+---------+



In [23]:
newdf=df.withColumn('nweight',df['weight']+1)
newdf.show()

+----+-------+------+-------+
| age|   name|weight|nweight|
+----+-------+------+-------+
|null|Michael|   100|    101|
|  30|   Andy|    40|     41|
|  19| Justin|  null|   null|
|  16|    Ali|    50|     51|
|  40| Rocker|    60|     61|
+----+-------+------+-------+



**Using SQL with DataFrames**

In [24]:
# Register DataFrame as a temporary view
df.createOrReplaceTempView('people')

In [25]:
sql_results=spark.sql("select * from people")
sql_results

DataFrame[age: bigint, name: string, weight: bigint]

In [26]:
sql_results.show()

+----+-------+------+
| age|   name|weight|
+----+-------+------+
|null|Michael|   100|
|  30|   Andy|    40|
|  19| Justin|  null|
|  16|    Ali|    50|
|  40| Rocker|    60|
+----+-------+------+



In [30]:
spark.sql('select * from people where age=30').show()

+---+----+------+
|age|name|weight|
+---+----+------+
| 30|Andy|    40|
+---+----+------+



In [28]:
spark.sql('select * from people where age>20').show()

+---+------+------+
|age|  name|weight|
+---+------+------+
| 30|  Andy|    40|
| 40|Rocker|    60|
+---+------+------+



**Filtering the Data**

In [29]:
df=spark.read.csv('citigroup_stock_2007to2010.csv', inferSchema=True, header=True)
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [31]:
df.filter('Close<50').show(5)

+----------+------------------+---------+---------+---------+-------+------------------+
|      Date|              Open|     High|      Low|    Close| Volume|         Adj Close|
+----------+------------------+---------+---------+---------+-------+------------------+
|2007-03-02|50.920002000000004|51.209998|49.850001|49.970002|2617300|        447.823116|
|2007-03-05|         49.600001|50.550001|    49.18|49.250001|2491500|441.37058099999996|
|2007-03-13|         50.010001|    50.09|48.750001|48.750001|2928600|        436.889662|
|2007-03-14|         48.800001|49.320002|48.050001|    49.08|3044100|        439.847058|
|2007-03-16|         50.010001|    50.38|    49.19|    49.53|3081400|443.87988499999994|
+----------+------------------+---------+---------+---------+-------+------------------+
only showing top 5 rows



In [32]:
df.filter('Close<50').select('Close').show(5)

+---------+
|    Close|
+---------+
|49.970002|
|49.250001|
|48.750001|
|    49.08|
|    49.53|
+---------+
only showing top 5 rows



In [33]:
df.filter(df['Close']<50).select('Close').show(5)

+---------+
|    Close|
+---------+
|49.970002|
|49.250001|
|48.750001|
|    49.08|
|    49.53|
+---------+
only showing top 5 rows



In [34]:
df.filter((df['Close']<49) & (df['open']>45)).show(5)

+----------+---------+---------+---------+---------+-------+------------------+
|      Date|     Open|     High|      Low|    Close| Volume|         Adj Close|
+----------+---------+---------+---------+---------+-------+------------------+
|2007-03-13|50.010001|    50.09|48.750001|48.750001|2928600|        436.889662|
|2007-07-26|    48.49|    48.68|46.500001|47.810001|7731600|432.76932300000004|
|2007-07-27|47.710001|    48.19|46.710001|46.970002|5591700|425.16576799999996|
|2007-07-30|47.320002|47.650001|46.700001|    47.19|4024800|427.15716100000003|
|2007-07-31|47.570002|48.260001|46.500001|46.570002|4678800|        421.545025|
+----------+---------+---------+---------+---------+-------+------------------+
only showing top 5 rows



In [35]:
result=df.filter(df['Low']>25).collect()
print(type(result))
result[0]

<class 'list'>


Row(Date='2007-01-03', Open=55.659998, High=56.28, Low=54.720002, Close=55.250001, Volume=2282100, Adj Close=490.291686)

In [36]:
result[0][0]

'2007-01-03'

**Introduction to Date and Timestamps**

In [37]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('dates').getOrCreate()

In [38]:
df=spark.read.csv('citigroup_stock_2007to2010.csv', header=True, inferSchema=True)
df.show(5)

+----------+------------------+---------+------------------+------------------+-------+------------------+
|      Date|              Open|     High|               Low|             Close| Volume|         Adj Close|
+----------+------------------+---------+------------------+------------------+-------+------------------+
|2007-01-03|         55.659998|    56.28|         54.720002|         55.250001|2282100|        490.291686|
|2007-01-04|         55.250001|56.150001|         54.720002|         55.059998|1658600|488.60559299999994|
|2007-01-05|         55.000001|55.050001|         54.459998|54.770002000000005|1317800|        486.032149|
|2007-01-08|         54.600001|55.150001|         54.300001|         55.050001|1236900|488.51687400000003|
|2007-01-09|55.009997999999996|55.150001|54.190003000000004|         54.570002|1963000|        484.257338|
+----------+------------------+---------+------------------+------------------+-------+------------------+
only showing top 5 rows



In [39]:
df.head()

Row(Date='2007-01-03', Open=55.659998, High=56.28, Low=54.720002, Close=55.250001, Volume=2282100, Adj Close=490.291686)

In [42]:
from pyspark.sql.functions import dayofmonth, dayofyear, weekofyear, month, year
print(df.select('Date',dayofmonth(df['Date'])).show(5))
print(df.select('Date',dayofyear(df['Date'])).show(5))

+----------+----------------+
|      Date|dayofmonth(Date)|
+----------+----------------+
|2007-01-03|               3|
|2007-01-04|               4|
|2007-01-05|               5|
|2007-01-08|               8|
|2007-01-09|               9|
+----------+----------------+
only showing top 5 rows

None
+----------+---------------+
|      Date|dayofyear(Date)|
+----------+---------------+
|2007-01-03|              3|
|2007-01-04|              4|
|2007-01-05|              5|
|2007-01-08|              8|
|2007-01-09|              9|
+----------+---------------+
only showing top 5 rows

None


In [45]:
df.select('Date',year(df['Date'])).show(5),df.select(month(df['Date'])).show(5)

+----------+----------+
|      Date|year(Date)|
+----------+----------+
|2007-01-03|      2007|
|2007-01-04|      2007|
|2007-01-05|      2007|
|2007-01-08|      2007|
|2007-01-09|      2007|
+----------+----------+
only showing top 5 rows

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
+-----------+
only showing top 5 rows



(None, None)

**Working With Timestamps**

In [46]:
from pyspark.sql.functions import hour, minute, year, format_number

In [47]:
df.select('Date',hour(df['Date'])).show(5), df.select(minute(df['Date'])).show(5)

+----------+----------+
|      Date|hour(Date)|
+----------+----------+
|2007-01-03|         0|
|2007-01-04|         0|
|2007-01-05|         0|
|2007-01-08|         0|
|2007-01-09|         0|
+----------+----------+
only showing top 5 rows

+------------+
|minute(Date)|
+------------+
|           0|
|           0|
|           0|
|           0|
|           0|
+------------+
only showing top 5 rows



(None, None)

In [52]:
# AVerage Closing prices per Year
newdf=df.withColumn('Year', year(df['Date']))
newdf.show(5)

+----------+------------------+---------+------------------+------------------+-------+------------------+----+
|      Date|              Open|     High|               Low|             Close| Volume|         Adj Close|Year|
+----------+------------------+---------+------------------+------------------+-------+------------------+----+
|2007-01-03|         55.659998|    56.28|         54.720002|         55.250001|2282100|        490.291686|2007|
|2007-01-04|         55.250001|56.150001|         54.720002|         55.059998|1658600|488.60559299999994|2007|
|2007-01-05|         55.000001|55.050001|         54.459998|54.770002000000005|1317800|        486.032149|2007|
|2007-01-08|         54.600001|55.150001|         54.300001|         55.050001|1236900|488.51687400000003|2007|
|2007-01-09|55.009997999999996|55.150001|54.190003000000004|         54.570002|1963000|        484.257338|2007|
+----------+------------------+---------+------------------+------------------+-------+-----------------

In [53]:
newdf.groupBy('Year').mean()[['avg(Year)','avg(Close)']].show()

+---------+------------------+
|avg(Year)|        avg(Close)|
+---------+------------------+
|   2007.0|47.782032505976076|
|   2008.0| 19.04865647826085|
|   2009.0|3.6571825396825375|
+---------+------------------+



In [54]:
# Same Calcualations nicely formatted
result=newdf.groupBy('Year').mean()[['avg(Year)','avg(Close)']]
result=result.withColumnRenamed('avg(Year)','Year')
result=result.select('Year', format_number('avg(Close)',2).alias('Mean Close'))
result.show()

+------+----------+
|  Year|Mean Close|
+------+----------+
|2007.0|     47.78|
|2008.0|     19.05|
|2009.0|      3.66|
+------+----------+



**Introduction to Aggregate and GroupBy Concepts**

In [55]:
df = spark.read.csv('sales.csv',inferSchema=True,header=True)
print(df.printSchema())
df.show(5)

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)

None
+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
+-------+-------+-----+
only showing top 5 rows



In [56]:
df.groupBy('Company')

<pyspark.sql.group.GroupedData at 0x148c875c5c8>

In [57]:
df.groupBy('Company').mean().show(),df.groupBy('Company').mean()[['Company','avg(Sales)']].show()

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   GOOG|            220.0|
|   MSFT|322.3333333333333|
|     FB|            610.0|
|   APPL|            370.0|
+-------+-----------------+

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   GOOG|            220.0|
|   MSFT|322.3333333333333|
|     FB|            610.0|
|   APPL|            370.0|
+-------+-----------------+



(None, None)

In [58]:
by_company=df.groupBy('Company')
by_company.sum().show(),by_company.count().show()

+-------+----------+
|Company|sum(Sales)|
+-------+----------+
|   GOOG|     660.0|
|   MSFT|     967.0|
|     FB|    1220.0|
|   APPL|    1480.0|
+-------+----------+

+-------+-----+
|Company|count|
+-------+-----+
|   GOOG|    3|
|   MSFT|    3|
|     FB|    2|
|   APPL|    4|
+-------+-----+



(None, None)

**Spark Built in Aggregate Methods**

In [59]:
from pyspark.sql.functions import countDistinct, avg, stddev,format_number

In [60]:
df.show(3)

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
+-------+-------+-----+
only showing top 3 rows



In [61]:
df.select(countDistinct('Sales')).show()

+---------------------+
|count(DISTINCT Sales)|
+---------------------+
|                   11|
+---------------------+



In [62]:
df.select(countDistinct('Company')).show()

+-----------------------+
|count(DISTINCT Company)|
+-----------------------+
|                      4|
+-----------------------+



In [63]:
df.select(avg('Sales').alias('Average Sales')).show()

+-----------------+
|    Average Sales|
+-----------------+
|360.5833333333333|
+-----------------+



In [65]:
sales_std = df.select(stddev("Sales").alias('Sales Std'))
sales_std.show()

+------------------+
|         Sales Std|
+------------------+
|250.08742410799007|
+------------------+



In [66]:
sales_std.select(format_number('Sales Std',2).alias('Sales Std')).show()

+---------+
|Sales Std|
+---------+
|   250.09|
+---------+



**Sorting and Ordering**

In [67]:
df.orderBy('Sales').show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



In [68]:
df.orderBy(df['Sales'].desc()).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|     FB|   Carl|870.0|
|   APPL|   Mike|750.0|
|   MSFT|   Tina|600.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   GOOG|  Frank|340.0|
|   APPL|   John|250.0|
|   MSFT|Vanessa|243.0|
|   GOOG|    Sam|200.0|
|   APPL|  Linda|130.0|
|   MSFT|    Amy|124.0|
|   GOOG|Charlie|120.0|
+-------+-------+-----+



In [69]:
df.orderBy('Company').show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   APPL|  Chris|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   GOOG|  Frank|340.0|
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
+-------+-------+-----+



**Introduction to Missing_Data**

In [70]:
df = spark.read.csv("MissingData.csv",header=True,inferSchema=True)
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [71]:
df.na.drop().show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+



In [73]:
df.na.drop(thresh=2).show()
#Have Atleast two non-null values to be considered for drop

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [74]:
df.na.drop(subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [76]:
df.na.drop(how='any').show(),df.na.drop(how='all').show()
# any: drop rows containing any Null values
# all: drop rows if all values are null

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp4|Cindy|456.0|
+----+-----+-----+

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



(None, None)

In [78]:
df.na.fill('NEW VALUE').show(),df.na.fill(0).show()
# New values gets filled where data type is string
#  0 gets filled only for numeric nulls

+----+---------+-----+
|  Id|     Name|Sales|
+----+---------+-----+
|emp1|     John| null|
|emp2|NEW VALUE| null|
|emp3|NEW VALUE|345.0|
|emp4|    Cindy|456.0|
+----+---------+-----+

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



(None, None)

In [80]:
df.na.fill('No Name', subset=['Name']).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [81]:
from pyspark.sql.functions import mean
mean_val = df.select(mean(df['Sales'])).collect()
mean_val

[Row(avg(Sales)=400.5)]

In [82]:
mean_sales = mean_val[0][0]
mean_sales

400.5

In [83]:
df.na.fill(mean_sales,['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Practice

In [84]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("tesla").getOrCreate()
df = spark.read.csv('tesla_stock_2013_2017.csv',header=True,inferSchema=True)

In [87]:
# What are the column names?
print(df.columns)

#Print out the first 5 rows.
for row in df.head(5):
    print(row)
    print('\n')

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
Row(Date='2013-01-02', Open=35.0, High=35.450001, Low=34.709998999999996, Close=35.360001000000004, Volume=1194800, Adj Close=35.360001000000004)


Row(Date='2013-01-03', Open=35.18, High=35.450001, Low=34.75, Close=34.77, Volume=742000, Adj Close=34.77)


Row(Date='2013-01-04', Open=34.799999, High=34.799999, Low=33.919998, Close=34.400002, Volume=674000, Adj Close=34.400002)


Row(Date='2013-01-07', Open=34.799999, High=34.799999, Low=33.900002, Close=34.34, Volume=442000, Adj Close=34.34)


Row(Date='2013-01-08', Open=34.5, High=34.5, Low=33.110001000000004, Close=33.68, Volume=1284000, Adj Close=33.68)




In [88]:
# Use describe() to learn about the DataFrame.
print(df.describe().show())

+-------+----------+-----------------+------------------+------------------+-----------------+-----------------+-----------------+
|summary|      Date|             Open|              High|               Low|            Close|           Volume|        Adj Close|
+-------+----------+-----------------+------------------+------------------+-----------------+-----------------+-----------------+
|  count|      1008|             1008|              1008|              1008|             1008|             1008|             1008|
|   mean|      null| 191.930555563492|195.27951362400796|188.38569448115112|191.8851191478172|6062732.837301588|191.8851191478172|
| stddev|      null|61.51794180545916| 62.12424917982305| 60.76776038944237|61.45363572561643|4545340.194292408|61.45363572561643|
|    min|2013-01-02|        33.080002|         33.380001|32.110001000000004|            32.91|           440200|            32.91|
|    max|2016-12-30|       287.670013|        291.420013|        280.399994|       

In [89]:
# What day had the Peak High in Price?
df.orderBy(df["High"].desc()).head(1)

[Row(Date='2014-09-04', Open=284.01001, High=291.420013, Low=280.399994, Close=286.040009, Volume=8341700, Adj Close=286.040009)]

In [91]:
df.orderBy(df["High"].desc()).head(1)[0][0]

'2014-09-04'

In [92]:
#What is the mean of the Close column?
from pyspark.sql.functions import mean
df.select(mean("Close")).show()

+-----------------+
|       avg(Close)|
+-----------------+
|191.8851191478172|
+-----------------+



In [93]:
#What is the max and min of the Volume column?
from pyspark.sql.functions import max,min
df.select(max("Volume"),min("Volume")).show()

+-----------+-----------+
|max(Volume)|min(Volume)|
+-----------+-----------+
|   37163900|     440200|
+-----------+-----------+



In [94]:
#How many days was the Close lower than 60 dollars? (Use Spark SQL notation)
df.filter("Close < 60").count()

88

In [95]:
#What was the max High for every year? ( Use GroupBy)

from pyspark.sql.functions import year
yeardf = df.withColumn("Year",year(df["Date"]))
yeardf.show(5)

+----------+---------+---------+------------------+------------------+-------+------------------+----+
|      Date|     Open|     High|               Low|             Close| Volume|         Adj Close|Year|
+----------+---------+---------+------------------+------------------+-------+------------------+----+
|2013-01-02|     35.0|35.450001|34.709998999999996|35.360001000000004|1194800|35.360001000000004|2013|
|2013-01-03|    35.18|35.450001|             34.75|             34.77| 742000|             34.77|2013|
|2013-01-04|34.799999|34.799999|         33.919998|         34.400002| 674000|         34.400002|2013|
|2013-01-07|34.799999|34.799999|         33.900002|             34.34| 442000|             34.34|2013|
|2013-01-08|     34.5|     34.5|33.110001000000004|             33.68|1284000|             33.68|2013|
+----------+---------+---------+------------------+------------------+-------+------------------+----+
only showing top 5 rows



In [96]:
max_df = yeardf.groupBy('Year').max().show(5)

+----+------------------+----------+------------------+----------+-----------+--------------+---------+
|Year|         max(Open)| max(High)|          max(Low)|max(Close)|max(Volume)|max(Adj Close)|max(Year)|
+----+------------------+----------+------------------+----------+-----------+--------------+---------+
|2013|193.96000700000002|     194.5|        188.369995|193.369995|   37163900|    193.369995|     2013|
|2014|        287.670013|291.420013|        280.399994|286.040009|   32681700|    286.040009|     2014|
|2015|        280.200012|286.649994|        276.299988| 282.26001|   15649600|     282.26001|     2015|
|2016|        266.450012|269.339996|254.50999500000003|265.420013|   23742400|    265.420013|     2016|
+----+------------------+----------+------------------+----------+-----------+--------------+---------+



In [97]:
max_df = yeardf.groupBy('Year').max()
max_df.select('Year','max(High)').show()

+----+----------+
|Year| max(High)|
+----+----------+
|2013|     194.5|
|2014|291.420013|
|2015|286.649994|
|2016|269.339996|
+----+----------+

