<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Spark-DataFrame-Basics" data-toc-modified-id="Spark-DataFrame-Basics-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Spark DataFrame Basics</a></span></li><li><span><a href="#Create-spark-dataframe-from-data-file" data-toc-modified-id="Create-spark-dataframe-from-data-file-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Create spark dataframe from data file</a></span></li><li><span><a href="#Read-the-data-with-custom-schema" data-toc-modified-id="Read-the-data-with-custom-schema-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Read the data with custom schema</a></span></li><li><span><a href="#Create-new-column" data-toc-modified-id="Create-new-column-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Create new column</a></span></li><li><span><a href="#Rename-column" data-toc-modified-id="Rename-column-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Rename column</a></span></li><li><span><a href="#Using-SQL-in-pyspark" data-toc-modified-id="Using-SQL-in-pyspark-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Using SQL in pyspark</a></span></li><li><span><a href="#Spark-Basic-Operations" data-toc-modified-id="Spark-Basic-Operations-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Spark Basic Operations</a></span></li><li><span><a href="#GroupBy" data-toc-modified-id="GroupBy-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>GroupBy</a></span></li><li><span><a href="#Missing-Data" data-toc-modified-id="Missing-Data-9"><span class="toc-item-num">9&nbsp;&nbsp;</span>Missing Data</a></span></li><li><span><a href="#Date-and-Times" data-toc-modified-id="Date-and-Times-10"><span class="toc-item-num">10&nbsp;&nbsp;</span>Date and Times</a></span><ul class="toc-item"><li><span><a href="#average-closing-price-per-year" data-toc-modified-id="average-closing-price-per-year-10.1"><span class="toc-item-num">10.1&nbsp;&nbsp;</span>average closing price per year</a></span></li></ul></li></ul></div>

# Spark DataFrame Basics

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

# Record the library versions this notebook was run with.
print([(x.__name__,x.__version__) for x in [np, pd, pyspark]])

# Build the SparkSession for this notebook (or reuse one that is already running).
spark = pyspark.sql.SparkSession.builder.appName('Basics').getOrCreate()

# Handles derived from the session. NOTE(review): `sqlContext` is not referenced by
# any of the cells visible below — confirm whether it is still needed.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

# Set the Spark log verbosity on the context.
sc.setLogLevel("INFO")

[('numpy', '1.17.1'), ('pandas', '0.25.1'), ('pyspark', '2.4.4')]


# Create spark dataframe from data file

In [18]:
!cat ../data/people.json

{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}


In [2]:
df = spark.read.json('../data/people.json')

In [3]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [4]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [5]:
df.columns

['age', 'name']

In [6]:
df.describe()

DataFrame[summary: string, age: string, name: string]

In [7]:
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [19]:
dfp = pd.read_json('../data/people.json',lines=True)
dfp

Unnamed: 0,name,age
0,Michael,
1,Andy,30.0
2,Justin,19.0


# Read the data with custom schema

In [8]:
df.columns

['age', 'name']

In [9]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [12]:
# Explicit schema for people.json: one StructField(name, type, nullable) per column.
# Schema inference typed `age` as long above; here it is declared as IntegerType.
data_schema = [StructField('age', IntegerType(), True),
               StructField('name', StringType(), True)]

In [13]:
final_struc = StructType(fields=data_schema)

In [14]:
df = spark.read.json('../data/people.json', schema=final_struc)

In [15]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [16]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [20]:
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [21]:
dfp['age']

0     NaN
1    30.0
2    19.0
Name: age, dtype: float64

In [22]:
df.head(2)[0]

Row(age=None, name='Michael')

In [25]:
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [27]:
df.select(['age','name']).show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



# Create new column

In [30]:
df.withColumn('newage',df['age']+1).show() # withColumn returns a new DataFrame; df itself is not modified

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    31|
|  19| Justin|    20|
+----+-------+------+



In [29]:
dfp['newage'] = dfp['age'] + 1 # pandas operation is inplace
dfp

Unnamed: 0,name,age,newage
0,Michael,,
1,Andy,30.0,31.0
2,Justin,19.0,20.0


# Rename column

In [33]:
df.withColumnRenamed('age','my_new_age').show()

+----------+-------+
|my_new_age|   name|
+----------+-------+
|      null|Michael|
|        30|   Andy|
|        19| Justin|
+----------+-------+



In [34]:
dfp.rename(columns={'age': 'my_new_age'})

Unnamed: 0,name,my_new_age,newage
0,Michael,,
1,Andy,30.0,31.0
2,Justin,19.0,20.0


# Using SQL in pyspark

In [38]:
df.createOrReplaceTempView("people") # register df as a temporary SQL view named "people"

In [42]:
results = spark.sql("select * from people where age = 30")

In [40]:
results.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



# Spark Basic Operations

In [47]:
!head -2 ../data/appl_stock.csv

Date,Open,High,Low,Close,Volume,Adj Close
2010-01-04,213.429998,214.499996,212.38000099999996,214.009998,123432400,27.727039


In [44]:
from pyspark.sql import SparkSession

In [45]:
# getOrCreate() hands back an already-running session when one exists, so after the
# 'Basics' session created in the first cell this call presumably reuses it and the
# 'ops' app name does not take effect — NOTE(review): confirm, or drop this cell.
spark = SparkSession.builder.appName('ops').getOrCreate()

In [48]:
df = spark.read.csv('../data/appl_stock.csv', inferSchema=True, header=True)

In [60]:
dfp = pd.read_csv('../data/appl_stock.csv')
print(dfp.shape)
dfp.head()

(1762, 7)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2010-01-04,213.429998,214.499996,212.380001,214.009998,123432400,27.727039
1,2010-01-05,214.599998,215.589994,213.249994,214.379993,150476200,27.774976
2,2010-01-06,214.379993,215.23,210.750004,210.969995,138040000,27.333178
3,2010-01-07,211.75,212.000006,209.050005,210.58,119282800,27.28265
4,2010-01-08,210.299994,212.000006,209.060005,211.980005,111902700,27.464034


In [49]:
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [51]:
df.show(2)

+-------------------+----------+----------+------------------+----------+---------+------------------+
|               Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|
+-------------------+----------+----------+------------------+----------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|214.599998|215.589994|        213.249994|214.379993|150476200|27.774976000000002|
+-------------------+----------+----------+------------------+----------+---------+------------------+
only showing top 2 rows



In [52]:
df.head(2)[0]

Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)

In [57]:
# Avoid calling df.toPandas() on the full DataFrame: it pulls every row to the driver. Limit first (next cell).

In [58]:
df.limit(2).toPandas()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2010-01-04,213.429998,214.499996,212.380001,214.009998,123432400,27.727039
1,2010-01-05,214.599998,215.589994,213.249994,214.379993,150476200,27.774976


In [63]:
df.filter("Close < 500").select('Open').limit(5).show()

+----------+
|      Open|
+----------+
|213.429998|
|214.599998|
|214.379993|
|    211.75|
|210.299994|
+----------+



In [62]:
dfp.query("Close < 500")['Open'].head()

0    213.429998
1    214.599998
2    214.379993
3    211.750000
4    210.299994
Name: Open, dtype: float64

In [64]:
df.filter( (df.Close <200) & (df.Open > 200)).limit(5).show()

+-------------------+------------------+----------+----------+----------+---------+------------------+
|               Date|              Open|      High|       Low|     Close|   Volume|         Adj Close|
+-------------------+------------------+----------+----------+----------+---------+------------------+
|2010-01-22 00:00:00|206.78000600000001|207.499996|    197.16|    197.75|220441900|         25.620401|
|2010-01-28 00:00:00|        204.930004|205.500004|198.699995|199.289995|293375600|25.819922000000002|
|2010-01-29 00:00:00|        201.079996|202.199995|190.250002|192.060003|311488100|         24.883208|
+-------------------+------------------+----------+----------+----------+---------+------------------+



In [66]:
dfp.query("Close <200 and Open > 200").head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
13,2010-01-22,206.780006,207.499996,197.16,197.75,220441900,25.620401
17,2010-01-28,204.930004,205.500004,198.699995,199.289995,293375600,25.819922
18,2010-01-29,201.079996,202.199995,190.250002,192.060003,311488100,24.883208


In [68]:
res = df.filter(df['Close'] < 200).limit(2).collect()
res

[Row(Date=datetime.datetime(2010, 1, 22, 0, 0), Open=206.78000600000001, High=207.499996, Low=197.16, Close=197.75, Volume=220441900, Adj Close=25.620401),
 Row(Date=datetime.datetime(2010, 1, 28, 0, 0), Open=204.930004, High=205.500004, Low=198.699995, Close=199.289995, Volume=293375600, Adj Close=25.819922000000002)]

In [70]:
res[0].asDict()

{'Date': datetime.datetime(2010, 1, 22, 0, 0),
 'Open': 206.78000600000001,
 'High': 207.499996,
 'Low': 197.16,
 'Close': 197.75,
 'Volume': 220441900,
 'Adj Close': 25.620401}

# GroupBy

In [71]:
spark = SparkSession.builder.appName('aggs').getOrCreate()

In [72]:
df = spark.read.csv('../data/sales_info.csv', inferSchema=True, header=True)

In [77]:
dfp = pd.read_csv('../data/sales_info.csv')
print(dfp.shape)
dfp

(12, 3)


Unnamed: 0,Company,Person,Sales
0,GOOG,Sam,200
1,GOOG,Charlie,120
2,GOOG,Frank,340
3,MSFT,Tina,600
4,MSFT,Amy,124
5,MSFT,Vanessa,243
6,FB,Carl,870
7,FB,Sarah,350
8,APPL,John,250
9,APPL,Linda,130


In [73]:
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [74]:
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)



In [76]:
df.groupBy("Company").mean().show()

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   APPL|            370.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+



In [78]:
# Average only the numeric columns per company. The string `Person` column cannot
# be averaged: older pandas silently dropped it, pandas >= 2.0 raises TypeError.
dfp.groupby('Company').mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
APPL,370.0
FB,610.0
GOOG,220.0
MSFT,322.333333


In [82]:
df.agg({'Sales': 'max'}).show()

+----------+
|max(Sales)|
+----------+
|     870.0|
+----------+



In [83]:
dfp['Sales'].max()

870

In [80]:
gp = df.groupBy("Company")

In [81]:
gp.agg({'Sales': 'max'}).show()

+-------+----------+
|Company|max(Sales)|
+-------+----------+
|   APPL|     750.0|
|   GOOG|     340.0|
|     FB|     870.0|
|   MSFT|     600.0|
+-------+----------+



In [84]:
dfp.groupby('Company')['Sales'].max()

Company
APPL    750
FB      870
GOOG    340
MSFT    600
Name: Sales, dtype: int64

In [85]:
dfp.groupby('Company').agg({'Sales': 'max'})

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
APPL,750
FB,870
GOOG,340
MSFT,600


In [86]:
from pyspark.sql import functions as F

In [87]:
df.select(F.countDistinct('Sales')).show()

+---------------------+
|count(DISTINCT Sales)|
+---------------------+
|                   11|
+---------------------+



In [88]:
from pyspark.sql.functions import countDistinct, avg, stddev

In [89]:
df.select(avg('Sales').alias('Average_Sales')).show()

+-----------------+
|    Average_Sales|
+-----------------+
|360.5833333333333|
+-----------------+



In [90]:
from pyspark.sql.functions import format_number

In [91]:
sales_std = df.select(stddev("Sales").alias("std"))

In [95]:
sales_std.select(format_number('std',2).alias('std')).show()

+------+
|   std|
+------+
|250.09|
+------+



In [94]:
dfp.Sales.std().round(2)

250.09

In [96]:
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [97]:
df.orderBy("Sales").show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|Charlie|120.0|
|   MSFT|    Amy|124.0|
|   APPL|  Linda|130.0|
|   GOOG|    Sam|200.0|
|   MSFT|Vanessa|243.0|
|   APPL|   John|250.0|
|   GOOG|  Frank|340.0|
|     FB|  Sarah|350.0|
|   APPL|  Chris|350.0|
|   MSFT|   Tina|600.0|
|   APPL|   Mike|750.0|
|     FB|   Carl|870.0|
+-------+-------+-----+



# Missing Data

In [98]:
# NOTE(review): SparkSession is already imported in the first cell; this re-import
# is redundant when the notebook is run top to bottom.
from pyspark.sql import SparkSession

# Get the active session (or create one named 'miss' if none is running).
spark = SparkSession.builder.appName('miss').getOrCreate()

# Load the CSV with a header row, letting Spark infer the column types, and display it.
df = spark.read.csv('../data/ContainsNull.csv',inferSchema=True,header=True)
df.show()

In [103]:
dfp = pd.read_csv('../data/ContainsNull.csv')
print(dfp.shape)
dfp

(4, 3)


Unnamed: 0,Id,Name,Sales
0,emp1,John,
1,emp2,,
2,emp3,,345.0
3,emp4,Cindy,456.0


In [108]:
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [109]:
dfp.dropna(thresh=2)

Unnamed: 0,Id,Name,Sales
0,emp1,John,
2,emp3,,345.0
3,emp4,Cindy,456.0


In [105]:
df.na.drop(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [107]:
dfp.dropna(how='all')

Unnamed: 0,Id,Name,Sales
0,emp1,John,
1,emp2,,
2,emp3,,345.0
3,emp4,Cindy,456.0


In [110]:
df.na.drop(subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [111]:
dfp.dropna(subset=['Sales'])

Unnamed: 0,Id,Name,Sales
2,emp3,,345.0
3,emp4,Cindy,456.0


In [112]:
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [113]:
df.na.fill('FILL VALUE').show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL VALUE| null|
|emp3|FILL VALUE|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [116]:
dfp.fillna('FILL') # ** different from Spark: pandas fills every column, including the numeric Sales column

Unnamed: 0,Id,Name,Sales
0,emp1,John,FILL
1,emp2,FILL,FILL
2,emp3,FILL,345
3,emp4,Cindy,456


In [117]:
dfp.select_dtypes('object').fillna('FILL')

Unnamed: 0,Id,Name
0,emp1,John
1,emp2,FILL
2,emp3,FILL
3,emp4,Cindy


In [118]:
df.na.fill('NO NAME', subset=['Name']).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|NO NAME| null|
|emp3|NO NAME|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [119]:
from pyspark.sql.functions import mean

In [120]:
mean_val = df.select(mean(df['Sales'])).collect()
mean_val[0]

Row(avg(Sales)=400.5)

In [121]:
mean_sales = mean_val[0][0]

In [122]:
df.na.fill(mean_sales, ['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [124]:
dfp.fillna({'Sales': dfp.Sales.mean()})

Unnamed: 0,Id,Name,Sales
0,emp1,John,400.5
1,emp2,,400.5
2,emp3,,345.0
3,emp4,Cindy,456.0


In [125]:
# Fill missing values with the per-column mean of the numeric columns only.
# `Id` and `Name` are strings; without numeric_only=True, pandas >= 2.0 raises
# TypeError when asked for their mean. Non-numeric columns are left unfilled.
dfp.fillna(dfp.mean(numeric_only=True))

Unnamed: 0,Id,Name,Sales
0,emp1,John,400.5
1,emp2,,400.5
2,emp3,,345.0
3,emp4,Cindy,456.0


In [134]:
# Same mean imputation of Sales, done with scikit-learn's SimpleImputer.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy='mean')

imputer.fit_transform(dfp['Sales'].to_numpy().reshape(-1,1)) # sklearn needs a 2D array, hence reshape(-1, 1)

array([[400.5],
       [400.5],
       [345. ],
       [456. ]])

# Date and Times

In [136]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('dates').getOrCreate()

df = spark.read.csv('../data/appl_stock.csv',inferSchema=True,header=True)
df.limit(1).show()

+-------------------+----------+----------+------------------+----------+---------+---------+
|               Date|      Open|      High|               Low|     Close|   Volume|Adj Close|
+-------------------+----------+----------+------------------+----------+---------+---------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|214.009998|123432400|27.727039|
+-------------------+----------+----------+------------------+----------+---------+---------+



In [137]:
dfp = pd.read_csv('../data/appl_stock.csv')
print(dfp.shape)
dfp.head()

(1762, 7)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2010-01-04,213.429998,214.499996,212.380001,214.009998,123432400,27.727039
1,2010-01-05,214.599998,215.589994,213.249994,214.379993,150476200,27.774976
2,2010-01-06,214.379993,215.23,210.750004,210.969995,138040000,27.333178
3,2010-01-07,211.75,212.000006,209.050005,210.58,119282800,27.28265
4,2010-01-08,210.299994,212.000006,209.060005,211.980005,111902700,27.464034


In [138]:
from pyspark.sql.functions import (dayofmonth, hour, dayofyear,
                                  month, year, weekofyear, format_number,
                                  date_format)

In [139]:
df.select(dayofmonth(df['Date'])).show(2)

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
+----------------+
only showing top 2 rows



## average closing price per year

In [140]:
df2 = df.withColumn("Year", year(df.Date))
df2.show(2)

+-------------------+----------+----------+------------------+----------+---------+------------------+----+
|               Date|      Open|      High|               Low|     Close|   Volume|         Adj Close|Year|
+-------------------+----------+----------+------------------+----------+---------+------------------+----+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996|214.009998|123432400|         27.727039|2010|
|2010-01-05 00:00:00|214.599998|215.589994|        213.249994|214.379993|150476200|27.774976000000002|2010|
+-------------------+----------+----------+------------------+----------+---------+------------------+----+
only showing top 2 rows



In [158]:
# Mean closing price for each year: aggregate just the Close column per Year group.
# The aggregate column keeps Spark's default name, "avg(Close)".
yearly_groups = df2.groupBy("Year")
result = yearly_groups.agg(F.avg("Close"))
result.show()

+----+------------------+
|Year|        avg(Close)|
+----+------------------+
|2015|120.03999980555547|
|2013| 472.6348802857143|
|2014| 295.4023416507935|
|2012| 576.0497195640002|
|2016|104.60400786904763|
|2010| 259.8424600000002|
|2011|364.00432532142867|
+----+------------------+



In [145]:
dfp.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Volume         int64
Adj Close    float64
dtype: object

In [146]:
dfp['Date'] = pd.to_datetime(dfp['Date'])

In [148]:
dfp.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2010-01-04,213.429998,214.499996,212.380001,214.009998,123432400,27.727039
1,2010-01-05,214.599998,215.589994,213.249994,214.379993,150476200,27.774976
2,2010-01-06,214.379993,215.23,210.750004,210.969995,138040000,27.333178
3,2010-01-07,211.75,212.000006,209.050005,210.58,119282800,27.28265
4,2010-01-08,210.299994,212.000006,209.060005,211.980005,111902700,27.464034


In [156]:
dfp.groupby(dfp.Date.dt.year).Close.mean()

Date
2010    259.842460
2011    364.004325
2012    576.049720
2013    472.634880
2014    295.402342
2015    120.040000
2016    104.604008
Name: Close, dtype: float64

In [159]:
result.show()

+----+------------------+
|Year|        avg(Close)|
+----+------------------+
|2015|120.03999980555547|
|2013| 472.6348802857143|
|2014| 295.4023416507935|
|2012| 576.0497195640002|
|2016|104.60400786904763|
|2010| 259.8424600000002|
|2011|364.00432532142867|
+----+------------------+



In [160]:
result.select('Year', format_number('avg(Close)',2).alias("Avg_Close")).show()

+----+---------+
|Year|Avg_Close|
+----+---------+
|2015|   120.04|
|2013|   472.63|
|2014|   295.40|
|2012|   576.05|
|2016|   104.60|
|2010|   259.84|
|2011|   364.00|
+----+---------+

