In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('LR').getOrCreate()

In [2]:
df = spark.read.csv('walmart_stock.csv', header=True, inferSchema=True)

In [3]:
df.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']

In [4]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [8]:
df.head(5)

[Row(Date='2012-01-03', Open=59.970001, High=61.060001, Low=59.869999, Close=60.330002, Volume=12668800, Adj Close=52.619234999999996),
 Row(Date='2012-01-04', Open=60.209998999999996, High=60.349998, Low=59.470001, Close=59.709998999999996, Volume=9593300, Adj Close=52.078475),
 Row(Date='2012-01-05', Open=59.349998, High=59.619999, Low=58.369999, Close=59.419998, Volume=12768200, Adj Close=51.825539),
 Row(Date='2012-01-06', Open=59.419998, High=59.450001, Low=58.869999, Close=59.0, Volume=8069400, Adj Close=51.45922),
 Row(Date='2012-01-09', Open=59.029999, High=59.549999, Low=58.919998, Close=59.18, Volume=6679300, Adj Close=51.616215000000004)]

In [9]:
df.describe().toPandas()

Unnamed: 0,summary,Date,Open,High,Low,Close,Volume,Adj Close
0,count,1258,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0
1,mean,,72.35785375357709,72.83938807631165,71.9186009594594,72.38844998012726,8222093.481717011,67.23883848728146
2,stddev,,6.76809024470826,6.768186808159218,6.744075756255496,6.756859163732991,4519780.8431556,6.722609449996857
3,min,2012-01-03,56.389999,57.060001,56.299999,56.419998,2094900.0,50.363689
4,max,2016-12-30,90.800003,90.970001,89.25,90.470001,80898100.0,84.91421600000001


In [13]:
from pyspark.sql import functions as f


In [72]:
df=df.withColumn("Open", f.format_number("Open", 2))
df=df.withColumn("Low", f.format_number("Low", 2))
df=df.withColumn("High", f.format_number("High", 2))
df=df.withColumn("Adj Close", f.format_number("Adj Close", 2))
df=df.withColumn("Close", f.format_number("Close", 2))

In [76]:
df.head()

Row(Date='2012-01-03', Open='59.97', High='61.06', Low='59.87', Close='60.33', Volume=12668800, Adj Close='52.62')

In [83]:
df1=df.withColumn('HV ratio',df['High']/df['Volume'])


In [84]:
df1.show()

+----------+-----+-----+-----+-----+--------+---------+--------------------+
|      Date| Open| High|  Low|Close|  Volume|Adj Close|            HV ratio|
+----------+-----+-----+-----+-----+--------+---------+--------------------+
|2012-01-03|59.97|61.06|59.87|60.33|12668800|    52.62|4.819714574387472E-6|
|2012-01-04|60.21|60.35|59.47|59.71| 9593300|    52.08|6.290848821573389...|
|2012-01-05|59.35|59.62|58.37|59.42|12768200|    51.83|4.669413073103491E-6|
|2012-01-06|59.42|59.45|58.87|59.00| 8069400|    51.46|7.367338339901356E-6|
|2012-01-09|59.03|59.55|58.92|59.18| 6679300|    51.62|8.915604928660188E-6|
|2012-01-10|59.43|59.71|58.98|59.04| 6907300|    51.49|8.644477581688938E-6|
|2012-01-11|59.06|59.53|59.04|59.40| 6365600|    51.81| 9.35182857861003E-6|
|2012-01-12|59.79|60.00|59.40|59.50| 7236400|    51.90| 8.29141562102703E-6|
|2012-01-13|59.18|59.61|59.01|59.54| 7729300|    51.93|7.712211972623653E-6|
|2012-01-17|59.87|60.11|59.52|59.85| 8500000|    52.20|7.071764705882352...|

In [15]:
res=df.select(['Date','High']).groupBy('Date').max()
res.orderBy(res['max(High)'].desc()).show()

+----------+-----------------+
|      Date|        max(High)|
+----------+-----------------+
|2015-01-13|        90.970001|
|2015-01-08|90.66999799999999|
|2015-01-09|        90.389999|
|2015-01-12|        90.309998|
|2015-01-23|        89.260002|
|2015-01-26|        89.160004|
|2015-01-07|            88.68|
|2015-01-14|        88.519997|
|2015-01-27|        88.459999|
|2015-01-22|        88.400002|
|2015-01-28|        88.230003|
|2014-11-28|        88.089996|
|2015-02-06|             88.0|
|2015-01-15|        87.779999|
|2015-01-29|        87.720001|
|2015-01-20|        87.699997|
|2015-01-16|        87.459999|
|2014-12-31|        87.440002|
|2015-02-10|        87.410004|
|2015-02-05|        87.360001|
+----------+-----------------+
only showing top 20 rows



In [18]:
df.select("Close").summary("mean").show()

+-------+-----------------+
|summary|            Close|
+-------+-----------------+
|   mean|72.38844998012726|
+-------+-----------------+



In [19]:
df.select("Volume").summary("max","min").show()

+-------+--------+
|summary|  Volume|
+-------+--------+
|    max|80898100|
|    min| 2094900|
+-------+--------+



In [3]:
df.createOrReplaceTempView('walmart')

In [6]:
spark.sql('SELECT * from walmart WHERE Close < 60').count()

81

In [10]:
spark.sql('SELECT * from walmart WHERE High > 80').count()*100/df.count()

9.141494435612083

In [11]:
df.corr('High','Volume')

-0.3384326061737161

In [15]:
df.select(f.year('Date').alias('year')).collect()

[Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(Date)=2012),
 Row(year(D

In [20]:
df1=df.withColumn('year',f.year(df['Date']).alias('year'))
df1=df1.withColumn('month',f.month(df['Date']).alias('month'))

In [35]:
res=df1.select(['year','High']).groupBy('year').max()
res.orderBy(res['year'].desc()).show()

+----+---------+---------+
|year|max(year)|max(High)|
+----+---------+---------+
|2016|     2016|75.190002|
|2015|     2015|90.970001|
|2014|     2014|88.089996|
|2013|     2013|81.370003|
|2012|     2012|77.599998|
+----+---------+---------+



In [None]:
res=df1.select(['month','Close']).groupBy('month').mean()
res.orderBy(res['month'].desc()).show()