In [1]:
import boto3
import pandas as pd

client=boto3.client('s3')

In [2]:
with open('aws.txt')as f:
    lines=f.read().splitlines()

In [3]:
from boto.s3.connection import S3Connection
conn = S3Connection(lines[0],lines[1], host='s3.ap-south-1.amazonaws.com')

In [4]:
path='s3://historicaldata03jun2020/HistoricalQuotes.csv'
data=pd.read_csv(path)

In [5]:
data.head(2)

Unnamed: 0,Date,Close/Last,Volume,Open,High,Low
0,06-01-20,"$1,431.82",1217140,"$1,418.39","$1,437.96","$1,418"
1,05/29/2020,"$1,428.92",1838059,"$1,416.94","$1,432.57","$1,413.35"


In [7]:
data.rename(columns={c:c.strip() for c in data.columns.values.tolist()},inplace=True)

In [8]:
data.rename(columns={"Close/Last":"Close"},inplace=True)
data.columns.tolist()

['Date', 'Close', 'Volume', 'Open', 'High', 'Low']

In [None]:
import numpy as np
data['Close']=data['Close'].str.replace('\D+','').astype(int)
data['Open']=data['Open'].str.replace('\D+','').astype(int)
data['High']=data['High'].str.replace('\D+','').astype(int)
data['Low']=data['Low'].str.replace('\D+','').astype(int)
print(data.head(2))

In [22]:
print(data.dtypes)

Date      object
Close      int64
Volume     int64
Open       int64
High       int64
Low        int64
dtype: object


In [23]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession

In [25]:
sc.stop()

In [26]:
sc=SparkContext()
spark=SparkSession(sc)
stockData=spark.createDataFrame(data)

In [27]:
stockData.take(5)

[Row(Date='06-01-20', Close=143182, Volume=1217140, Open=141839, High=143796, Low=1418),
 Row(Date='05/29/2020', Close=142892, Volume=1838059, Open=141694, High=143257, Low=141335),
 Row(Date='05/28/2020', Close=141673, Volume=1693976, Open=139686, High=144084, Low=1396),
 Row(Date='05/27/2020', Close=141784, Volume=1686142, Open=141725, High=142174, Low=139129),
 Row(Date='05/26/2020', Close=141702, Volume=2060643, Open=143727, High=1441, Low=141213)]

In [28]:
stockData.cache()

DataFrame[Date: string, Close: bigint, Volume: bigint, Open: bigint, High: bigint, Low: bigint]

In [30]:
stockData=stockData.withColumn('Date',stockData['Date'].cast('string'))
stockData.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
Date,1259,,,01-02-18,12/31/2019
Close,1259,95773.87609213662,27017.787420701774,628,152669
Volume,1259,1716753.3145353454,860209.0458188738,347518,11153500
Open,1259,83529.77839555203,40008.59048192892,528,152507
High,1259,90095.80063542494,35307.13089109199,540,153211
Low,1259,88408.50754567116,35033.04109303526,519,152140


In [32]:
trainData=stockData.select(['Open','High','Low','Close'])
trainData.show(5)

+------+------+------+------+
|  Open|  High|   Low| Close|
+------+------+------+------+
|141839|143796|  1418|143182|
|141694|143257|141335|142892|
|139686|144084|  1396|141673|
|141725|142174|139129|141784|
|143727|  1441|141213|141702|
+------+------+------+------+
only showing top 5 rows



# Transformation

In [34]:
from pyspark.ml.feature import VectorAssembler
VectorAssembler = VectorAssembler(inputCols=['Open','High','Low'],outputCol='features')

In [35]:
VectorDataFrame=VectorAssembler.transform(trainData)
VectorDataFrame=VectorDataFrame.select(['features','Close'])
VectorDataFrame.show()

+--------------------+------+
|            features| Close|
+--------------------+------+
|[141839.0,143796....|143182|
|[141694.0,143257....|142892|
|[139686.0,144084....|141673|
|[141725.0,142174....|141784|
|[143727.0,1441.0,...|141702|
|[139671.0,141276....|141042|
|[1408.0,141549.0,...|140280|
|[138958.0,141042....|140672|
|[138700.0,1392.0,...|137349|
|[136175.0,139233....|138394|
|[1350.0,137448.0,...|137319|
|[133502.0,135742....|135613|
|[137705.0,138548....|134933|
|[140712.0,1415.0,...|137574|
|[137828.0,141653....|140326|
|[138313.0,139876....|138837|
|[136594.0,137760....|137256|
|[136169.0,137112....|134730|
|[133792.0,137394....|135111|
|[130823.0,132766....|132680|
+--------------------+------+
only showing top 20 rows



In [36]:
from pyspark.ml.regression import LinearRegression
LinearRegression=LinearRegression(featuresCol='features',labelCol='Close',maxIter=10,regParam=0.3,elasticNetParam=0.8)

In [37]:
model=LinearRegression.fit(VectorDataFrame)

In [39]:
SplitsTraingTesting=VectorDataFrame.randomSplit([0.7,0.3])
testDataframe=SplitsTraingTesting[1]

In [40]:
Prediction=model.transform(testDataframe)

In [41]:
Prediction.select("Prediction","Close","features").show()

+------------------+-----+--------------------+
|        Prediction|Close|            features|
+------------------+-----+--------------------+
| 54027.72396981865|52720|[528.0,52830.0,52...|
| 71878.95696091943|60070|[600.0,60347.0,59...|
|   73450.432998404|63259|[630.0,63522.0,62...|
| 57134.14577428863|64147|[632.0,64301.0,62...|
| 74095.62906886946|64361|[640.0,64599.0,63...|
| 73811.13889703371|62356|[647.0,64817.0,62...|
| 76159.30922852141|68311|[675.0,68935.0,66...|
| 77506.87541975155|69735|[710.0,71235.0,69...|
| 81469.88150246203|77686|[779.0,78048.0,77...|
| 81547.87970051795|78035|[780.0,78273.0,77...|
|  81802.7826182264|78929|[780.0,78943.0,77...|
| 61381.18956780512|79937|[795.0,79950.0,79...|
| 83585.34895100097|81758|[821.0,82257.0,81...|
| 85117.97185977698|84554|[844.0,84869.0,84...|
|64280.255679924885|90666|[910.0,913.0,9034...|
| 89411.52232807846|93024|[921.0,93317.0,91...|
|65259.122102196685|94049|[929.0,94275.0,91...|
| 89638.91271308047|93245|[933.0,93653.0

In [51]:
from pyspark.ml.evaluation import RegressionEvaluator
modelEvaluator=RegressionEvaluator(predictionCol='Prediction',labelCol='Close',metricName='r2')
print(Prediction)

DataFrame[features: vector, Close: bigint, prediction: double]


In [53]:
print("coefficients : "+ str(model.coefficients))
print("Intercept  : "+ str(model.intercept))

trainingSummary=model.summary
print("RMSE: %f"%trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

coefficients : [0.09870969657180231,0.26750198380557855,0.26835669701667886]
Intercept  : 39702.856536343286
RMSE: 20945.434072
r2: 0.398516


In [None]:
import sys
model.save()