# Initialize Spark Session and Read in Dataset

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [None]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
spark_version = 'spark-3.1.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
# Load in the Postgress Driver
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

In [None]:
# Import Spark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CloudETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

In [None]:
from pyspark import SparkFiles
# Load in employee.csv from S3 into a DataFrame
## CHANGE TO OUR S3 BUCKET:  url = "https://<bucket name>.s3.amazonaws.com/employee.csv"
spark.sparkContext.addFile(url)

## CHANGE CSV NAME:  NYSE_df = spark.read.option('header', 'true').csv(SparkFiles.get("NYSE_Data.csv"), inferSchema=True, sep=',', timestampFormat="mm/dd/yy")


# Work around

In [2]:
# Load the data
NYSE_df = pd.read_csv('Dataset/CSV/Final Output.csv', index_col=False)

# Data Preprocessing

In [3]:
NYSE_df.head(10)
print(NYSE_df.count())

Index                 13947
Date                  13947
Month                 13947
Quarter               13947
Open                  13947
High                  13947
Low                   13947
Close                 13947
Adj Close             13947
Volume                13947
CloseUSD              13947
LOCATION              13947
INDICATOR             13947
SUBJECT               13947
MEASURE               13947
FREQUENCY             13947
Value                 13947
Flag Codes                0
observation_date      13947
GDPC1                 13947
observation_date.1    13947
LFWA64TTUSM647S       13947
dtype: int64


In [4]:
# Column Types
NYSE_df.dtypes

Index                  object
Date                   object
Month                  object
Quarter                object
Open                  float64
High                  float64
Low                   float64
Close                 float64
Adj Close             float64
Volume                  int64
CloseUSD              float64
LOCATION               object
INDICATOR              object
SUBJECT                object
MEASURE                object
FREQUENCY              object
Value                 float64
Flag Codes            float64
observation_date       object
GDPC1                 float64
observation_date.1     object
LFWA64TTUSM647S       float64
dtype: object

In [5]:
# Drop null and uncesessary columns
NYSE_df = NYSE_df.drop(columns=['Flag Codes','Index','Month','Quarter','Open','High','Low','Close','Adj Close','Volume','LOCATION','INDICATOR','SUBJECT','MEASURE','FREQUENCY','observation_date','observation_date.1'])

# Drop blank rows
NYSE_df = NYSE_df.dropna()

In [6]:
# Rename Columns
NYSE_df = NYSE_df.rename(columns={'CloseUSD':'NYA_Close_Price','Value':'Inflation','GDPC1':'Real_GDP','LFWA64TTUSM647S':'Working_Pop'})


In [7]:
NYSE_df.head()

Unnamed: 0,Date,NYA_Close_Price,Inflation,Real_GDP,Working_Pop
0,12/31/1965,528.690002,13.41676,4304.73,77178000.0
1,1/3/1966,527.210022,13.41676,4409.52,77178000.0
2,1/4/1966,527.840027,13.41676,4409.52,77178000.0
3,1/5/1966,531.119995,13.41676,4409.52,77178000.0
4,1/6/1966,532.070007,13.41676,4409.52,77178000.0


In [8]:
# Convert Date to Datetime
NYSE_df['Date'] = (pd.to_datetime(NYSE_df['Date'], infer_datetime_format=True) - dt.datetime(1970,1,1)).dt.total_seconds()

# Divide Inflation by 100
NYSE_df['Inflation'] = NYSE_df['Inflation'] / 100

# Real GDP * 1B
NYSE_df['Real_GDP'] = NYSE_df['Real_GDP'] * 1000000000

# Calculate Real GDP per Capita from GDP and Population
NYSE_df['Real_GDP_per_Capita'] = NYSE_df['Real_GDP'] / NYSE_df['Working_Pop']

# Drop Real_GDP column
NYSE_df = NYSE_df.drop(columns=['Real_GDP'])


In [10]:
NYSE_df.set_index('Date', inplace=True)
NYSE_df.head()

Unnamed: 0_level_0,NYA_Close_Price,Inflation,Working_Pop,Real_GDP_per_Capita
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-126316800.0,528.690002,0.134168,77178000.0,55776.646195
-126057600.0,527.210022,0.134168,77178000.0,57134.416544
-125971200.0,527.840027,0.134168,77178000.0,57134.416544
-125884800.0,531.119995,0.134168,77178000.0,57134.416544
-125798400.0,532.070007,0.134168,77178000.0,57134.416544


# Split Data into Training and Testing

In [11]:
# Create our features
X = NYSE_df.drop('NYA_Close_Price', axis=1)

# Create our target
y = NYSE_df['NYA_Close_Price']

X.head()

Unnamed: 0_level_0,Inflation,Working_Pop,Real_GDP_per_Capita
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-126316800.0,0.134168,77178000.0,55776.646195
-126057600.0,0.134168,77178000.0,57134.416544
-125971200.0,0.134168,77178000.0,57134.416544
-125884800.0,0.134168,77178000.0,57134.416544
-125798400.0,0.134168,77178000.0,57134.416544


In [12]:
# Check features
X.describe()

Unnamed: 0,Inflation,Working_Pop,Real_GDP_per_Capita
count,13947.0,13947.0,13947.0
mean,0.600693,158333900.0,66646.78636
std,0.302497,40874200.0,13220.003186
min,0.134168,77178000.0,44850.67294
25%,0.323606,142022600.0,57815.050921
50%,0.612192,164584900.0,62365.333919
75%,0.878836,195847000.0,78420.443051
max,1.135762,207311600.0,94220.229858


In [13]:
# Check the balance of our target values
y.value_counts()

569.919983     11
586.630005      9
567.599976      8
555.020020      8
580.710022      7
               ..
6838.450195     1
1434.010010     1
1174.420044     1
960.619995      1
7992.009766     1
Name: NYA_Close_Price, Length: 11590, dtype: int64

In [14]:
# Split into training and testing 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    random_state=1
                                                )

In [15]:
# Initialize the model
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [16]:
# Train the data
model.fit(X_train, y_train)

LinearRegression()

In [57]:
# Calculate the model accuracy score (1 is perfect prediction)
from sklearn.metrics import r2_score, mean_squared_error

y_pred = model.predict(X_test)

r2_score = r2_score(y_test, y_pred)
print(f'Coefficient of determination: {r2_score*100:.2f} %')

Coefficient of determination: 94.45 %


In [64]:
# The intercept
print("Intercept: %.2f" % model.intercept_)
# The coefficients
print("Coefficients: \n", model.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))

Intercept: -13051.71
Coefficients: 
 [3.00164614e+03 1.65137912e-05 1.96341244e-01]
Mean squared error: 918646.97


# Export 

In [66]:
Model_df = pd.DataFrame(X_test,y_pred)
Model_df

Unnamed: 0,Inflation,Working_Pop,Real_GDP_per_Capita
558.173420,,,
8006.517196,,,
-302.781552,,,
11546.250664,,,
2799.151485,,,
...,...,...,...
5873.899440,,,
8541.466682,,,
-959.425261,,,
544.052190,,,
