# Initialize Spark Session and Read in Dataset

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [None]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
spark_version = 'spark-3.1.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
# Load in the Postgress Driver
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

In [None]:
# Import Spark Session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CloudETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

In [None]:
from pyspark import SparkFiles
# Load in employee.csv from S3 into a DataFrame
## CHANGE TO OUR S3 BUCKET:  url = "https://<bucket name>.s3.amazonaws.com/employee.csv"
spark.sparkContext.addFile(url)

## CHANGE CSV NAME:  NYSE_df = spark.read.option('header', 'true').csv(SparkFiles.get("NYSE_Data.csv"), inferSchema=True, sep=',', timestampFormat="mm/dd/yy")


# Work around

In [2]:
# Load the data
NYSE_df = pd.read_csv('Dataset/CSV/Final Output.csv')

# Data Preprocessing

In [3]:
NYSE_df.head(10)
print(NYSE_df.count())

Index                 13947
Date                  13947
Month                 13947
Quarter               13947
Open                  13947
High                  13947
Low                   13947
Close                 13947
Adj Close             13947
Volume                13947
CloseUSD              13947
LOCATION              13947
INDICATOR             13947
SUBJECT               13947
MEASURE               13947
FREQUENCY             13947
Value                 13947
Flag Codes                0
observation_date      13947
GDPC1                 13947
observation_date.1    13947
LFWA64TTUSM647S       13947
dtype: int64


In [4]:
# Drop null and uncesessary columns
NYSE_df = NYSE_df.drop(columns=['Flag Codes','Index','Month','Quarter','Open','High','Low','Close','Adj Close','Volume','LOCATION','INDICATOR','SUBJECT','MEASURE','FREQUENCY','observation_date','observation_date.1'])



In [5]:
# Check Table
print(NYSE_df.count())

Date               13947
CloseUSD           13947
Value              13947
GDPC1              13947
LFWA64TTUSM647S    13947
dtype: int64


In [6]:
# Rename Columns
NYSE_df = NYSE_df.rename(columns={'CloseUSD':'NYA_Close_Price','Value':'Inflation','GDPC1':'Real_GDP','LFWA64TTUSM647S':'Working_Pop'})


In [7]:
NYSE_df.head()

Unnamed: 0,Date,NYA_Close_Price,Inflation,Real_GDP,Working_Pop
0,12/31/65,528.690002,13.41676,4304.73,77178000.0
1,1/3/66,527.210022,13.41676,4409.52,77178000.0
2,1/4/66,527.840027,13.41676,4409.52,77178000.0
3,1/5/66,531.119995,13.41676,4409.52,77178000.0
4,1/6/66,532.070007,13.41676,4409.52,77178000.0


In [8]:
# Divide Inflation by 100
NYSE_df['Inflation'] = NYSE_df['Inflation'] / 100

# GDP * 1B
NYSE_df['Real_GDP'] = NYSE_df['Real_GDP'] * 1000000000

# Calculate Real GDP per Capita from GDP and Population
NYSE_df['Real_GDP_per_Capita'] = NYSE_df['Real_GDP'] / NYSE_df['Working_Pop']

# Drop unecessary columns
NYSE_df = NYSE_df.drop(columns=['Real_GDP'])

# Drop blank rows
NYSE_df = NYSE_df.dropna()

In [9]:
NYSE_df.reset_index(inplace=True, drop=True)
NYSE_df.head()

Unnamed: 0,Date,NYA_Close_Price,Inflation,Working_Pop,Real_GDP_per_Capita
0,12/31/65,528.690002,0.134168,77178000.0,55776.646195
1,1/3/66,527.210022,0.134168,77178000.0,57134.416544
2,1/4/66,527.840027,0.134168,77178000.0,57134.416544
3,1/5/66,531.119995,0.134168,77178000.0,57134.416544
4,1/6/66,532.070007,0.134168,77178000.0,57134.416544


# Split Data into Training and Testing

In [11]:
# Create our features
X = NYSE_df.drop('NYA_Close_Price', axis=1)

# Create our target
y = NYSE_df['NYA_Close_Price']

X.head()

Unnamed: 0,Date,Inflation,Working_Pop,Real_GDP_per_Capita
0,12/31/65,0.134168,77178000.0,55776.646195
1,1/3/66,0.134168,77178000.0,57134.416544
2,1/4/66,0.134168,77178000.0,57134.416544
3,1/5/66,0.134168,77178000.0,57134.416544
4,1/6/66,0.134168,77178000.0,57134.416544


In [12]:
# Check features
X.describe()

Unnamed: 0,Inflation,Working_Pop,Real_GDP_per_Capita
count,13947.0,13947.0,13947.0
mean,0.600693,158333900.0,66646.78636
std,0.302497,40874200.0,13220.003186
min,0.134168,77178000.0,44850.67294
25%,0.323606,142022600.0,57815.050921
50%,0.612192,164584900.0,62365.333919
75%,0.878836,195847000.0,78420.443051
max,1.135762,207311600.0,94220.229858


In [13]:
# Check the balance of our target values
y.value_counts()

569.919983     11
586.630005      9
567.599976      8
555.020020      8
580.710022      7
               ..
6838.450195     1
1434.010010     1
1174.420044     1
960.619995      1
7992.009766     1
Name: NYA_Close_Price, Length: 11590, dtype: int64

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = brfc.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

# Retraining the Model (if don't achieve desired accuracy score)