# Stock EDA and Data Clean Up

### Libraries

In [1]:
from functools import reduce

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,\
FloatType, TimestampType
from pyspark.sql.functions import isnan, when, count, col

### Spark Session

In [2]:
# Location of the raw S&P 500 CSV in Google Cloud Storage.
data_file = "gs://stock-project-sp500/Data/S&P_500_Full_Stock_Data.csv"

# Reuse an existing session if one is already running, otherwise start one
# named "stock"; keep a handle on its SparkContext as well.
spark = (
    SparkSession.builder
    .appName("stock")
    .getOrCreate()
)
sc = spark.sparkContext

### Data Schema

In [3]:
# Explicit schema for the CSV so Spark does not have to infer column types.
# Each .add(name, type, nullable) call appends one column, in file order.
#
# NOTE(review): IntegerType is a 32-bit signed integer, so a Volume above
# 2,147,483,647 cannot be represented — confirm no ticker/day exceeds that,
# otherwise LongType would be the safer choice.
# NOTE(review): nullable=False is only a declaration here; confirm whether the
# CSV reader actually enforces it before relying on these columns being
# null-free.
stock_schema = (
    StructType()
    .add('Symbol', StringType(), False)
    .add('Date', TimestampType(), False)
    .add('Open', FloatType(), True)
    .add('High', FloatType(), True)
    .add('Low', FloatType(), True)
    .add('Close', FloatType(), True)
    .add('Adj Close', FloatType(), True)
    .add('Volume', IntegerType(), True)
    .add('Description', StringType(), False)
    .add('Category2', StringType(), False)
    .add('Category3', StringType(), False)
    .add('GICS Sector', StringType(), False)
)

### Reading the Data

In [4]:
# Load the CSV using the explicit schema; the first line of the file is a
# header row and is not treated as data.
stock_df = spark.read.csv(path=data_file, schema=stock_schema, header=True)

# Mark the frame for caching, since the cells below scan it several times.
stock_df = stock_df.cache()

### Null and Missing Values

In [5]:
# Columns inspected for missing values: exactly the ones the schema declares
# as nullable (the price columns and Volume).
# NOTE(review): the remaining columns are declared nullable=False, but that
# declaration may not be enforced when reading CSV — verify they are really
# null-free before relying on this list being complete.
null_columns = [
    'Open',
    'High',
    'Low',
    'Close',
    'Adj Close',
    'Volume',
]

In [6]:
# Count, per column, how many rows hold a missing value (NULL or NaN).
# when(...) without otherwise() yields NULL for non-matching rows, and count()
# skips NULLs, so only the rows matching the condition are counted.
null_count_exprs = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in null_columns
]
stock_df.select(null_count_exprs).show()



+----+----+---+-----+---------+------+
|Open|High|Low|Close|Adj Close|Volume|
+----+----+---+-----+---------+------+
|   7|   7|  7|    7|        7|  9569|
+----+----+---+-----+---------+------+



                                                                                

In [7]:
# Per-symbol breakdown of missing values in each inspected column.
# A value is treated as missing when it is NULL *or* NaN — the same rule the
# overall per-column count in this notebook uses — so the two tables cannot
# disagree if the data ever contains NaN.
agg_expression = [
    F.sum(when(isnan(stock_df[x]) | stock_df[x].isNull(), 1).otherwise(0)).alias(x)
    for x in null_columns
]
null_values_by_stock = stock_df.groupby("Symbol").agg(*agg_expression)

# Python's built-in sum (not F.sum) adds the per-column count Columns together,
# giving one row-wise total per symbol.
null_values_by_stock = null_values_by_stock.withColumn(
    'Missing Values Sum', sum([F.col(c) for c in null_columns])
)

# Show only the symbols that have at least one missing value.
null_values_by_stock.filter(null_values_by_stock["Missing Values Sum"] > 0).show()

                                                                                

+------+----+----+---+-----+---------+------+------------------+
|Symbol|Open|High|Low|Close|Adj Close|Volume|Missing Values Sum|
+------+----+----+---+-----+---------+------+------------------+
|    EA|   1|   1|  1|    1|        1|  1367|              1372|
|    CI|   1|   1|  1|    1|        1|  1367|              1372|
|   PVH|   1|   1|  1|    1|        1|  1367|              1372|
|  NLOK|   1|   1|  1|    1|        1|  1367|              1372|
|  BF-B|   1|   1|  1|    1|        1|  1367|              1372|
|   HPQ|   1|   1|  1|    1|        1|  1367|              1372|
|   AEE|   1|   1|  1|    1|        1|  1367|              1372|
+------+----+----+---+-----+---------+------+------------------+



In [19]:
# Rows in which at least one inspected column is missing (NULL or NaN).
# The predicate is built from null_columns instead of re-typing the column
# names, so this filter stays in sync with the counts computed above.
missing_value_conditions = [col(c).isNull() | isnan(c) for c in null_columns]
stock_df_missing_values = stock_df.filter(
    reduce(lambda left, right: left | right, missing_value_conditions)
)

In [20]:
# Report how many rows have at least one missing value.
missing_row_count = stock_df_missing_values.count()
print("There are {} rows with missing values.".format(missing_row_count))

There are 9569 rows with missing values.


### Dropping Rows with Null Values

In [None]:
# Drop every row that has a missing value in any column. No subset is passed,
# so all schema columns are considered, not just null_columns.
stock_df = stock_df.dropna(how="any")

# Re-run the per-column missing-value count as a sanity check on the result.
remaining_missing_exprs = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in null_columns
]
stock_df.select(remaining_missing_exprs).show()

### Features

#### Date Features: Day of Week, Month, Year

#### Lag 1, Lag 2, Lag 3, Lag 4, Lag 5