# Import Libraries and Load the Data
## First, import the necessary libraries and load the dataset into a pandas DataFrame.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline
import datetime

# Load the Data

In [2]:
# Read the raw S&P 500 (SPX) historical price export into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../data/raw/S&P 500 (SPX)HistoricalData_1730734525504.csv',
)

## Preview the dataset

In [3]:
# Preview the first five rows. `df` is already loaded by the cell above, so the file
# is not read again here (a second read_csv would also discard any later changes to
# `df` if this cell were re-run out of order).
df.head()

Unnamed: 0,Date,Close/Last,Open,High,Low
0,11/1/2024,5728.8,5723.22,5772.52,5723.22
1,10/31/2024,5705.45,5775.34,5775.34,5702.86
2,10/30/2024,5813.67,5832.65,5850.94,5811.28
3,10/29/2024,5832.92,5819.68,5847.19,5802.17
4,10/28/2024,5823.52,5833.93,5842.92,5823.08


In [4]:
# Show the last five rows of the raw data.
df.tail(n=5)

Unnamed: 0,Date,Close/Last,Open,High,Low
1253,11/8/2019,3093.08,3081.25,3093.09,3073.58
1254,11/7/2019,3085.18,3087.02,3097.77,3080.23
1255,11/6/2019,3076.78,3075.1,3078.34,3065.89
1256,11/5/2019,3074.62,3080.8,3083.95,3072.15
1257,11/4/2019,3078.27,3078.96,3085.2,3074.87


## Find data information

In [5]:
# Print a concise summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        1258 non-null   object 
 1   Close/Last  1258 non-null   float64
 2   Open        1258 non-null   float64
 3   High        1258 non-null   float64
 4   Low         1258 non-null   float64
dtypes: float64(4), object(1)
memory usage: 49.3+ KB


## Find the number of rows and columns

In [6]:
# (number of rows, number of columns)
df.shape

(1258, 5)

# Data Cleaning and Preprocessing

## Check for null values

In [7]:
# Count missing values per column. A single trailing expression is enough:
# a bare expression in the middle of a cell is evaluated and then discarded,
# so computing it once and then again inside print() did the work twice.
df.isnull().sum()

Date          0
Close/Last    0
Open          0
High          0
Low           0
dtype: int64


### Check duplicates

In [8]:
# Count rows that are exact repeats of an earlier row (all columns equal).
duplicates = df.duplicated(keep='first').sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0


## List all column names

In [9]:
# Column labels of the raw frame.
df.columns

Index(['Date', 'Close/Last', 'Open', 'High', 'Low'], dtype='object')

## Check Data Type

In [10]:
# Per-column dtypes; 'Date' is still a plain object (string) column at this point.
df.dtypes

Date           object
Close/Last    float64
Open          float64
High          float64
Low           float64
dtype: object

## The `describe()` method returns summary statistics for the numeric columns of the DataFrame.

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric price columns.
df.describe()

Unnamed: 0,Close/Last,Open,High,Low
count,1258.0,1258.0,1258.0,1258.0
mean,4169.503299,4168.775676,4193.290922,4142.695541
std,725.561927,725.625542,724.289642,727.032538
min,2237.4,2290.71,2300.73,2191.86
25%,3735.39,3733.255,3764.59,3709.3475
50%,4165.555,4164.18,4186.15,4134.955
75%,4541.9025,4546.78,4563.1325,4524.3
max,5864.67,5875.62,5878.46,5846.11


# Filter the dataframe and Display the filtered data

## Convert the date column to datetime format (assuming your date column is named 'Date')

In [12]:
# Parse the 'Date' strings into datetime64 values so they can be compared and
# filtered chronologically. The raw file stores dates as month/day/year
# (e.g. 10/31/2024), so the format is stated explicitly rather than inferred:
# day and month cannot be swapped by a guess, and a value that does not match
# the format raises instead of being mis-parsed.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')

## Define the date range as strings

In [13]:
# Inclusive analysis window, as ISO-8601 (YYYY-MM-DD) date strings.
start_date, end_date = '2022-01-03', '2023-03-23'

## Filter the dataframe

In [14]:
# Keep only the rows whose 'Date' lies inside [start_date, end_date], both ends included.
filtered_df = df.loc[df['Date'].between(start_date, end_date)]

## Display the filtered data

In [15]:
# Preview the first five rows of the date-filtered data. `filtered_df` is already
# built by the cell above, so the filter is not repeated here.
filtered_df.head()


Unnamed: 0,Date,Close/Last,Open,High,Low
406,2023-03-23,3948.72,3959.21,4007.66,3919.05
407,2023-03-22,3936.97,4002.04,4039.49,3936.17
408,2023-03-21,4002.87,3975.89,4009.08,3971.19
409,2023-03-20,3951.57,3917.47,3956.62,3916.89
410,2023-03-17,3916.64,3958.69,3958.91,3901.27


In [16]:
# Show the last five rows of the date-filtered data.
filtered_df.tail(n=5)

Unnamed: 0,Date,Close/Last,Open,High,Low
708,2022-01-07,4677.03,4697.66,4707.95,4662.74
709,2022-01-06,4696.05,4693.39,4725.01,4671.26
710,2022-01-05,4700.58,4787.99,4797.7,4699.44
711,2022-01-04,4793.54,4804.51,4818.62,4774.27
712,2022-01-03,4796.56,4778.14,4796.64,4758.17


### Check skewness 

In [17]:
# Skewness of each numeric price column over the filtered window.
# numeric_only=True leaves the datetime 'Date' column out of the calculation.
filtered_df.skew(numeric_only=True)

Close/Last    0.559388
Open          0.579080
High          0.575468
Low           0.567255
dtype: float64

### The `describe()` method returns summary statistics of the DataFrame after filtering the data

In [18]:
# Summary statistics for the filtered window; 'Date' is now datetime, so it is summarised too.
filtered_df.describe()

Unnamed: 0,Date,Close/Last,Open,High,Low
count,307,307.0,307.0,307.0,307.0
mean,2022-08-12 12:49:15.048859904,4080.20671,4081.25241,4114.720228,4043.739674
min,2022-01-03 00:00:00,3577.03,3520.37,3608.34,3491.58
25%,2022-04-23 12:00:00,3899.115,3886.025,3919.165,3853.035
50%,2022-08-12 00:00:00,4012.32,4015.54,4048.29,3976.9
75%,2022-11-30 12:00:00,4272.91,4267.84,4296.875,4205.605
max,2023-03-23 00:00:00,4796.56,4804.51,4818.62,4774.27
std,,269.365105,271.197527,268.940992,268.653499


# Remove columns

## Select only the 'Date' and 'Close/Last' columns

In [19]:
# Keep every row, but only the date and the closing-price columns.
selected_df = filtered_df.loc[:, ['Date', 'Close/Last']]


## Display the first few rows of the selected data

In [20]:
# Preview the first five rows of the two-column selection. `selected_df` is already
# built by the cell above, so the selection is not repeated here.
selected_df.head()

Unnamed: 0,Date,Close/Last
406,2023-03-23,3948.72
407,2023-03-22,3936.97
408,2023-03-21,4002.87
409,2023-03-20,3951.57
410,2023-03-17,3916.64


In [21]:
# Show the last five rows of the two-column selection.
selected_df.tail(n=5)

Unnamed: 0,Date,Close/Last
708,2022-01-07,4677.03
709,2022-01-06,4696.05
710,2022-01-05,4700.58
711,2022-01-04,4793.54
712,2022-01-03,4796.56


In [22]:
# Confirm the dtypes of the columns that will be written out.
selected_df.dtypes

Date          datetime64[ns]
Close/Last           float64
dtype: object

Create the clean S&P500(SPX) CSV file

In [23]:
# Save the cleaned Date / Close/Last table.
# index=False: the row index at this point is only the leftover row position from
# the raw file (406..712). Writing it would add an unnamed first column that shows
# up as 'Unnamed: 0' when the CSV is read back.
selected_df.to_csv('../data/clean/02_S&P500(SPX)_clean_data.csv', index=False)

The clean S&P500(SPX) data is saved as a CSV file in the `../data/clean/` folder.