In [None]:
import math
import statistics as stat
from pathlib import Path

import numpy as np
import pandas as pd

**Data Preprocessing And Cleaning**

In [12]:
# CLEAN DATA
# Loads the raw GOOGL and QQQ price CSVs, gives both the same lowercase column
# names, derives two extra GOOGL columns, and writes the cleaned frames to
# data/cleaned_data/. Leaves `googl_df` and `qqq_df` (the cleaned frames)
# defined for later cells; the untouched inputs stay in `googl_raw` / `qqq_raw`
# so this cell can be re-run safely.

# Locations are built with pathlib so the cell behaves the same on every OS.
# (Backslash-separated string literals are only valid paths on Windows and
# contain invalid escape sequences such as "\c".)
DATA_DIR = Path("./data")
CLEANED_DIR = DATA_DIR / "cleaned_data"

QQQ_START_DATE = "2004-08-19"  # first date kept in the QQQ data
ROUND_DECIMALS = 4             # precision applied to every numeric column

# Read the CSV files
googl_raw = pd.read_csv(DATA_DIR / "GOOGL.csv")
qqq_raw = pd.read_csv(DATA_DIR / "QQQ_raw.csv")

# QQQ: trim to start on QQQ_START_DATE, remove the adjusted close column, round.
# NOTE(review): 'date' is compared as a string, which orders correctly only if
# the column holds ISO-formatted (YYYY-MM-DD) dates -- confirm against the file.
qqq_df = (
    qqq_raw.loc[qqq_raw["date"] >= QQQ_START_DATE]
    .drop(columns=["adjusted_close"])
    .round(ROUND_DECIMALS)
)

# GOOGL: remove the adjusted close column, rename columns to match the QQQ
# file, then append the derived columns and round.
googl_df = (
    googl_raw
    .drop(columns=["Adj Close"])
    .rename(columns={
        "Date": "date",
        "Open": "open",
        "High": "high",
        "Low": "low",
        "Close": "close",
        "Volume": "volume",
    })
    .assign(
        # Close-to-close change, expressed in percent (first row is NaN).
        change_percent=lambda frame: frame["close"].pct_change() * 100,
        # 20-row rolling mean of volume (NaN until 20 rows are available).
        avg_vol_20d=lambda frame: frame["volume"].rolling(window=20).mean(),
    )
    .round(ROUND_DECIMALS)
)

# Save the cleaned dataframes to new CSV files, creating the output directory
# first so the cell also works on a fresh checkout.
CLEANED_DIR.mkdir(parents=True, exist_ok=True)
googl_df.to_csv(CLEANED_DIR / "GOOGL_cleaned.csv", index=False)
qqq_df.to_csv(CLEANED_DIR / "QQQ_cleaned.csv", index=False)