In [4]:
# Import required libraries

import pandas as pd
import numpy as np

Make sure there is an outputs folder in the data_cleaning Folder or else this cell will not execute properly

In [11]:
# Load data
fundamentals = pd.read_csv('../data/raw/fundamentals.csv')
prices = pd.read_csv('../data/raw/prices-split-adjusted.csv')
securities = pd.read_csv('../data/raw/securities.csv')

# Check missing values in the datasets and output that into a csv file
df = pd.DataFrame(fundamentals.isnull().sum())
df.to_csv('outputs/fundamentals_column_error_count.csv')
df = pd.DataFrame(prices.isnull().sum())
df.to_csv('outputs/prices_column_error_count.csv')
df = pd.DataFrame(securities.isnull().sum())
df.to_csv('outputs/securities_column_error_count.csv')


In [12]:
# Drop rows which have missing values in the identified columns

columns_to_check = ['Cash Ratio', 'Current Ratio', 'Quick Ratio', 'For Year', 'Earnings Per Share', 'Estimated Shares Outstanding']
fundamentals = fundamentals.dropna(subset=columns_to_check)

# Drop useless columns
securities.drop(columns=['Date first added'], inplace=True)

In [13]:
# Checking the date formats in the date columns
prices['date'] = pd.to_datetime(prices['date'])
fundamentals['Period Ending'] = pd.to_datetime(fundamentals['Period Ending'])

# Drop duplicates
fundamentals.drop_duplicates(inplace=True)
prices.drop_duplicates(inplace=True)
securities.drop_duplicates(inplace=True)

In [14]:
# Keep rows for ticker symbols which are present in both fundamentals.csv an prices.csv

ticker_symbols_prices = prices['symbol'].unique()
fundamentals = fundamentals[fundamentals['Ticker Symbol'].isin(ticker_symbols_prices)]
ticker_symbols_fundamentals = fundamentals['Ticker Symbol'].unique()
prices = prices[prices['symbol'].isin(ticker_symbols_fundamentals)]
securities = securities[securities['Ticker symbol'].isin(ticker_symbols_fundamentals)]

# Save the final list of ticker symbols
df = pd.DataFrame(ticker_symbols_fundamentals)
df.to_csv('outputs/ticker_symbols.csv')

In [25]:
prices.sort_values(by=['symbol', 'date'], inplace=True)
fundamentals.sort_values(by=['Ticker Symbol', 'Period Ending'], inplace=True)
securities.sort_values(by='Ticker symbol', inplace=True)

In [28]:
# Save the cleaned datasets
fundamentals.to_csv('./data/clean/fundamentals.csv')
prices.to_csv('../data/clean/prices.csv')
securities.to_csv('../data/clean/securities.csv')

OSError: Cannot save file into a non-existent directory: '../data/clean'

In [27]:
# Merge GICS Sector from securities.csv into fundamentals.csv
securities = securities[['Ticker symbol', 'GICS Sector']]
fundamentals = fundamentals.merge(securities, left_on='Ticker Symbol', right_on='Ticker symbol', how='left')
fundamentals = fundamentals.drop(columns=['Ticker symbol'])
fundamentals = fundamentals.rename(columns={'GICS Sector': 'Sector'})

# Create a new column for the next quarter's average stock price
fundamentals['Next Qtr. Avg. Price'] = None

# Calculate the next quarter's average stock price
for index, row in fundamentals.iterrows():
    symbol = row['Ticker Symbol']
    period_ending = row['Period Ending']
    
    # Define the next quarter period
    start_date = period_ending
    end_date = start_date + pd.DateOffset(months=3)
    
    # Get the stock prices for the next quarter
    next_quarter_prices = prices[(prices['symbol'] == symbol) & (prices['date'] > start_date) & (prices['date'] <= end_date)]['close']
    
    # Calculate the rolling average stock price for the next quarter
    if len(next_quarter_prices) > 0: 
        rolling_avg_price = next_quarter_prices.rolling(window=3, min_periods=1).mean().mean()
    else: 
        rolling_avg_price = None
    
    # Update the DataFrame
    fundamentals.at[index, 'Next Qtr. Avg. Price'] = rolling_avg_price

# Convert columns to numeric, except 'Ticker Symbol', 'Period Ending', and 'Sector'
cols_to_convert = [col for col in fundamentals.columns if col not in ['Ticker Symbol', 'Period Ending', 'Sector']]
fundamentals[cols_to_convert] = fundamentals[cols_to_convert].apply(pd.to_numeric, errors='coerce')

# Drop redundant first column
fundamentals = fundamentals.drop(fundamentals.columns[0], axis=1)

# Save the final merged dataset
fundamentals.to_csv('../data/merged_data.csv', index=False)


KeyboardInterrupt: 