# File For Creating 'Y' Variables


The model will use x variables created from textual analysis to either

1) predict using a neurral network whether a stock should be bought, shorted, or held based on an earnigns cal
2) Cluster earnings calls based on the x variables. The Y variable will then be used to see if the groups show a pattern of grouping times when stocks should be bought, shorted, or held


There are multiple Y variables (and so many models we could try). A recommendation could be given based on the day of the call, 1 day after, 2 days after, 5 days after, 7 days after, or 10 days after. Meaning if a recommendation of 'buy' is given for the day of the call, then you would have made money if you bought right after the call and then cahsed out at the end of the day. Or if 'short' was given for the 5 day then you would have shorted right after the call and then cashed out before close on the 5th day after. Models should all use the same column for their y (dont mix and match 'Close_Classifier' with other 'XXXX_Classifier' and vice versa

In [1]:
import yfinance as yf
import pandas as pd
from datetime import timedelta
import os
import numpy as np

## Pull Stock Prices for Stocks in the Earnings Call Folder

### Lindsay's Code

In [29]:
import os
import yfinance as yf
import pandas as pd
from pandas.tseries.offsets import BDay  # For business day adjustments

# Define folder names and corresponding ticker symbols as lists
folder_names = ['NVIDIA', 'AAPL', 'AMZN']  # You can change this list before running the code
ticker_symbols = ['NVDA', 'AAPL', 'AMZN']  # These should match the respective folders

# Initialize a dictionary to store the results for each stock and SMH
stock_results = {}

# Path to the folder containing all the ticker folders (Earnings_Calls)
folder_path = os.path.abspath(os.path.join('./Earnings_Calls'))

# Step 1: Collect stock data and dates
all_unique_dates = set()  # A set to store all unique dates from all stocks

# Loop through each specified folder (which is a stock ticker) in the Earnings_Calls folder
for folder_name, ticker_symbol in zip(folder_names, ticker_symbols):
    ticker_folder_path = os.path.join(folder_path, folder_name)
    
    # Ensure we are working with directories (tickers)
    if os.path.isdir(ticker_folder_path):
        # Initialize a set to store unique dates for this ticker
        ticker_unique_dates = set()
        
        # Loop through each CSV file in the current ticker's folder
        for filename in os.listdir(ticker_folder_path):
            if filename.endswith('.csv'):
                file_path = os.path.join(ticker_folder_path, filename)  # Construct full file path
                
                # Read the CSV file
                df = pd.read_csv(file_path)
                
                # Convert 'mostimportantdateutc' column to datetime format
                df['mostimportantdateutc'] = pd.to_datetime(df['mostimportantdateutc'], errors='coerce')
                
                # Extract unique dates from 'mostimportantdateutc' column
                unique_dates = df['mostimportantdateutc'].dt.date.unique()
                
                # Add the unique dates to the set for this ticker
                ticker_unique_dates.update(unique_dates)
        
        # Convert the set to a sorted list of unique dates for the stock
        unique_dates_list = sorted(ticker_unique_dates)
        all_unique_dates.update(unique_dates_list)  # Collect all unique dates for later SMH pull
        
        # Download stock data for the current ticker
        if len(unique_dates_list) > 0:
            min_date = pd.to_datetime(min(unique_dates_list)) - BDay(3)  # Adjust start date by 3 business days
            max_date = pd.to_datetime(max(unique_dates_list))
            df_stock = yf.download(ticker_symbol, start=min_date, end=max_date, progress=False)
            df_stock = df_stock.reset_index()  # yfinance returns 'Date' as an index, so reset it to a column
            df_stock['Date'] = pd.to_datetime(df_stock['Date'])
            df_stock = df_stock.sort_values(by='Date').reset_index(drop=True)
            
            # Initialize a list to store results for this ticker
            stock_results[folder_name] = []
            
            # Loop through each date in the unique dates list
            for date in unique_dates_list:
                date = pd.to_datetime(date)
                
                # Find the row corresponding to 'date' in stock data
                if date in df_stock['Date'].values:
                    current_stock_row = df_stock[df_stock['Date'] == date]
                    current_index_stock = current_stock_row.index[0]
                    
                    # Get the stock's open and close prices for the current date
                    open_price_stock = current_stock_row['Open'].values[0]
                    close_price_stock = current_stock_row['Close'].values[0]
                    
                    # Get the stock's closing price from the business day before the earnings call using BDay(-1)
                    previous_business_day = date - BDay(1)
                    close_price_stock_day_before = df_stock.loc[df_stock['Date'] == previous_business_day, 'Close'].values[0] if previous_business_day in df_stock['Date'].values else None
                    
                    # Get stock's subsequent close prices for 1, 2, 5, 7, 10 business days using BDay for future dates
                    future_business_day_1 = date + BDay(1)
                    future_business_day_2 = date + BDay(2)
                    future_business_day_5 = date + BDay(5)
                    future_business_day_7 = date + BDay(7)
                    future_business_day_10 = date + BDay(10)
                    
                    close_price_stock_1d = df_stock.loc[df_stock['Date'] == future_business_day_1, 'Close'].values[0] if future_business_day_1 in df_stock['Date'].values else None
                    close_price_stock_2d = df_stock.loc[df_stock['Date'] == future_business_day_2, 'Close'].values[0] if future_business_day_2 in df_stock['Date'].values else None
                    close_price_stock_5d = df_stock.loc[df_stock['Date'] == future_business_day_5, 'Close'].values[0] if future_business_day_5 in df_stock['Date'].values else None
                    close_price_stock_7d = df_stock.loc[df_stock['Date'] == future_business_day_7, 'Close'].values[0] if future_business_day_7 in df_stock['Date'].values else None
                    close_price_stock_10d = df_stock.loc[df_stock['Date'] == future_business_day_10, 'Close'].values[0] if future_business_day_10 in df_stock['Date'].values else None
                    
                    # Store the result for this date (only stock data)
                    stock_results[folder_name].append({
                        'Date': date,
                        'Close_Day_Before': close_price_stock_day_before,  # Add closing price from the business day before
                        'Stock_Open': open_price_stock,
                        'Stock_Close': close_price_stock,
                        'Stock_Close_1d': close_price_stock_1d,
                        'Stock_Close_2d': close_price_stock_2d,
                        'Stock_Close_5d': close_price_stock_5d,
                        'Stock_Close_7d': close_price_stock_7d,
                        'Stock_Close_10d': close_price_stock_10d
                    })

## Pull Prices for SMH (ETF Which trakcs SemiConducotrs) For all Dates of Earnings Calls

In [30]:
# Step 2: Pull SMH data for the same dates as all stocks combined
all_unique_dates_list = sorted(all_unique_dates)

# Download SMH data
df_smh = yf.download('SMH', start=min(all_unique_dates_list), end=max(all_unique_dates_list), progress=False)
df_smh = df_smh.reset_index()
df_smh['Date'] = pd.to_datetime(df_smh['Date'])
df_smh = df_smh.sort_values(by='Date').reset_index(drop=True)

# Step 3: Store SMH data for each date
for date in all_unique_dates_list:
    date = pd.to_datetime(date)
    
    if date in df_smh['Date'].values:
        current_smh_row = df_smh[df_smh['Date'] == date]
        current_index_smh = current_smh_row.index[0]

        #get the close price for the day before
        previous_business_day = date - BDay(1)
        close_price_stock_day_before = df_smh.loc[df_smh['Date'] == previous_business_day, 'Close'].values[0] if previous_business_day in df_smh['Date'].values else None
        
        # Get SMH's open and close prices for the current date
        open_price_smh = current_smh_row['Open'].values[0]
        close_price_smh = current_smh_row['Close'].values[0]
        
        # Get SMH's subsequent close prices for 1, 2, 5, 7, 10 days
        close_price_smh_1d = df_smh.loc[current_index_smh + 1, 'Close'] if current_index_smh + 1 < len(df_smh) else None
        close_price_smh_2d = df_smh.loc[current_index_smh + 2, 'Close'] if current_index_smh + 2 < len(df_smh) else None
        close_price_smh_5d = df_smh.loc[current_index_smh + 5, 'Close'] if current_index_smh + 5 < len(df_smh) else None
        close_price_smh_7d = df_smh.loc[current_index_smh + 7, 'Close'] if current_index_smh + 7 < len(df_smh) else None
        close_price_smh_10d = df_smh.loc[current_index_smh + 10, 'Close'] if current_index_smh + 10 < len(df_smh) else None
        
        # Store the result for SMH for this date
        smh_results.append({
            'Date': date,
            'SMH_Close_Day_Before': close_price_stock_day_before,
            'SMH_Open': open_price_smh,
            'SMH_Close': close_price_smh,
            'SMH_Close_1d': close_price_smh_1d,
            'SMH_Close_2d': close_price_smh_2d,
            'SMH_Close_5d': close_price_smh_5d,
            'SMH_Close_7d': close_price_smh_7d,
            'SMH_Close_10d': close_price_smh_10d
        })

## Create 1 DF of All Stocks and SMH

In [31]:
# Step 4: Flatten stock results and convert them to DataFrame
all_data = []
for stock, stock_data in stock_results.items():
    for entry in stock_data:
        entry['Stock'] = stock  # Add the stock ticker to each entry
        all_data.append(entry)

df_stock_results = pd.DataFrame(all_data)

# Reorder stock data columns if needed
df_stock_results = df_stock_results[['Date', 'Stock', 'Close_Day_Before', 'Stock_Open', 'Stock_Close', 'Stock_Close_1d', 'Stock_Close_2d', 'Stock_Close_5d', 'Stock_Close_7d', 'Stock_Close_10d']]

# Display results
df_stock_results.head()

Unnamed: 0,Date,Stock,Close_Day_Before,Stock_Open,Stock_Close,Stock_Close_1d,Stock_Close_2d,Stock_Close_5d,Stock_Close_7d,Stock_Close_10d
0,2010-02-17,NVIDIA,0.44175,0.44725,0.446,0.41675,0.4145,0.41375,0.405,0.41575
1,2010-05-13,NVIDIA,0.367,0.3655,0.36625,0.324,0.32475,0.3115,0.309,0.3375
2,2010-08-12,NVIDIA,0.222,0.218,0.224,0.23475,0.22875,0.247,0.2455,0.245
3,2010-11-11,NVIDIA,0.3185,0.311,0.31525,0.3315,0.3275,0.333,0.344,
4,2011-02-16,NVIDIA,0.56375,0.57125,0.5845,0.642,0.64075,0.55275,0.578,0.51875


In [32]:
# Step 4: Flatten stock results and convert them to DataFrame
all_data = []
for stock, stock_data in stock_results.items():
    for entry in stock_data:
        entry['Stock'] = stock  # Add the stock ticker to each entry
        all_data.append(entry)

df_stock_results = pd.DataFrame(all_data)

# Step 5: Convert SMH results to DataFrame
df_smh_results = pd.DataFrame(smh_results)

# Reorder stock data columns if needed
df_stock_results = df_stock_results[['Date', 'Stock', 'Close_Day_Before', 'Stock_Open', 'Stock_Close', 'Stock_Close_1d', 'Stock_Close_2d', 'Stock_Close_5d', 'Stock_Close_7d', 'Stock_Close_10d']]

# Display results
df_stock_results.head()

Unnamed: 0,Date,Stock,Close_Day_Before,Stock_Open,Stock_Close,Stock_Close_1d,Stock_Close_2d,Stock_Close_5d,Stock_Close_7d,Stock_Close_10d
0,2010-02-17,NVIDIA,0.44175,0.44725,0.446,0.41675,0.4145,0.41375,0.405,0.41575
1,2010-05-13,NVIDIA,0.367,0.3655,0.36625,0.324,0.32475,0.3115,0.309,0.3375
2,2010-08-12,NVIDIA,0.222,0.218,0.224,0.23475,0.22875,0.247,0.2455,0.245
3,2010-11-11,NVIDIA,0.3185,0.311,0.31525,0.3315,0.3275,0.333,0.344,
4,2011-02-16,NVIDIA,0.56375,0.57125,0.5845,0.642,0.64075,0.55275,0.578,0.51875


In [26]:
# Step 1: Add a 'Stock' column to df_smh_results with the value 'SMH'
df_smh_results['Stock'] = 'SMH'

# Step 2: Rename columns in df_smh_results to match df_stock_results
df_smh_results = df_smh_results.rename(columns={
    'SMH_Close_Day_Before': 'Close_Day_Before',
    'SMH_Open': 'Stock_Open',
    'SMH_Close': 'Stock_Close',
    'SMH_Close_1d': 'Stock_Close_1d',
    'SMH_Close_2d': 'Stock_Close_2d',
    'SMH_Close_5d': 'Stock_Close_5d',
    'SMH_Close_7d': 'Stock_Close_7d',
    'SMH_Close_10d': 'Stock_Close_10d'
})

# Step 3: Concatenate df_stock_results and df_smh_results
df_combined = pd.concat([df_stock_results, df_smh_results], ignore_index=True)

# Display the concatenated DataFrame
df_combined


Unnamed: 0,Date,Stock,Close_Day_Before,Stock_Open,Stock_Close,Stock_Close_1d,Stock_Close_2d,Stock_Close_5d,Stock_Close_7d,Stock_Close_10d
0,2010-02-17,NVIDIA,0.441750,0.447250,0.446000,0.416750,0.414500,0.413750,0.405000,0.415750
1,2010-05-13,NVIDIA,0.367000,0.365500,0.366250,0.324000,0.324750,0.311500,0.309000,0.337500
2,2010-08-12,NVIDIA,0.222000,0.218000,0.224000,0.234750,0.228750,0.247000,0.245500,0.245000
3,2010-11-11,NVIDIA,0.318500,0.311000,0.315250,0.331500,0.327500,0.333000,0.344000,
4,2011-02-16,NVIDIA,0.563750,0.571250,0.584500,0.642000,0.640750,0.552750,0.578000,0.518750
...,...,...,...,...,...,...,...,...,...,...
330,2024-02-01,SMH,185.869995,186.729996,187.570007,191.220001,194.949997,199.429993,203.059998,202.619995
331,2024-02-21,SMH,197.779999,195.639999,196.509995,209.929993,208.619995,207.410004,220.539993,226.350006
332,2024-04-30,SMH,218.380005,217.339996,214.089996,207.850006,212.029999,220.729996,219.979996,227.080002
333,2024-05-02,SMH,207.850006,211.369995,212.029999,217.729996,222.669998,219.979996,223.259995,232.199997


## Calculate the % Change Based on Day of Call

In [33]:
# Calculate percentage change for each close price column based on 'Stock_Open'
df_combined['%Change_Close_1d'] = ((df_combined['Stock_Close_1d'] - df_combined['Close_Day_Before']) / df_combined['Close_Day_Before']) * 100
df_combined['%Change_Close_2d'] = ((df_combined['Stock_Close_2d'] - df_combined['Close_Day_Before']) / df_combined['Close_Day_Before']) * 100
df_combined['%Change_Close_5d'] = ((df_combined['Stock_Close_5d'] - df_combined['Close_Day_Before']) / df_combined['Close_Day_Before']) * 100
df_combined['%Change_Close_7d'] = ((df_combined['Stock_Close_7d'] - df_combined['Close_Day_Before']) / df_combined['Close_Day_Before']) * 100
df_combined['%Change_Close_10d'] = ((df_combined['Stock_Close_10d'] - df_combined['Close_Day_Before']) / df_combined['Close_Day_Before']) * 100

# Display the updated DataFrame
df_combined.head(30)


Unnamed: 0,Date,Stock,Close_Day_Before,Stock_Open,Stock_Close,Stock_Close_1d,Stock_Close_2d,Stock_Close_5d,Stock_Close_7d,Stock_Close_10d,%Change_Close_1d,%Change_Close_2d,%Change_Close_5d,%Change_Close_7d,%Change_Close_10d
0,2010-02-17,NVIDIA,0.44175,0.44725,0.446,0.41675,0.4145,0.41375,0.405,0.41575,-5.659304,-6.168646,-6.338426,-8.319183,-5.885681
1,2010-05-13,NVIDIA,0.367,0.3655,0.36625,0.324,0.32475,0.3115,0.309,0.3375,-11.716624,-11.512263,-15.122615,-15.803822,-8.038149
2,2010-08-12,NVIDIA,0.222,0.218,0.224,0.23475,0.22875,0.247,0.2455,0.245,5.743243,3.040542,11.261257,10.585583,10.360361
3,2010-11-11,NVIDIA,0.3185,0.311,0.31525,0.3315,0.3275,0.333,0.344,,4.081627,2.825737,4.552588,8.006279,
4,2011-02-16,NVIDIA,0.56375,0.57125,0.5845,0.642,0.64075,0.55275,0.578,0.51875,13.880264,13.658529,-1.951226,2.527713,-7.982264
5,2011-05-12,NVIDIA,0.49675,0.495,0.5125,0.4565,0.4425,0.4445,0.45175,0.4705,-8.102668,-10.920987,-10.518369,-9.05888,-5.284349
6,2011-08-11,NVIDIA,0.3085,0.3085,0.33525,0.322,0.33425,0.2985,0.2985,0.312,4.376015,8.346843,-3.241488,-3.241488,1.134527
7,2011-11-10,NVIDIA,0.358,0.36675,0.36175,0.3745,0.36725,0.3515,0.36575,,4.608937,2.583795,-1.815644,2.164806,
8,2012-02-15,NVIDIA,0.406,0.40975,0.40425,0.41125,0.39625,0.3955,0.39475,0.37875,1.293105,-2.401473,-2.586203,-2.770933,-6.711821
9,2012-05-11,NVIDIA,0.3105,0.33625,0.33025,0.32825,0.325,0.302,0.3035,0.31,5.716585,4.669885,-2.737523,-2.254428,-0.161029


## Create Recommendations Based on Stock Movement Realtive to the Market

In [34]:
import numpy as np

# First, separate SMH data from the combined DataFrame
df_smh = df_combined[df_combined['Stock'] == 'SMH'].set_index('Date')

# Filter non-SMH stocks and explicitly create a copy
df_stocks = df_combined[df_combined['Stock'] != 'SMH'].copy()

# Function to create classifiers based on the logic
def classify_stock(stock_change, smh_change):
    if (stock_change - smh_change >= 0.5) and (stock_change > 0):
        return 'buy'
    elif (stock_change - smh_change <= -0.5) and (stock_change < 0):
        return 'short'
    else:
        return 'hold'

# Apply the classification logic for each percentage change column
for period in ['Close_1d', 'Close_2d', 'Close_5d', 'Close_7d', 'Close_10d']:
    # Column name for SMH and stock % change
    stock_column = f'%Change_{period}'
    smh_column = f'%Change_{period}'
    
    # Create a classifier column based on the logic
    df_stocks[f'{period}_Classifier'] = df_stocks.apply(
        lambda row: classify_stock(row[stock_column], df_smh.loc[row['Date'], smh_column]),
        axis=1
    )

# Display the updated DataFrame
df_stocks


Unnamed: 0,Date,Stock,Close_Day_Before,Stock_Open,Stock_Close,Stock_Close_1d,Stock_Close_2d,Stock_Close_5d,Stock_Close_7d,Stock_Close_10d,%Change_Close_1d,%Change_Close_2d,%Change_Close_5d,%Change_Close_7d,%Change_Close_10d,Close_1d_Classifier,Close_2d_Classifier,Close_5d_Classifier,Close_7d_Classifier,Close_10d_Classifier
0,2010-02-17,NVIDIA,0.441750,0.447250,0.446000,0.416750,0.414500,0.413750,0.405000,0.415750,-5.659304,-6.168646,-6.338426,-8.319183,-5.885681,short,short,short,short,short
1,2010-05-13,NVIDIA,0.367000,0.365500,0.366250,0.324000,0.324750,0.311500,0.309000,0.337500,-11.716624,-11.512263,-15.122615,-15.803822,-8.038149,short,short,short,short,short
2,2010-08-12,NVIDIA,0.222000,0.218000,0.224000,0.234750,0.228750,0.247000,0.245500,0.245000,5.743243,3.040542,11.261257,10.585583,10.360361,buy,buy,buy,buy,buy
3,2010-11-11,NVIDIA,0.318500,0.311000,0.315250,0.331500,0.327500,0.333000,0.344000,,4.081627,2.825737,4.552588,8.006279,,buy,buy,buy,buy,hold
4,2011-02-16,NVIDIA,0.563750,0.571250,0.584500,0.642000,0.640750,0.552750,0.578000,0.518750,13.880264,13.658529,-1.951226,2.527713,-7.982264,buy,buy,hold,buy,short
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,2023-04-27,AMZN,104.980003,108.160004,109.820000,105.449997,102.050003,104.000000,105.830002,112.180000,0.447698,-2.791008,-0.933514,0.809677,6.858446,hold,short,short,hold,buy
169,2023-08-03,AMZN,128.210007,127.480003,128.910004,139.570007,142.220001,138.559998,140.570007,133.979996,8.860463,10.927380,8.072686,9.640434,4.500420,buy,buy,buy,buy,buy
170,2023-10-26,AMZN,121.389999,120.629997,119.570000,127.739998,132.710007,138.070007,139.740005,140.600006,5.231072,9.325321,13.740842,15.116572,15.825032,buy,buy,buy,buy,buy
171,2024-02-01,AMZN,155.199997,155.869995,159.279999,171.809998,170.309998,169.839996,172.339996,169.800003,10.702320,9.735825,9.432989,11.043814,9.407221,buy,buy,buy,buy,hold


## Output File to the Main Folder

In [64]:
# Define the path to the folder one level above 'Earnings_Calls'
parent_folder_path = os.path.abspath(os.path.join('Earnings_Calls', '..'))

# Define the full path where the CSV file will be saved
output_csv_path = os.path.join(parent_folder_path, 'df_stocks_final.csv')

# Save the DataFrame to CSV
df_stocks.to_csv(output_csv_path, index=False)

print(f"File saved at: {output_csv_path}")


File saved at: /Users/hadenloveridge/Desktop/Analytics for Unstructured Data/Assignments/Final_Project/df_stocks_final.csv
