sanity check of the data, do we have indeed 30 stock at all time for the DJIA, etc.

In [9]:
import pathlib
from collections import defaultdict

# Path to the parent directory containing year folders
data_dir = pathlib.Path("api-data-signal/")

year_to_companies = {}
company_to_years = defaultdict(set)

# Traverse each "year" folder
for year_folder in data_dir.iterdir():
    if year_folder.is_dir():
        year = year_folder.name
        companies = []
        # Each subfolder inside 'year_folder' is treated as a ticker
        for ticker_folder in year_folder.iterdir():
            if ticker_folder.is_dir():
                companies.append(ticker_folder.name)
                company_to_years[ticker_folder.name].add(year)
        year_to_companies[year] = companies

# Build output lines
output_lines = []
output_lines.append("Yearly structure:")
for year, companies in sorted(year_to_companies.items()):
    output_lines.append(f"{year}: {len(companies)} companies")
    output_lines.append(f"  {sorted(companies)}")

output_lines.append("\nCompany appearances across years:")
for company, years in sorted(company_to_years.items()):
    output_lines.append(f"{company}: appears in {len(years)} year(s)")
    output_lines.append(f"  {sorted(years)}")

# Write to a file in the "trading_strat_data/" folder
with open("trading_strat_data/summary_output.txt", "w", encoding="utf-8") as f:
    for line in output_lines:
        f.write(line + "\n")

print("Summary written to summary_output.txt")


Summary written to summary_output.txt


create single dataframe containing the sentiment values of interest for each stock, each year.

In [10]:
import pathlib
import json
import pandas as pd

# Set the path to the folder containing the year folders
data_dir = pathlib.Path("api-data-signal/")

# List to hold rows for the DataFrame
rows = []

# Iterate over each year folder
for year_folder in data_dir.iterdir():
    if year_folder.is_dir():
        year = year_folder.name  # e.g., "2010", "2011", etc.
        # Iterate over each company folder within the year folder
        for company_folder in year_folder.iterdir():
            if company_folder.is_dir():
                company = company_folder.name  # e.g., "AAPL", "AMZN", etc.
                info_file = company_folder / "10_K_info.txt"
                if info_file.exists():
                    try:
                        # Read and parse the JSON content from the file
                        with open(info_file, "r", encoding="utf-8") as f:
                            content = f.read().strip()
                            # If the file is pure JSON, we can load it directly
                            info = json.loads(content)
                    except Exception as e:
                        print(f"Error reading {info_file}: {e}")
                        info = {}
                    
                    # Extract sentiment and other values
                    sentiment = info.get("sentiment_score", {})
                    row = {
                        "Year": year,
                        "Company": company,
                        "sentiment_score_positive": sentiment.get("Positive"),
                        "sentiment_score_negative": sentiment.get("Negative"),
                        "sentiment_score_polarity": sentiment.get("Polarity"),
                        "sentiment_score_subjectivity": sentiment.get("Subjectivity"),
                        "similarity_score": info.get("similarity_score"),
                        "nlp_result": info.get("nlp_result")
                    }
                    rows.append(row)
                else:
                    print(f"File not found: {info_file}")

# Create DataFrame
df = pd.DataFrame(rows)

# Optionally convert 'Year' to numeric and sort the DataFrame
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
df = df.sort_values(by=["Year", "Company"]).reset_index(drop=True)

# Display the DataFrame
print(df.head())

# Save the DataFrame to a CSV file
df.to_csv("trading_strat_data/sentiment_analysis.csv", index=False)
print("DataFrame saved to sentiment_analysis.csv")


   Year Company  sentiment_score_positive  sentiment_score_negative  \
0  2010    AAPL                      66.0                     144.0   
1  2010    AMGN                     106.0                     106.0   
2  2010    AMZN                     110.0                      92.0   
3  2010     AXP                       1.0                       0.0   
4  2010      BA                     183.0                     379.0   

   sentiment_score_polarity  sentiment_score_subjectivity  similarity_score  \
0                 -0.371429                      0.047436          0.962567   
1                  0.000000                      0.045778          0.962118   
2                  0.089109                      0.056933          0.000000   
3                  0.999999                      0.041667          0.000000   
4                 -0.348754                      0.073889          0.982955   

   nlp_result  
0           0  
1           0  
2           0  
3           0  
4           0  
Da

In [16]:
# check the numebr of NaN's N/A's and any other missing values in the data
print(df.isnull().sum())

Year                            0
Company                         0
sentiment_score_positive        0
sentiment_score_negative        0
sentiment_score_polarity        0
sentiment_score_subjectivity    0
similarity_score                0
nlp_result                      0
dtype: int64


below we retrieve data, we take adjusted prices (take care of stock splits, etc)
then compute returns timeframe for 1)stocks 2)DJIA as a whole

In [17]:
import yfinance as yf
import pandas as pd

# Define tickers for stocks (example: 30 DJIA companies) and the DJIA ticker
tickers = [
    "AAPL", "AMGN", "AMZN", "AXP", "BA", "CAT", "CRM", "CSCO", "CVX",
    "DIS", "GS", "HD", "HON", "IBM", "JNJ", "JPM", "KO", "MCD", "MMM",
    "MRK", "MSFT", "NKE", "NVDA", "PG", "SHW", "TRV", "UNH", "V", "VZ", "WMT"
]
djia_ticker = "^DJI"

# Define date range
start_date = "2010-01-01"
end_date = "2023-12-31"

# Download adjusted daily closing data for stocks (adjusted prices)
stocks_data = yf.download(tickers, start=start_date, end=end_date, auto_adjust=True)["Close"]

# Resample to yearly frequency: Get first and last prices of each year
yearly_first = stocks_data.resample("Y").first()
yearly_last = stocks_data.resample("Y").last()

# Calculate yearly returns as (last / first) - 1
yearly_returns = (yearly_last / yearly_first) - 1

# Download adjusted DJIA data and compute yearly returns similarly
djia_data = yf.download(djia_ticker, start=start_date, end=end_date, auto_adjust=True)["Close"]
djia_yearly_first = djia_data.resample("Y").first()
djia_yearly_last = djia_data.resample("Y").last()
djia_yearly_returns = (djia_yearly_last / djia_yearly_first) - 1

# Create DataFrames to display
print("Yearly Stock Returns:")
print(yearly_returns.head())

print("\nYearly DJIA Returns:")
print(djia_yearly_returns.head())

# Optionally, save the results to CSV files:
yearly_returns.to_csv("trading_strat_data/yearly_stock_returns.csv")
djia_yearly_returns.to_csv("trading_strat_data/yearly_djia_returns.csv")


[*********************100%***********************]  30 of 30 completed
  yearly_first = stocks_data.resample("Y").first()
  yearly_last = stocks_data.resample("Y").last()
[*********************100%***********************]  1 of 1 completed

Yearly Stock Returns:
Ticker          AAPL      AMGN      AMZN       AXP        BA       CAT  \
Date                                                                     
2010-12-31  0.507220 -0.048856  0.344287  0.067831  0.190656  0.640398   
2011-12-31  0.228874  0.167906 -0.060363  0.104227  0.131701 -0.019657   
2012-12-31  0.305586  0.370442  0.401274  0.204688  0.039844 -0.020586   
2013-12-31  0.047508  0.303361  0.549843  0.558364  0.806243 -0.008824   
2014-12-31  0.426285  0.402191 -0.220167  0.051460 -0.026749  0.045588   

Ticker           CRM      CSCO       CVX       DIS  ...      MSFT       NKE  \
Date                                                ...                       
2010-12-31  0.764234 -0.180640  0.197271  0.182422  ... -0.079441  0.326869   
2011-12-31 -0.257248 -0.107640  0.193773  0.008190  ... -0.047546  0.135908   
2012-12-31  0.661067  0.079512  0.012824  0.319618  ...  0.025976  0.081710   
2013-12-31  0.289712  0.127669  0.168769  0.513566  ...  0.39543


  djia_yearly_first = djia_data.resample("Y").first()
  djia_yearly_last = djia_data.resample("Y").last()


In [21]:
# plot the returns of DJIA only among all the years
import matplotlib.pyplot as plt

import matplotlib
matplotlib.use('Agg')

# Plot the DJIA yearly returns
plt.figure(figsize=(10, 6))
plt.plot(djia_yearly_returns, marker="o", color="b", label="DJIA")
plt.title("Yearly DJIA Returns")
plt.xlabel("Year")
plt.ylabel("Returns")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.savefig("trading_strat_data/yearly_djia_returns.png")
plt.close()
plt.show()

