In [3]:
! pip install simfin

Collecting simfin
  Downloading simfin-1.0.1-py3-none-any.whl.metadata (637 bytes)
Downloading simfin-1.0.1-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.6/61.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: simfin
Successfully installed simfin-1.0.1


In [5]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import logging
import simfin as sf

# Load the SimFin API key and data directory from the local environment file
load_dotenv("API_KEY.env")
API_KEY = os.getenv("SIMFIN_API_KEY")
data_dir = os.getenv("SIMFIN_DATA_DIR")

# Fail fast with a readable message when a setting is missing, instead of
# handing None to simfin and getting an opaque error further down.
missing_settings = [
    name
    for name, value in (("SIMFIN_API_KEY", API_KEY), ("SIMFIN_DATA_DIR", data_dir))
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required setting(s) in API_KEY.env or the environment: "
        + ", ".join(missing_settings)
    )

sf.set_api_key(API_KEY)
sf.set_data_dir(data_dir)

# Download (or load from the local cache) the list of US companies
df_companies = sf.load_companies(market='us')


Dataset "us-companies" on disk (0 days old).
- Loading from disk ... Done!


  df = pd.read_csv(path, sep=';', header=0,


In [7]:
# Download (or load from the local cache) the daily share prices and the
# quarterly income statements. The market is passed explicitly, matching the
# load_companies(market='us') call, instead of relying on the library default.
df_shareprice = sf.load_shareprices(variant='daily', market='us')
df_incomest = sf.load_income(variant='quarterly', market='us')

Dataset "us-shareprices-daily" not on disk.
- Downloading ... 95.2%

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Done!
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,


Done!
Dataset "us-income-quarterly" not on disk.
- Downloading ... 100.0%
- Extracting zip-file ... Done!
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,


Done!


In [11]:
import polars as pl

We first define the companies we want to extract, and then filter each CSV file down to those companies as it is loaded.

In [14]:
# Lazily scan the semicolon-separated companies file. The path is resolved
# against the SimFin data directory so the cell does not depend on the
# notebook's working directory.
companies_csv = os.path.join(os.path.expanduser(data_dir), "us-companies.csv")
companies_lazy = pl.scan_csv(companies_csv, separator=";")

# Print the schema to confirm the columns are parsed correctly.
# collect_schema() resolves it explicitly; reading `.schema` on a LazyFrame
# emits a performance warning.
print(companies_lazy.collect_schema())

# Tickers of the BigTech companies; the later loading cells reuse this list
BIGTECH_TICKERS = ["AAPL", "GOOG", "MSFT", "AMZN", "NVDA", "META"]

# Keep only the needed columns and the BigTech rows, then collect into memory
companies = (
    companies_lazy
    .select(["Ticker", "Company Name", "IndustryId"])
    .filter(pl.col("Ticker").is_in(BIGTECH_TICKERS))
    .collect()
)

# Display the filtered companies
print(companies)

Schema({'Ticker': String, 'SimFinId': Int64, 'Company Name': String, 'IndustryId': Int64, 'ISIN': String, 'End of financial year (month)': Int64, 'Number Employees': Int64, 'Business Summary': String, 'Market': String, 'CIK': Int64, 'Main Currency': String})
shape: (6, 3)
┌────────┬──────────────────────┬────────────┐
│ Ticker ┆ Company Name         ┆ IndustryId │
│ ---    ┆ ---                  ┆ ---        │
│ str    ┆ str                  ┆ i64        │
╞════════╪══════════════════════╪════════════╡
│ AAPL   ┆ APPLE INC            ┆ 101001     │
│ AMZN   ┆ AMAZON COM INC       ┆ 103002     │
│ GOOG   ┆ Alphabet (Google)    ┆ 101002     │
│ META   ┆ Meta Platforms, Inc. ┆ 101002     │
│ MSFT   ┆ MICROSOFT CORP       ┆ 101003     │
│ NVDA   ┆ NVIDIA CORP          ┆ 101004     │
└────────┴──────────────────────┴────────────┘


  print(companies.schema)


In [16]:
# Lazily scan the semicolon-separated quarterly income file. The path is
# resolved against the SimFin data directory so the cell does not depend on
# the notebook's working directory.
income_csv = os.path.join(os.path.expanduser(data_dir), "us-income-quarterly.csv")
income_lazy = pl.scan_csv(income_csv, separator=";")

# Print the schema to check the columns are parsed correctly.
# collect_schema() resolves it explicitly; reading `.schema` on a LazyFrame
# emits a performance warning.
print(income_lazy.collect_schema())

# Keep only the relevant columns and the BigTech rows (BIGTECH_TICKERS is
# defined in the companies cell), then collect into memory
income = (
    income_lazy
    .select([
        "Ticker", "Fiscal Year", "Fiscal Period", "Revenue",
        "Operating Expenses", "Net Income"
    ])
    .filter(pl.col("Ticker").is_in(BIGTECH_TICKERS))
    .collect()
)

# Display the filtered income data (polars abbreviates long frames)
print(income)

Schema({'Ticker': String, 'SimFinId': Int64, 'Currency': String, 'Fiscal Year': Int64, 'Fiscal Period': String, 'Report Date': String, 'Publish Date': String, 'Restated Date': String, 'Shares (Basic)': Int64, 'Shares (Diluted)': Int64, 'Revenue': Int64, 'Cost of Revenue': Int64, 'Gross Profit': Int64, 'Operating Expenses': Int64, 'Selling, General & Administrative': Int64, 'Research & Development': Int64, 'Depreciation & Amortization': Int64, 'Operating Income (Loss)': Int64, 'Non-Operating Income (Loss)': Int64, 'Interest Expense, Net': Int64, 'Pretax Income (Loss), Adj.': Int64, 'Abnormal Gains (Losses)': Int64, 'Pretax Income (Loss)': Int64, 'Income Tax (Expense) Benefit, Net': Int64, 'Income (Loss) from Continuing Operations': Int64, 'Net Extraordinary Gains (Losses)': Int64, 'Net Income': Int64, 'Net Income (Common)': Int64})
shape: (115, 6)
┌────────┬─────────────┬───────────────┬─────────────┬────────────────────┬─────────────┐
│ Ticker ┆ Fiscal Year ┆ Fiscal Period ┆ Revenue   

  print(income.schema)


In [18]:
# Lazily scan the semicolon-separated daily share price file. The path is
# resolved against the SimFin data directory so the cell does not depend on
# the notebook's working directory.
share_prices_csv = os.path.join(os.path.expanduser(data_dir), "us-shareprices-daily.csv")
share_prices_lazy = pl.scan_csv(share_prices_csv, separator=";")

# Print the schema to confirm the columns are parsed correctly.
# collect_schema() resolves it explicitly; reading `.schema` on a LazyFrame
# emits a performance warning.
print(share_prices_lazy.collect_schema())

# Keep only the needed columns and the BigTech rows (BIGTECH_TICKERS is
# defined in the companies cell), then collect into memory
share_prices = (
    share_prices_lazy
    .select(["Ticker", "Date", "Open", "Close"])
    .filter(pl.col("Ticker").is_in(BIGTECH_TICKERS))
    .collect()
)

# Display the filtered share prices (polars abbreviates long frames)
print(share_prices)

  print(share_prices.schema)


Schema({'Ticker': String, 'SimFinId': Int64, 'Date': String, 'Open': Float64, 'High': Float64, 'Low': Float64, 'Close': Float64, 'Adj. Close': Float64, 'Volume': Int64, 'Dividend': Float64, 'Shares Outstanding': Int64})
shape: (7_440, 4)
┌────────┬────────────┬───────┬───────┐
│ Ticker ┆ Date       ┆ Open  ┆ Close │
│ ---    ┆ ---        ┆ ---   ┆ ---   │
│ str    ┆ str        ┆ f64   ┆ f64   │
╞════════╪════════════╪═══════╪═══════╡
│ AAPL   ┆ 2019-04-10 ┆ 49.67 ┆ 50.16 │
│ AAPL   ┆ 2019-04-11 ┆ 50.21 ┆ 49.74 │
│ AAPL   ┆ 2019-04-12 ┆ 49.8  ┆ 49.72 │
│ AAPL   ┆ 2019-04-15 ┆ 49.65 ┆ 49.81 │
│ AAPL   ┆ 2019-04-16 ┆ 49.87 ┆ 49.81 │
│ …      ┆ …          ┆ …     ┆ …     │
│ NVDA   ┆ 2024-03-07 ┆ 90.16 ┆ 92.67 │
│ NVDA   ┆ 2024-03-08 ┆ 95.14 ┆ 87.53 │
│ NVDA   ┆ 2024-03-11 ┆ 86.43 ┆ 85.77 │
│ NVDA   ┆ 2024-03-12 ┆ 88.05 ┆ 91.91 │
│ NVDA   ┆ 2024-03-13 ┆ 91.06 ┆ 90.89 │
└────────┴────────────┴───────┴───────┘


The check below shows that there are no null values in the filtered income and share price data, so we can skip the step where we handle null values.

In [21]:
# Tally the nulls in every column of the income and share price frames
income_null_counts = income.null_count()
share_prices_null_counts = share_prices.null_count()

# Show the income counts first, then the share price counts
for null_counts in (income_null_counts, share_prices_null_counts):
    print(null_counts)

shape: (1, 6)
┌────────┬─────────────┬───────────────┬─────────┬────────────────────┬────────────┐
│ Ticker ┆ Fiscal Year ┆ Fiscal Period ┆ Revenue ┆ Operating Expenses ┆ Net Income │
│ ---    ┆ ---         ┆ ---           ┆ ---     ┆ ---                ┆ ---        │
│ u32    ┆ u32         ┆ u32           ┆ u32     ┆ u32                ┆ u32        │
╞════════╪═════════════╪═══════════════╪═════════╪════════════════════╪════════════╡
│ 0      ┆ 0           ┆ 0             ┆ 0       ┆ 0                  ┆ 0          │
└────────┴─────────────┴───────────────┴─────────┴────────────────────┴────────────┘
shape: (1, 4)
┌────────┬──────┬──────┬───────┐
│ Ticker ┆ Date ┆ Open ┆ Close │
│ ---    ┆ ---  ┆ ---  ┆ ---   │
│ u32    ┆ u32  ┆ u32  ┆ u32   │
╞════════╪══════╪══════╪═══════╡
│ 0      ┆ 0    ┆ 0    ┆ 0     │
└────────┴──────┴──────┴───────┘


Now we merge the company, income, and share price data on Ticker.

In [24]:
# Attach the income rows and then the share price rows to each company,
# matching on Ticker only.
# NOTE(review): income and share_prices both hold many rows per Ticker, so
# this pairs every income row with every share price row of the same company
# (hence the large row count). Confirm this is intended; aligning the two on
# dates would need a date column in `income`.
merged_data = (
    companies
    .join(income, on="Ticker", how="left")
    .join(share_prices, on="Ticker", how="left")
)
print(merged_data)

shape: (142_600, 11)
┌────────┬──────────────┬────────────┬─────────────┬───┬─────────────┬────────────┬───────┬───────┐
│ Ticker ┆ Company Name ┆ IndustryId ┆ Fiscal Year ┆ … ┆ Net Income  ┆ Date       ┆ Open  ┆ Close │
│ ---    ┆ ---          ┆ ---        ┆ ---         ┆   ┆ ---         ┆ ---        ┆ ---   ┆ ---   │
│ str    ┆ str          ┆ i64        ┆ i64         ┆   ┆ i64         ┆ str        ┆ f64   ┆ f64   │
╞════════╪══════════════╪════════════╪═════════════╪═══╪═════════════╪════════════╪═══════╪═══════╡
│ AAPL   ┆ APPLE INC    ┆ 101001     ┆ 2019        ┆ … ┆ 10044000000 ┆ 2019-04-10 ┆ 49.67 ┆ 50.16 │
│ AAPL   ┆ APPLE INC    ┆ 101001     ┆ 2019        ┆ … ┆ 10044000000 ┆ 2019-04-11 ┆ 50.21 ┆ 49.74 │
│ AAPL   ┆ APPLE INC    ┆ 101001     ┆ 2019        ┆ … ┆ 10044000000 ┆ 2019-04-12 ┆ 49.8  ┆ 49.72 │
│ AAPL   ┆ APPLE INC    ┆ 101001     ┆ 2019        ┆ … ┆ 10044000000 ┆ 2019-04-15 ┆ 49.65 ┆ 49.81 │
│ AAPL   ┆ APPLE INC    ┆ 101001     ┆ 2019        ┆ … ┆ 10044000000 ┆ 2019-04-

Now we save the merged data to a new CSV file.

In [29]:
# Persist the merged frame as a CSV file in the working directory
processed_csv_path = "processed_data.csv"
merged_data.write_csv(processed_csv_path)