In [2]:
import os
import pandas as pd
import requests
import time

In [3]:
stock_data_path = "/kaggle/input/stock-market-dataset/stocks"
financials_path = "/kaggle/input/financial-statements-of-major-companies2009-2023/Financial Statements.csv"
target_symbols = ["AAPL", "MSFT", "GOOGL", "AMZN", "TSLA", "META", "NFLX", "NVDA", "INTC"]

all_stock_data = []

for file in os.listdir(stock_data_path):
    if file.endswith(".csv"):
        symbol = file.replace(".csv", "").upper()
        if symbol in target_symbols:
            df = pd.read_csv(os.path.join(stock_data_path, file))
            df.columns = df.columns.str.strip().str.title().str.replace(" ", "")
            df['Symbol'] = symbol
            df['Date'] = pd.to_datetime(df['Date'], errors='coerce')  # Handle invalid dates
            df = df.dropna(subset=['Date', 'Open', 'Close'])
            df = df[df['Date'].notnull()]
            df = df.drop(columns=['Openint'], errors='ignore')
            all_stock_data.append(df)
fact_stock_prices = pd.concat(all_stock_data, ignore_index=True)

fact_stock_prices['Volume'] = pd.to_numeric(fact_stock_prices['Volume'], errors='coerce')
fact_stock_prices = fact_stock_prices.dropna(subset=['Volume'])

In [4]:
fact_stock_prices

Unnamed: 0,Date,Open,High,Low,Close,AdjClose,Volume,Symbol
0,2004-08-19,50.050049,52.082081,48.028027,50.220219,50.220219,44659000.0,GOOGL
1,2004-08-20,50.555557,54.594593,50.300301,54.209209,54.209209,22834300.0,GOOGL
2,2004-08-23,55.430431,56.796795,54.579578,54.754753,54.754753,18256100.0,GOOGL
3,2004-08-24,55.675674,55.855854,51.836838,52.487488,52.487488,15247300.0,GOOGL
4,2004-08-25,52.532532,54.054054,51.991993,53.053055,53.053055,9188600.0,GOOGL
...,...,...,...,...,...,...,...,...
50562,2020-03-26,51.740002,55.950001,51.660000,55.540001,55.540001,41459800.0,INTC
50563,2020-03-27,53.419998,54.639999,52.070000,52.369999,52.369999,31633500.0,INTC
50564,2020-03-30,52.990002,56.099998,52.830002,55.490002,55.490002,31628600.0,INTC
50565,2020-03-31,55.060001,55.799999,53.220001,54.119999,54.119999,48074700.0,INTC


In [5]:
financials_df = pd.read_csv(financials_path)
financials_df.columns = financials_df.columns.str.strip().str.replace('\(.*?\)', '', regex=True).str.replace(' ', '_')
financials_df = financials_df.rename(columns={'Company': 'Symbol'})
financials_df = financials_df[financials_df['Symbol'].isin(target_symbols)]
numeric_cols = financials_df.select_dtypes(include='number').columns
financials_df[numeric_cols] = financials_df[numeric_cols].fillna(0)
financials_df = financials_df.drop_duplicates(subset=['Symbol', 'Year'])
dim_financials = financials_df.copy()
dim_financials

Unnamed: 0,Year,Symbol,Category,Market_Cap,Revenue,Gross_Profit,Net_Income,Earning_Per_Share,EBITDA,Share_Holder_Equity,...,Current_Ratio,Debt/Equity_Ratio,ROE,ROA,ROI,Net_Profit_Margin,Free_Cash_Flow_per_Share,Return_on_Tangible_Equity,Number_of_Employees,Inflation_Rate
0,2022,AAPL,IT,2066.94,394328.0,170782.0,99803.0,6.1100,130541.0,50672.0,...,0.8794,2.3695,196.9589,28.2924,66.6994,25.3096,1.3146,196.9589,164000,8.0028
1,2021,AAPL,IT,2913.28,365817.0,152836.0,94680.0,5.6100,120233.0,63090.0,...,1.0746,1.9768,150.0713,26.9742,54.9839,25.8818,1.3261,150.0713,154000,4.6979
2,2020,AAPL,IT,2255.97,274515.0,104956.0,57411.0,3.2800,77344.0,65339.0,...,1.3636,1.7208,87.8664,17.7256,35.0054,20.9136,1.0183,87.8664,147000,1.2336
3,2019,AAPL,IT,1304.76,260174.0,98392.0,55256.0,2.9700,76477.0,90488.0,...,1.5401,1.1940,61.0645,16.3230,30.3113,21.2381,-0.0388,61.0645,137000,1.8122
4,2018,AAPL,IT,748.54,265595.0,101839.0,59531.0,2.9800,81801.0,107147.0,...,1.1329,1.0685,55.5601,16.2775,29.6348,22.4142,0.7414,55.5601,132000,2.4426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,2013,AMZN,LOGI,182.54,74452.0,20271.0,274.0,0.0295,3998.0,9746.0,...,1.0716,0.3274,2.8114,0.6823,2.1180,0.3680,0.1748,3.8641,117300,1.4648
157,2012,AMZN,LOGI,113.63,61093.0,15122.0,-39.0,-0.0045,2835.0,8192.0,...,1.1207,0.3765,-0.4761,-0.1198,-0.3459,-0.0638,-0.1833,-0.6915,88400,2.0693
158,2011,AMZN,LOGI,78.72,48077.0,10789.0,631.0,0.0685,1945.0,7757.0,...,1.1741,0.0329,8.1346,2.4962,7.8757,1.3125,-0.0490,10.8756,56200,3.1568
159,2010,AMZN,LOGI,80.79,34204.0,7643.0,1152.0,0.1265,1974.0,6864.0,...,1.3254,0.2274,16.7832,6.1286,13.6736,3.3680,-0.0544,20.8885,33700,1.6400


In [6]:
dim_date = pd.DataFrame({'Date': pd.to_datetime(fact_stock_prices['Date'].unique())})
dim_date['Date'] = pd.to_datetime(dim_date['Date'])
dim_date['Year'] = dim_date['Date'].dt.year
dim_date['Quarter'] = dim_date['Date'].dt.quarter
dim_date['Month'] = dim_date['Date'].dt.month
dim_date['Month_Name'] = dim_date['Date'].dt.month_name()
dim_date['Day'] = dim_date['Date'].dt.day
dim_date['Day_Name'] = dim_date['Date'].dt.day_name()
dim_date['Week'] = dim_date['Date'].dt.isocalendar().week
dim_date['Weekday'] = dim_date['Date'].dt.weekday + 1  # 1=Monday, 7=Sunday
dim_date['Is_Weekend'] = dim_date['Weekday'].isin([6, 7])
dim_date['Day_Of_Year'] = dim_date['Date'].dt.dayofyear
dim_date['Is_Month_Start'] = dim_date['Date'].dt.is_month_start
dim_date['Is_Month_End'] = dim_date['Date'].dt.is_month_end
dim_date['Is_Quarter_Start'] = dim_date['Date'].dt.is_quarter_start
dim_date['Is_Quarter_End'] = dim_date['Date'].dt.is_quarter_end
dim_date['Is_Year_Start'] = dim_date['Date'].dt.is_year_start
dim_date['Is_Year_End'] = dim_date['Date'].dt.is_year_end
dim_date

Unnamed: 0,Date,Year,Quarter,Month,Month_Name,Day,Day_Name,Week,Weekday,Is_Weekend,Day_Of_Year,Is_Month_Start,Is_Month_End,Is_Quarter_Start,Is_Quarter_End,Is_Year_Start,Is_Year_End
0,2004-08-19,2004,3,8,August,19,Thursday,34,4,False,232,False,False,False,False,False,False
1,2004-08-20,2004,3,8,August,20,Friday,34,5,False,233,False,False,False,False,False,False
2,2004-08-23,2004,3,8,August,23,Monday,35,1,False,236,False,False,False,False,False,False
3,2004-08-24,2004,3,8,August,24,Tuesday,35,2,False,237,False,False,False,False,False,False
4,2004-08-25,2004,3,8,August,25,Wednesday,35,3,False,238,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10093,1980-12-08,1980,4,12,December,8,Monday,50,1,False,343,False,False,False,False,False,False
10094,1980-12-09,1980,4,12,December,9,Tuesday,50,2,False,344,False,False,False,False,False,False
10095,1980-12-10,1980,4,12,December,10,Wednesday,50,3,False,345,False,False,False,False,False,False
10096,1980-12-11,1980,4,12,December,11,Thursday,50,4,False,346,False,False,False,False,False,False


In [7]:
API_KEY = "7A8NGY5D090UZDRV"
company_info = []
for symbol in target_symbols:
    url = f'https://www.alphavantage.co/query?function=OVERVIEW&symbol={symbol}&apikey={API_KEY}'
    print(f"fetching data for symbol {url}")
    response = requests.get(url)
    print(f"fetching data for symbol {symbol}")
    if response.status_code == 200:
        data = response.json()
        if data:  # Ensure response is not empty
            company_info.append({
                "Symbol": symbol,
                "Name": data.get("Name"),
                "Description": data.get("Description"),
                "Sector": data.get("Sector"),
                "Industry": data.get("Industry"),
                "Exchange": data.get("Exchange"),
                "Country": data.get("Country"),
                "Currency": data.get("Currency"),
                "MarketCapitalization": data.get("MarketCapitalization")
            })
            print(f"fetched data from symbol {symbol}")
    else:
        print(f"Failed to fetch for {symbol}")
    
dim_company = pd.DataFrame(company_info)
dim_company = pd.DataFrame([
    {
        "Symbol": "AAPL",
        "Name": "Apple Inc",
        "Description": "Apple Inc. is an American multinational technology company that specializes in consumer electronics, computer software, and online services. Apple is the world's largest technology company by revenue (totalling $274.5 billion in 2020) and, since January 2021, the world's most valuable company. As of 2021, Apple is the world's fourth-largest PC vendor by unit sales, and fourth-largest smartphone manufacturer. It is one of the Big Five American information technology companies, along with Amazon, Google, Microsoft, and Facebook.",
        "Sector": "TECHNOLOGY",
        "Industry": "ELECTRONIC COMPUTERS",
        "Exchange": "NASDAQ",
        "Country": "USA",
        "Currency": "USD",
        "MarketCapitalization": "2959053160000"
    },
    {
        "Symbol": "MSFT",
        "Name": "Microsoft Corporation",
        "Description": "Microsoft Corporation is an American multinational technology company which produces computer software, consumer electronics, personal computers, and related services. Its best known software products are the Microsoft Windows line of operating systems, the Microsoft Office suite, and the Internet Explorer and Edge web browsers. Its flagship hardware products are the Xbox video game consoles and the Microsoft Surface lineup of touchscreen personal computers. Microsoft ranked No. 21 in the 2020 Fortune 500 rankings of the largest United States corporations by total revenue; it was the world's largest software maker by revenue as of 2016. It is considered one of the Big Five companies in the U.S. information technology industry, along with Google, Apple, Amazon, and Facebook.",
        "Sector": "TECHNOLOGY",
        "Industry": "SERVICES-PREPACKAGED SOFTWARE",
        "Exchange": "NASDAQ",
        "Country": "USA",
        "Currency": "USD",
        "MarketCapitalization": "2734069121000"
    },
    {
        "Symbol": "GOOGL",
        "Name": "Alphabet Inc Class A",
        "Description": "Alphabet Inc. is an American multinational conglomerate headquartered in Mountain View, California. It was created through a restructuring of Google on October 2, 2015, and became the parent company of Google and several former Google subsidiaries. The two co-founders of Google remained as controlling shareholders, board members, and employees at Alphabet. Alphabet is the world's fourth-largest technology company by revenue and one of the world's most valuable companies.",
        "Sector": "TECHNOLOGY",
        "Industry": "SERVICES-COMPUTER PROGRAMMING, DATA PROCESSING, ETC.",
        "Exchange": "NASDAQ",
        "Country": "USA",
        "Currency": "USD",
        "MarketCapitalization": "1854733287000"
    },
    {
        "Symbol": "AMZN",
        "Name": "Amazon.com Inc",
        "Description": "Amazon.com, Inc. is an American multinational technology company which focuses on e-commerce, cloud computing, digital streaming, and artificial intelligence. It is one of the Big Five companies in the U.S. information technology industry, along with Google, Apple, Microsoft, and Facebook. The company has been referred to as one of the most influential economic and cultural forces in the world, as well as the world's most valuable brand.",
        "Sector": "TRADE & SERVICES",
        "Industry": "RETAIL-CATALOG & MAIL-ORDER HOUSES",
        "Exchange": "NASDAQ",
        "Country": "USA",
        "Currency": "USD",
        "MarketCapitalization": "1831806435000"
    },
    {
        "Symbol": "TSLA",
        "Name": "Tesla Inc",
        "Description": "Tesla, Inc. is an American electric vehicle and clean energy company based in Palo Alto, California. Tesla's current products include electric cars, battery energy storage from home to grid-scale, solar panels and solar roof tiles, as well as other related products and services. In 2020, Tesla had the highest sales in the plug-in and battery electric passenger car segments, capturing 16% of the plug-in market (which includes plug-in hybrids) and 23% of the battery-electric (purely electric) market. Through its subsidiary Tesla Energy, the company develops and is a major installer of solar photovoltaic energy generation systems in the United States. Tesla Energy is also one of the largest global suppliers of battery energy storage systems, with 3 GWh of battery storage supplied in 2020.",
        "Sector": "MANUFACTURING",
        "Industry": "MOTOR VEHICLES & PASSENGER CAR BODIES",
        "Exchange": "NASDAQ",
        "Country": "USA",
        "Currency": "USD",
        "MarketCapitalization": "776371372000"
    },
    {
        "Symbol": "META",
        "Name": "Meta Platforms Inc.",
        "Description": "Meta Platforms, Inc. develops products that enable people to connect and share with friends and family through mobile devices, PCs, virtual reality headsets, wearables and home devices around the world. The company is headquartered in Menlo Park, California.",
        "Sector": "TECHNOLOGY",
        "Industry": "SERVICES-COMPUTER PROGRAMMING, DATA PROCESSING, ETC.",
        "Exchange": "NASDAQ",
        "Country": "USA",
        "Currency": "USD",
        "MarketCapitalization": "1270579855000"
    },
    {
        "Symbol": "NFLX",
        "Name": "Netflix Inc",
        "Description": "Netflix, Inc. is an American over-the-top content platform and production company headquartered in Los Gatos, California. Netflix was founded in 1997 by Reed Hastings and Marc Randolph in Scotts Valley, California. The company's primary business is a subscription-based streaming service offering online streaming from a library of films and television series, including those produced in-house.",
        "Sector": "TRADE & SERVICES",
        "Industry": "SERVICES-VIDEO TAPE RENTAL",
        "Exchange": "NASDAQ",
        "Country": "USA",
        "Currency": "USD",
        "MarketCapitalization": "416220414000"
    },
    {
        "Symbol": "NVDA",
        "Name": "NVIDIA Corporation",
        "Description": "Nvidia Corporation is an American multinational technology company incorporated in Delaware and based in Santa Clara, California. It designs graphics processing units (GPUs) for the gaming and professional markets, as well as system on a chip units (SoCs) for the mobile computing and automotive market.",
        "Sector": "MANUFACTURING",
        "Industry": "SEMICONDUCTORS & RELATED DEVICES",
        "Exchange": "NASDAQ",
        "Country": "USA",
        "Currency": "USD",
        "MarketCapitalization": "2476355879000"
    },
    {
        "Symbol": "INTC",
        "Name": "Intel Corporation",
        "Description": "Intel Corporation is an American multinational corporation and technology company headquartered in Santa Clara, California, in Silicon Valley. It is the world's largest semiconductor chip manufacturer by revenue, and is the developer of the x86 series of microprocessors, the processors found in most personal computers (PCs).",
        "Sector": "MANUFACTURING",
        "Industry": "SEMICONDUCTORS & RELATED DEVICES",
        "Exchange": "NASDAQ",
        "Country": "USA",
        "Currency": "USD",
        "MarketCapitalization": "82545967000"
    }
])
dim_company

Unnamed: 0,Symbol,Name,Description,Sector,Industry,Exchange,Country,Currency,MarketCapitalization
0,AAPL,Apple Inc,Apple Inc. is an American multinational techno...,TECHNOLOGY,ELECTRONIC COMPUTERS,NASDAQ,USA,USD,2959053160000
1,MSFT,Microsoft Corporation,Microsoft Corporation is an American multinati...,TECHNOLOGY,SERVICES-PREPACKAGED SOFTWARE,NASDAQ,USA,USD,2734069121000
2,GOOGL,Alphabet Inc Class A,Alphabet Inc. is an American multinational con...,TECHNOLOGY,"SERVICES-COMPUTER PROGRAMMING, DATA PROCESSING...",NASDAQ,USA,USD,1854733287000
3,AMZN,Amazon.com Inc,"Amazon.com, Inc. is an American multinational ...",TRADE & SERVICES,RETAIL-CATALOG & MAIL-ORDER HOUSES,NASDAQ,USA,USD,1831806435000
4,TSLA,Tesla Inc,"Tesla, Inc. is an American electric vehicle an...",MANUFACTURING,MOTOR VEHICLES & PASSENGER CAR BODIES,NASDAQ,USA,USD,776371372000
5,META,Meta Platforms Inc.,"Meta Platforms, Inc. develops products that en...",TECHNOLOGY,"SERVICES-COMPUTER PROGRAMMING, DATA PROCESSING...",NASDAQ,USA,USD,1270579855000
6,NFLX,Netflix Inc,"Netflix, Inc. is an American over-the-top cont...",TRADE & SERVICES,SERVICES-VIDEO TAPE RENTAL,NASDAQ,USA,USD,416220414000
7,NVDA,NVIDIA Corporation,Nvidia Corporation is an American multinationa...,MANUFACTURING,SEMICONDUCTORS & RELATED DEVICES,NASDAQ,USA,USD,2476355879000
8,INTC,Intel Corporation,Intel Corporation is an American multinational...,MANUFACTURING,SEMICONDUCTORS & RELATED DEVICES,NASDAQ,USA,USD,82545967000


In [13]:
fact_stock_prices["Volume"] = fact_stock_prices["Volume"].astype("Int64")
fact_stock_prices.to_csv("/kaggle/working/fact_stock_prices.csv", index=False)
dim_date.to_csv("/kaggle/working/dim_date.csv", index=False)
dim_company.to_csv("dim_company_clean.csv", index=False)
dim_financials.to_csv("/kaggle/working/dim_financials.csv", index=False)


dim_company.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Symbol                9 non-null      object
 1   Name                  9 non-null      object
 2   Sector                9 non-null      object
 3   Industry              9 non-null      object
 4   Exchange              9 non-null      object
 5   Country               9 non-null      object
 6   Currency              9 non-null      object
 7   MarketCapitalization  9 non-null      object
dtypes: object(8)
memory usage: 708.0+ bytes
