### Load `api_key` from `.env`

In [4]:
import os
from dotenv import load_dotenv
import requests

# Pull variables from the local .env file into the process environment so the
# key never has to be written in the notebook itself.
load_dotenv()

api_key = os.getenv("API_KEY")

# Fail here with a clear message: a missing key would otherwise be formatted
# into the request as the literal text "None" and only surface later as an
# unexpected API response.
if not api_key:
    raise RuntimeError("API_KEY is not set; add it to your .env file or environment")

### API Pull

In [13]:
# Request IBM's monthly adjusted series from Alpha Vantage.
# The query values go through `params` so requests handles the encoding and the
# API key is not embedded in the `url` string.
url = 'https://www.alphavantage.co/query'
params = {
    'function': 'TIME_SERIES_MONTHLY_ADJUSTED',
    'symbol': 'IBM',
    'apikey': api_key,
}
r = requests.get(url, params=params, timeout=30)  # bounded wait instead of hanging forever
r.raise_for_status()  # surface HTTP-level failures before trying to parse the body
data = r.json()

# A successful HTTP status does not guarantee the payload holds the series;
# show what actually came back rather than failing with a bare KeyError.
if 'Monthly Adjusted Time Series' not in data:
    raise RuntimeError(f"Unexpected Alpha Vantage response: {data}")

data['Monthly Adjusted Time Series']

{'2025-08-20': {'1. open': '251.4050',
  '2. high': '255.0000',
  '3. low': '233.3600',
  '4. close': '242.5500',
  '5. adjusted close': '242.5500',
  '6. volume': '80443917',
  '7. dividend amount': '1.6800'},
 '2025-07-31': {'1. open': '294.5500',
  '2. high': '295.6100',
  '3. low': '252.2200',
  '4. close': '253.1500',
  '5. adjusted close': '251.4066',
  '6. volume': '109055173',
  '7. dividend amount': '0.0000'},
 '2025-06-30': {'1. open': '257.8500',
  '2. high': '296.1600',
  '3. low': '257.2200',
  '4. close': '294.7800',
  '5. adjusted close': '292.7500',
  '6. volume': '74395935',
  '7. dividend amount': '0.0000'},
 '2025-05-30': {'1. open': '241.4400',
  '2. high': '269.2800',
  '3. low': '237.9450',
  '4. close': '259.0600',
  '5. adjusted close': '257.2759',
  '6. volume': '78164014',
  '7. dividend amount': '1.6800'},
 '2025-04-30': {'1. open': '248.0300',
  '2. high': '252.7900',
  '3. low': '214.5000',
  '4. close': '241.8200',
  '5. adjusted close': '238.5465',
  '6. 

In [14]:
import pandas as pd

# No cell shown in this notebook assigns `time_series`, so take it explicitly
# from the API payload; this keeps the cell runnable on a fresh kernel.
time_series = data["Monthly Adjusted Time Series"]

# One row per date key, one column per field of the inner dicts.
df = pd.DataFrame.from_dict(time_series, orient="index")
# Parse the date-string index, then order the rows oldest -> newest.
df.index = pd.to_datetime(df.index)
df = df.sort_index()

df.head()

Unnamed: 0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount
1999-12-31,102.56,122.12,102.25,107.87,53.6058,158626300,0.0
2000-01-31,112.44,124.75,109.62,112.25,55.7824,175259600,0.0
2000-02-29,112.37,119.75,100.94,102.75,51.113,133524400,0.12
2000-03-31,102.0,128.25,99.5,118.37,58.8832,194329000,0.0
2000-04-28,120.0,128.0,101.25,111.5,55.4657,168464800,0.0


In [16]:
# Print the index range plus each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 309 entries, 1999-12-31 to 2025-08-20
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   1. open             309 non-null    object
 1   2. high             309 non-null    object
 2   3. low              309 non-null    object
 3   4. close            309 non-null    object
 4   5. adjusted close   309 non-null    object
 5   6. volume           309 non-null    object
 6   7. dividend amount  309 non-null    object
dtypes: object(7)
memory usage: 19.3+ KB


In [17]:
# The columns are still object dtype (strings) at this point, so describe()
# reports count/unique/top/freq rather than numeric statistics.
df.describe()

Unnamed: 0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount
count,309.0,309.0,309.0,309.0,309.0,309,309.0
unique,297.0,306.0,303.0,307.0,309.0,309,28.0
top,145.0,147.5,99.5,112.25,53.6058,158626300,0.0
freq,3.0,2.0,2.0,2.0,1.0,1,206.0


In [18]:
import re

# Drop the leading "N. " ordinal from each column name (e.g. "1. open" -> "open").
# Matching only a numeric prefix makes the cell safe to re-run: names that are
# already stripped pass through unchanged, whereas splitting on the first space
# raises IndexError on a name such as "open".
df.columns = [re.sub(r"^\d+\.\s*", "", col) for col in df.columns]
df.head()

Unnamed: 0,open,high,low,close,adjusted close,volume,dividend amount
1999-12-31,102.56,122.12,102.25,107.87,53.6058,158626300,0.0
2000-01-31,112.44,124.75,109.62,112.25,55.7824,175259600,0.0
2000-02-29,112.37,119.75,100.94,102.75,51.113,133524400,0.12
2000-03-31,102.0,128.25,99.5,118.37,58.8832,194329000,0.0
2000-04-28,120.0,128.0,101.25,111.5,55.4657,168464800,0.0


In [21]:
import os


# Column names the frame is expected to have.
required_cols = [
    'open', 'high', 'low', 'close',
    'adjusted close', 'volume', 'dividend amount',
]

# Report-only checks: the findings are printed, nothing is raised.
print("Validation Results:")
has_required_cols = set(required_cols).issubset(df.columns)
print("Required columns present:", has_required_cols)
na_counts = df.isna().sum()
print("NA counts:\n", na_counts)
print("Shape:", df.shape)


# Write the frame (index included) under the raw-data folder, creating the
# folder first if it does not exist yet.
data_raw = "../data/raw/"
os.makedirs(data_raw, exist_ok=True)
raw_path = os.path.join(data_raw, "time_series_raw.csv")
df.to_csv(raw_path)
print(f"Saved to {raw_path}")


Validation Results:
Required columns present: True
NA counts:
 open               0
high               0
low                0
close              0
adjusted close     0
volume             0
dividend amount    0
dtype: int64
Shape: (309, 7)
Saved to ../data/raw/time_series_raw.csv


In [24]:
# Parse every column to a numeric dtype; values that cannot be parsed become NaN.
df = df.apply(pd.to_numeric, errors="coerce")

# Cast volume to integer only when coercion left no gaps, because astype(int)
# would fail on NaN.
volume_is_complete = not df["volume"].isna().any()
if volume_is_complete:
    df["volume"] = df["volume"].astype(int)

print(df.dtypes)

open               float64
high               float64
low                float64
close              float64
adjusted close     float64
volume               int64
dividend amount    float64
dtype: object


### Web Scraping

In [34]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# Download the demo product page and parse it.
response = requests.get("https://web-scraping.dev/product/1", timeout=30)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
html = response.text
soup = BeautifulSoup(html, "lxml")

# The table of interest is the second one carrying the `table-product` class;
# check that it exists so a layout change gives a readable error, not IndexError.
product_tables = soup.find_all('table', {'class': 'table-product'})
if len(product_tables) < 2:
    raise RuntimeError(
        f"Expected at least 2 'table-product' tables, found {len(product_tables)}"
    )
table = product_tables[1]

# First <tr> supplies the header cells; each remaining <tr> is one data row.
table_rows = table.find_all('tr')
headers = [el.text.strip() for el in table_rows[0].find_all('th')]
rows = [[el.text.strip() for el in row.find_all('td')] for row in table_rows[1:]]

df = pd.DataFrame(rows, columns=headers)

# Create the target folder here as well, so this cell does not depend on an
# earlier cell having made it.
table_path = "../data/raw/table.csv"
os.makedirs(os.path.dirname(table_path), exist_ok=True)
df.to_csv(table_path, index=False)


In [36]:
# Reload the CSV written by the scraping cell and preview its first rows.
df = pd.read_csv("../data/raw/table.csv")
df.head()

Unnamed: 0,Version,Package Weight,Package Dimension,Variants,Delivery Type
0,Pack 1,"1,00 kg",100x230 cm,6 available,1 Day shipping
1,Pack 2,"2,11 kg",200x460 cm,6 available,1 Day shipping
2,Pack 3,"3,22 kg",300x690 cm,6 available,1 Day shipping
3,Pack 4,"4,33 kg",400x920 cm,6 available,1 Day shipping
4,Pack 5,"5,44 kg",500x1150 cm,6 available,1 Day shipping


In [1]:
import os
gitignore_path = ".gitignore"

# Report whether the .gitignore in the working directory has a line that is
# exactly ".env" (surrounding whitespace ignored). Only this literal entry is
# recognised; other patterns are not evaluated.
if not os.path.exists(gitignore_path):
    print(".gitignore file not found")
else:
    with open(gitignore_path, "r") as fh:
        lines = [line.strip() for line in fh]
    env_listed = ".env" in lines
    print(".env is ignored in .gitignore" if env_listed else ".env is NOT ignored in .gitignore")

.env is ignored in .gitignore
