In [None]:

import requests
from bs4 import BeautifulSoup
import pandas as pd

# --- SETTINGS ---
symbol = "AAPL"  # Example: Apple
ALPHA_KEY = "YOUR_ALPHA_VANTAGE_API_KEY"

# --- TRY ALPHA VANTAGE FIRST ---
try:
    url = f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={symbol}&apikey={ALPHA_KEY}"
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    js = r.json()

    key = [k for k in js.keys() if "Time Series" in k]
    assert key, f"Unexpected response keys: {list(js.keys())}"

    series = js[key[0]]
    df_api = (
        pd.DataFrame(series).T
        .rename_axis("date")
        .reset_index()
        .rename(columns={
            "1. open": "Open",
            "2. high": "High",
            "3. low": "Low",
            "4. close": "Close",
            "5. volume": "Volume"
        })
    )

    print("✅ Data from Alpha Vantage API")
    print(df_api.head())

except Exception as e:
    print("⚠ API failed, falling back to Yahoo Finance scrape:", e)

    # --- FALLBACK: SCRAPE YAHOO FINANCE ---
    scrape_url = f"https://finance.yahoo.com/quote/{symbol}/history"
    headers = {"User-Agent": "Mozilla/5.0"}
    
    resp = requests.get(scrape_url, headers=headers, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    table = soup.find("table")
    
    rows = []
    for tr in table.find_all("tr"):
        cells = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
        if cells:
            rows.append(cells)
    
    header, *data = rows
    df_scrape = pd.DataFrame(data, columns=header)

    print("✅ Data from Yahoo Finance scraping")
    print(df_scrape.head())


✅ Data from Alpha Vantage API
         date      Open      High       Low     Close     Volume
0  2025-08-14  234.0550  235.1200  230.8500  232.7800   51916275
1  2025-08-13  231.0700  235.0000  230.4300  233.3300   69878546
2  2025-08-12  228.0050  230.8000  227.0700  229.6500   55672301
3  2025-08-11  227.9200  229.5600  224.7600  227.1800   61806132
4  2025-08-08  220.8300  231.0000  219.2500  229.3500  113853967


# Project for Bootcamp NYU
Data gathering, cleansing, preprocessing, and maching learning analytics
I use Kaggle repositories to get training and testing set, for this project I use Titanic Project (https://www.kaggle.com/competitions/titanic)

Purpose =  use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.

for more information please refer to Readme File

In [2]:
#import library for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

In [11]:
# load titanic dataset from within seaborn (no csv file)
titanic_df = sns.load_dataset('titanic')
print(titanic_df.shape) # (891, 15)
titanic_df.head(2)

(891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


In [None]:
# load the train_titanic.csv file provided by Kaggle
tk_df = pd.read_csv('data/train_titanic.csv')
print(tk_df.shape) # (891, 12)
tk_df.head(2)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


Based on 

In [12]:
titanic_df.describe()
tk_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# get info about the df:
tk_df.isna().sum()
# L@@K: we are missing a lot of ages and almost all Cabin values are null
# the move is to drop Cabin column altogether and drop missing Age rows
# BUT definitely DO KEEP Age column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# drop the Cabin column:
tk_df.drop(columns=['Cabin'],inplace=True)