In [1]:
!pip install boto3 -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/140.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/14.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/14.6 MB[0m [31m115.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m9.9/14.6 MB[0m [31m139.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m14.6/14.6 MB[0m [31m177.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m14.6/14.6 MB[0m [31m177.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0

In [2]:
# AWS params
from google.colab import userdata
# Keyword arguments later unpacked into boto3.client(); every value is read
# from the Colab secret store so no credential is written in the notebook.
AWS_PARAMS = dict(
    aws_access_key_id=userdata.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=userdata.get('AWS_SECRET_ACCESS_KEY'),
    region_name=userdata.get('AWS_REGION'),
)

In [3]:
import pandas as pd
import boto3
from io import BytesIO
from datetime import datetime

In [4]:
def get_s3_client():
  """Create an S3 client configured from the module-level ``AWS_PARAMS``.

  Returns:
    The object returned by ``boto3.client('s3', ...)``.
  """
  s3_client = boto3.client('s3', **AWS_PARAMS)
  return s3_client

In [5]:
# consolidation (joining all parquet files into a single DataFrame)
def read_silver_data(s3_client, bucket_name):
  """Load every parquet object under the silver prefix into one DataFrame.

  The prefix is listed page by page: a single ``list_objects_v2`` response
  holds at most 1,000 keys, so continuation tokens are followed until the
  listing is exhausted. Each key ending in ``.parquet`` is then downloaded
  and the resulting frames are concatenated.

  Args:
    s3_client: Object exposing ``list_objects_v2`` and ``get_object`` with
      the boto3 S3 client call shape.
    bucket_name: Name of the bucket to read from.

  Returns:
    One DataFrame with a fresh 0..n-1 index, or ``None`` when the listing
    has no ``Contents`` at all or none of the listed keys is a parquet file.
  """
  prefix_silver = 'silver/yahoo_finance/market_analysis/'

  # manual crawler: keep requesting pages while S3 reports the listing as
  # truncated, otherwise objects past the first page are silently dropped
  list_kwargs = {'Bucket': bucket_name, 'Prefix': prefix_silver}
  parquet_keys = []
  found_contents = False

  while True:
    response = s3_client.list_objects_v2(**list_kwargs)

    if 'Contents' in response:
      found_contents = True
      parquet_keys.extend(
          obj['Key'] for obj in response['Contents']
          if obj['Key'].endswith('.parquet')
      )

    next_token = response.get('NextContinuationToken')
    if not response.get('IsTruncated') or not next_token:
      break
    list_kwargs['ContinuationToken'] = next_token

  if not found_contents:
    print('No objects found in the specified prefix.')
    return None

  # reading all parquet files from s3 bucket
  all_dfs = []
  for key in parquet_keys:
    file_obj = s3_client.get_object(Bucket=bucket_name, Key=key)
    all_dfs.append(pd.read_parquet(BytesIO(file_obj['Body'].read())))

  return pd.concat(all_dfs, ignore_index=True) if all_dfs else None

In [6]:
def apply_bussines_rules_gold(df):
  """Return a copy of ``df`` with the gold-layer business columns added.

  Columns added:
    market_cap_rank: Integer rank of ``market_cap``, 1 = largest. Tied values
      share the lowest rank of their group, and rows with a missing
      ``market_cap`` are ranked last instead of making the integer cast fail.
    upside_pct: ``(target_price / price - 1) * 100`` when both columns are
      present; otherwise the constant 0.

  Args:
    df: DataFrame containing at least a ``market_cap`` column.

  Returns:
    A new DataFrame; the frame passed in is left untouched so re-running the
    step on the same input gives the same result.

  Raises:
    KeyError: If ``market_cap`` is not a column of ``df``.
  """
  df = df.copy()

  # market cap ranking (biggest corporation in my data lake)
  # method='min' keeps ranks integral on ties (the default 'average' yields
  # values such as 1.5 that the int cast would truncate); na_option='bottom'
  # gives missing caps the worst rank so astype(int) never sees NaN.
  df['market_cap_rank'] = (
      df['market_cap']
      .rank(ascending=False, method='min', na_option='bottom')
      .astype(int)
  )

  # upside (valorization potential)
  if 'price' in df.columns and 'target_price' in df.columns:
    df['upside_pct'] = ((df['target_price'] / df['price']) - 1) * 100
  else:
    df['upside_pct'] = 0 # Fallback if the columns do not exist

  return df

In [7]:
def final_dataframe_columns_organization(df):
  """Select the gold report columns and order rows by market-cap rank.

  Only the report columns that actually exist in ``df`` are kept, in the
  order listed below. Rows are sorted by ``market_cap_rank`` when that column
  is present; when it is absent the row order is left as-is rather than
  raising, matching the tolerant column selection.

  Args:
    df: DataFrame produced by the business-rules step.

  Returns:
    A new DataFrame restricted to the available report columns.
  """
  cols = ['ticker', 'company_name', 'price', 'market_cap', 'market_cap_rank', 'upside_pct', 'processed_at']
  existing_cols = [col for col in cols if col in df.columns]
  df_gold = df[existing_cols]

  # The sort key is subject to the same "may be missing" filter as every
  # other column, so only sort when it survived the selection.
  if 'market_cap_rank' in existing_cols:
    df_gold = df_gold.sort_values(by='market_cap_rank')

  return df_gold

In [8]:
def save_gold_to_s3(s3_client, df, bucket_name):
  """Serialize ``df`` to parquet in memory and upload it to the gold layer.

  The object key embeds the current local date (``YYYY-MM-DD``) as returned
  by ``datetime.now()``.

  Args:
    s3_client: Object exposing ``put_object`` with the boto3 call shape.
    df: Frame to persist; written without its index.
    bucket_name: Destination bucket.

  Returns:
    The key under which the object was stored.
  """
  # Serialize first, entirely in memory, so nothing touches the local disk.
  parquet_buffer = BytesIO()
  df.to_parquet(parquet_buffer, index=False)
  payload = parquet_buffer.getvalue()

  date_str = datetime.now().strftime('%Y-%m-%d')
  gold_key = f'gold/stock_report/final_market_analysis_{date_str}.parquet'

  s3_client.put_object(Bucket=bucket_name, Key=gold_key, Body=payload)
  return gold_key

In [9]:
# Orchestrator Function
def run_gold_pipeline(bucket_name):
  """Run the gold pipeline end to end for one bucket.

  Steps: create the S3 client, read the silver data, apply the business
  rules, organize the final columns and upload the result.

  Args:
    bucket_name: Bucket that is both read from and written to.

  Returns:
    The final gold DataFrame, or ``None`` when the read step returned
    ``None``.
  """
  print('Starting Gold Pipeline...')
  s3 = get_s3_client()

  silver = read_silver_data(s3, bucket_name)

  # Guard clause: nothing was read, so there is nothing to transform or save.
  if silver is None:
    print('No data to process')
    return None

  enriched = apply_bussines_rules_gold(silver)
  df_final = final_dataframe_columns_organization(enriched)

  key = save_gold_to_s3(s3, df_final, bucket_name)
  print(f'Gold Layer created: {key}')

  return df_final

In [12]:
if __name__ == '__main__':
  # Entry point: run the pipeline against the project bucket and preview
  # the first 20 rows of the result.
  bucket_name = 'stock-market-monitoring'
  df_gold = run_gold_pipeline(bucket_name)

  if df_gold is None:
    print('No data to process')
  else:
    print(df_gold.head(20))

Starting Gold Pipeline...
Gold Layer created: gold/stock_report/final_market_analysis_2026-01-16.parquet
   ticker                                       company_name     price  \
14   NVDA                                 NVIDIA Corporation  187.9250   
13   NVDA                                 NVIDIA Corporation  181.9415   
5   GOOGL                                      Alphabet Inc.  335.0200   
6   GOOGL                                      Alphabet Inc.  329.6800   
0    AAPL                                         Apple Inc.  260.2500   
1    AAPL                                         Apple Inc.  259.5700   
2    AAPL                                         Apple Inc.  255.9900   
10   MSFT                              Microsoft Corporation  461.1500   
9    MSFT                              Microsoft Corporation  458.3250   
4    AMZN                                   Amazon.com, Inc.  238.2000   
3    AMZN                                   Amazon.com, Inc.  237.5025   
17    T