In [None]:
#Importing libraries

from google.cloud import storage
from io import StringIO
import pandas as pd

# Source bucket that holds the files

source_bucket_name = "my-big-data-as"

# Create a client object for Cloud Storage

storage_client = storage.Client()

# List the files within the bucket, under the landing folder.
# The trailing slash restricts the match to the "landing/" folder itself;
# a bare "landing" prefix would also match sibling paths whose names merely
# start with "landing" (for example "landing_old/...").

blobs = storage_client.list_blobs(source_bucket_name, prefix="landing/")

#Data cleaning function

def clean_data(df, ticker_symbol):
    """Return a cleaned copy of *df* tagged with *ticker_symbol*.

    The input frame is left untouched: the ticker column and the parsed
    datetime column are added to a new frame rather than written back
    into the caller's object.

    Args:
        df: DataFrame holding at least the columns listed in
            ``columns_to_keep`` (apart from ``ticker_symbol``, which is
            added here).
        ticker_symbol: Value stored in the ``ticker_symbol`` column of
            every row.

    Returns:
        A new DataFrame restricted to ``columns_to_keep``, with
        ``datetime`` converted via ``pd.to_datetime`` and every row that
        contains a null removed.

    Raises:
        KeyError: If a required column is missing from *df*.
    """
    # Columns/attributes that make it into the cleaned output, in order
    columns_to_keep = ['datetime', 'ticker_symbol', 'high', 'close', 'open', 'volume', 'obv_0', 'mom_3',
                       'ema_3', 'bbands_3_upperband', 'bbands_3_middleband', 'bbands_3_lowerband']

    # assign() builds a new frame, so the caller's DataFrame is not mutated
    tagged = df.assign(
        ticker_symbol=ticker_symbol,
        datetime=pd.to_datetime(df['datetime']),
    )

    # Keep only the needed columns, then remove rows with nulls
    cleaned = tagged[columns_to_keep].dropna()

    # Print the head as a quick visual check of the result
    print(cleaned.head())

    return cleaned

# Loop through the listed blobs and convert every CSV file to Parquet

for blob in blobs:
    # Guard clause: anything that is not a CSV file is ignored
    if not blob.name.endswith('.csv'):
        continue

    print(f"Processing file: {blob.name}")

    # Read the CSV into a DataFrame
    csv_data = blob.download_as_text()
    df = pd.read_csv(StringIO(csv_data))

    # Extract the ticker symbol: the part of the file name before the
    # first underscore (e.g. "A.US_D1.csv" -> "A.US")
    filename = blob.name.split('/')[-1]
    ticker_symbol = filename.split('_')[0]

    # Clean the data by calling the cleaning function
    df = clean_data(df, ticker_symbol)

    # Swap only the trailing ".csv" extension; str.replace would also
    # rewrite any ".csv" that appears earlier in the file name
    parquet_name = filename[:-len('.csv')] + '.parquet'

    # Write the cleaned DataFrame to the cleaned folder as a Parquet file
    cleaned_file_path = f"gs://{source_bucket_name}/cleaned/{parquet_name}"
    df.to_parquet(cleaned_file_path, index=False)
    print(f"Cleaned data written to: {cleaned_file_path}")

Processing file: landing/D1/A.US_D1.csv
    datetime ticker_symbol   high  close   open   volume       obv_0  mom_3  \
3 1999-11-23          A.US  42.94  40.38  42.50  3980200  18589000.0  -2.06   
4 1999-11-24          A.US  41.94  41.00  40.13  3369900  21958900.0   0.31   
5 1999-11-26          A.US  41.50  41.19  40.88  1235800  23194700.0  -1.88   
6 1999-11-29          A.US  42.44  41.94  41.00  2823500  26018200.0   1.56   
7 1999-11-30          A.US  42.94  42.25  42.00  2879800  28898000.0   1.25   

   ema_3  bbands_3_upperband  bbands_3_middleband  bbands_3_lowerband  
3  41.22               43.78                41.38               38.98  
4  41.11               43.78                41.48               39.18  
5  41.15               41.55                40.86               40.16  
6  41.55               42.19                41.38               40.57  
7  41.90               42.68                41.79               40.90  
Cleaned data written to: gs://my-big-data-as/cleaned/

    datetime ticker_symbol   high  close   open   volume      obv_0  mom_3  \
3 1998-01-07       ADBE.US  42.50  41.81  42.38  2135003  1176030.0   1.37   
4 1998-01-08       ADBE.US  42.76  42.19  41.63  1072904  2248934.0   1.19   
5 1998-01-09       ADBE.US  42.19  41.19  42.00   786202  1462732.0  -1.31   
6 1998-01-12       ADBE.US  41.88  40.69  40.38  1065002   397730.0  -1.12   
7 1998-01-13       ADBE.US  41.00  40.00  40.75  1070207  -672477.0  -2.19   

   ema_3  bbands_3_upperband  bbands_3_middleband  bbands_3_lowerband  
3  41.56               43.00                41.77               40.54  
4  41.88               42.73                42.17               41.60  
5  41.53               42.55                41.73               40.91  
6  41.11               42.60                41.36               40.11  
7  40.56               41.60                40.63               39.65  
Cleaned data written to: gs://my-big-data-as/cleaned/ADBE.US_D1.parquet
Processing file: landing/D1

    datetime ticker_symbol    high   close    open   volume      obv_0  mom_3  \
3 1998-01-07        AIG.US  109.07  107.88  109.06  1100400 -3316703.0  -2.00   
4 1998-01-08        AIG.US  107.69  107.31  107.50   896000 -4212703.0  -2.38   
5 1998-01-09        AIG.US  107.50  103.19  107.13  1374206 -5586909.0  -5.81   
6 1998-01-12        AIG.US  101.94  101.19  101.00  2298711 -7885620.0  -6.69   
7 1998-01-13        AIG.US  103.13  101.00  101.50  1749023 -9634643.0  -6.31   

    ema_3  bbands_3_upperband  bbands_3_middleband  bbands_3_lowerband  
3  108.70              110.35               108.86              107.36  
4  108.01              109.47               108.06              106.66  
5  105.60              110.31               106.13              101.95  
6  103.39              108.99               103.90               98.80  
7  102.20              103.77               101.79               99.81  
Cleaned data written to: gs://my-big-data-as/cleaned/AIG.US_D1.parquet
Proc

    datetime ticker_symbol   high  close   open  volume     obv_0  mom_3  \
3 2007-05-02       AMCR.US  33.84  33.74  33.49  345200  637800.0  -0.20   
4 2007-05-03       AMCR.US  33.89  33.63  33.75  347900  289900.0   0.42   
5 2007-05-04       AMCR.US  33.89  33.55  33.88  304900  -15000.0   0.07   
6 2007-05-07       AMCR.US  33.80  33.48  33.49  219500 -234500.0  -0.26   
7 2007-05-08       AMCR.US  33.41  33.30  33.33  298500 -533000.0  -0.33   

   ema_3  bbands_3_upperband  bbands_3_middleband  bbands_3_lowerband  
3  33.64               33.91                33.48               33.04  
4  33.64               33.83                33.62               33.40  
5  33.59               33.80                33.64               33.48  
6  33.54               33.68                33.55               33.43  
7  33.42               33.65                33.44               33.23  
Cleaned data written to: gs://my-big-data-as/cleaned/AMCR.US_D1.parquet
Processing file: landing/D1/AMD.US_D1.c

Cleaned data written to: gs://my-big-data-as/cleaned/ANSS.US_D1.parquet
Processing file: landing/D1/AON.US_D1.csv
    datetime ticker_symbol   high  close   open  volume     obv_0  mom_3  \
3 1998-01-07        AON.US  58.56  58.25  57.44  219000  309160.0   0.12   
4 1998-01-08        AON.US  58.09  57.22  58.09   87900  221260.0  -0.25   
5 1998-01-09        AON.US  57.31  55.00  57.25  286900  -65640.0  -3.07   
6 1998-01-12        AON.US  56.25  56.00  54.63  207590  141950.0  -2.25   
7 1998-01-13        AON.US  56.98  56.50  56.00  257500  399450.0  -0.72   

   ema_3  bbands_3_upperband  bbands_3_middleband  bbands_3_lowerband  
3  58.07               58.60                57.93               57.26  
4  57.64               58.75                57.85               56.95  
5  56.32               59.54                56.82               54.11  
6  56.16               57.89                56.07               54.26  
7  56.33               57.08                55.83               54.59