# Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
# TODO: da rifare per ogni stock
sheet_names = [
    'Info',
    'Historical',
    'Income Statement',
    'Quarterly Income Statement',
    'Cashflow',
    'Institutional Holders',
    'Mutual Fund Holders',
    'Major Holders'
]

#riempire stocks di tutti i vari codici, fare la retrive di tutti i file e buttarli in df_stock per poi poter lavorare su tutti i dati
directory = "./data"
stocks = [os.path.join(directory, file) for file in os.listdir(directory)]
stocks

['./data/GS.xlsx',
 './data/1398.HK.xlsx',
 './data/BA.xlsx',
 './data/005380.KS.xlsx',
 './data/AIR.PA.xlsx',
 './data/KO.xlsx',
 './data/WMT.xlsx',
 './data/005930.KS.xlsx',
 './data/NOVN.SW.xlsx',
 './data/CVX.xlsx',
 './data/EGHT.xlsx',
 './data/MMM.xlsx',
 './data/VZ.xlsx',
 './data/JPM.xlsx',
 './data/JNJ.xlsx',
 './data/AMZN.xlsx',
 './data/9984.T.xlsx',
 './data/RDS.A.xlsx',
 './data/0700.HK.xlsx',
 './data/BE.xlsx',
 './data/AAPL.xlsx',
 './data/PFE.xlsx',
 './data/SEDG.xlsx',
 './data/MSFT.xlsx',
 './data/TM.xlsx',
 './data/VOW3.DE.xlsx',
 './data/PG.xlsx',
 './data/DB.xlsx',
 './data/UL.xlsx',
 './data/TWTR.xlsx',
 './data/XOM.xlsx',
 './data/T.xlsx']

## Integrazione dei dati finanziari
Colonne aggiunte:
- **Daily_Return**: rendimento giornaliero.
- **Target_1day**: indica se il prezzo di chiusura del giorno successivo sarà superiore (1) o inferiore (0) rispetto al prezzo di chiusura del giorno corrente.
- **Target_5days**: indica se il prezzo di chiusura a 5 giorni nel futuro sarà superiore (1) o inferiore (0) rispetto al prezzo di chiusura del giorno corrente.
- **Target_30days**: indica se il prezzo di chiusura a 30 giorni nel futuro sarà superiore (1) o inferiore (0) rispetto al prezzo di chiusura del giorno corrente.

Integrato i vari sheet "Income Statement", "Quarterly Income Statement" e "Cashflow" in un singolo excel. NB: Dato che questi fogli contengono dati finanziari annuali o trimestrali un approccio comune è portare avanti l'ultimo valore noto per ogni giorno fino a quando non si dispone di un nuovo valore. Per alcunii anni finanziari sarà Nan perché non li abbiamo.

Lista delle azioni alle quali mancano pezzi:
- **1398.HK** manca income_statement.normalized_EBITA

# colonne aggiunte
dato che alcune colonne non sono presenti all'interno di tutti i file, pensavamo di aggiungere una serie di colonne utili per l'analisi che saranno presenti ovunque in quanto le aziende sono obbligate a pubblicare questo tipo di dati (i nomi potrebbero non essere gli stessi):
- **Net income** (income statement): This metric measures a company's profit after all expenses and taxes have been paid. It is the most important metric for investors, as it represents the company's bottom line.

- **Diluted EPS** (income statement): This metric measures a company's profit per share of common stock. It is a good measure of a company's profitability per share.

- **Total Revenue** (income statement): This metric measures the total amount of sales that a company generates. It is a good measure of a company's top line growth.

- **Cost of revenue** (income statement): This metric measures the cost of the goods that a company sells. It is an important metric for assessing a company's profitability.

- **Operating revenue** (income statement): This metric measures the non-production costs that a company incurs. It is important to consider operating expenses when assessing a company's profitability and cash flow generation.

- **Cash flow from operating activities** (cash flow): This metric measures the amount of cash that a company generates from its core business operations.

- **Cash flow from investing activities** (cash flow): This metric measures the amount of cash that a company generates from its investments, such as the sale of property, plant, and equipment

- **Cash flow from financing activities** (cash flow): This metric measures the amount of cash that a company generates from its financing activities, such as the issuance of debt or equity.

In [39]:
i = 0
for file in stocks:
    income_stmt = set([
        "Net Income",
        "Diluted EPS",
        "Total Revenue",
        "Cost of revenue",
        "Operating revenue",
    ])
    
    cash_flow = set([
        "Cash flow from operating activities",
        "Cash flow from investing activities",
        "Cash flow from financing activities",
    ])
    
    df_stock = pd.ExcelFile(file)
    df_stock = df_stock.parse("Income Statement")
    
    cols_income_stmt = set(df_stock['Unnamed: 0'].tolist())
    
    df_stock = pd.ExcelFile(file)
    df_stock = df_stock.parse("Cashflow")
    cols_cash_flow = set(df_stock['Unnamed: 0'].tolist())

    elements_in_sublist_but_not_big_list = [element for element in cash_flow if element not in cols_cash_flow]
    print(f"action: {file} {elements_in_sublist_but_not_big_list}")

action: ./data/GS.xlsx ['Cash flow from operating activities', 'Cash flow from investing activities', 'Cash flow from financing activities']
action: ./data/1398.HK.xlsx ['Cash flow from operating activities', 'Cash flow from investing activities', 'Cash flow from financing activities']
action: ./data/BA.xlsx ['Cash flow from operating activities', 'Cash flow from investing activities', 'Cash flow from financing activities']
action: ./data/005380.KS.xlsx ['Cash flow from operating activities', 'Cash flow from investing activities', 'Cash flow from financing activities']
action: ./data/AIR.PA.xlsx ['Cash flow from operating activities', 'Cash flow from investing activities', 'Cash flow from financing activities']
action: ./data/KO.xlsx ['Cash flow from operating activities', 'Cash flow from investing activities', 'Cash flow from financing activities']
action: ./data/WMT.xlsx ['Cash flow from operating activities', 'Cash flow from investing activities', 'Cash flow from financing activitie

In [13]:
# TODO: da fare per ogni stock
counter = 0
df = pd.DataFrame()
for file in stocks:
    if file.split("/")[2][:-5] != "1398.HK":
        df_stock = pd.ExcelFile(file)
        
        # prevent false postive warnings, reference_ https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
        pd.options.mode.chained_assignment = None # default='warn'
        
        # Loading the 'Historical' data stock
        historical_data = df_stock.parse('Historical')
        
        # Renaming and setting the Date column
        historical_data.rename(columns={'Unnamed: 0': 'Date'}, inplace=True)
        historical_data['Date'] = pd.to_datetime(historical_data['Date'])
        historical_data.set_index('Date', inplace=True)
        # Calculate daily return
        historical_data['Daily_Return'] = historical_data['Close'].pct_change()
        
        # Create target variables for next day, next 5 days and next 30 days
        historical_data['Target_1day'] = (historical_data['Close'].shift(-1) > historical_data['Close']).astype(int)
        historical_data['Target_5days'] = (historical_data['Close'].shift(-5) > historical_data['Close']).astype(int)
        historical_data['Target_30days'] = (historical_data['Close'].shift(-30) > historical_data['Close']).astype(int)
        
        # Drop rows with NaN values (will be present due to the shifting for target creation)
        historical_data = historical_data.dropna()
        
        # Loading the 'Income Statement' data for XOM
        income_statement = df_stock.parse('Income Statement')
        
        # Transposing the data for easier integration
        income_statement = income_statement.set_index('Unnamed: 0').transpose()
        income_statement.index = pd.to_datetime(income_statement.index)
        
        
        # Selecting some of the key financial metrics (you can add or remove based on relevance)
        selected_metrics = [
            'Normalized EBITDA',
            'Total Unusual Items',
            'Total Unusual Items Excluding Goodwill'
        ]
        
        # check if columns exist, in case create them
        for metric in selected_metrics:
            if metric not in income_statement.columns:
                income_statement[metric] = np.nan
                
        
        income_statement = income_statement[selected_metrics]
        
        # Merging the income statement data with the historical data
        merged_data = historical_data.join(income_statement, how='left')
        
        # Forward filling the NaN values
        merged_data[selected_metrics] = merged_data[selected_metrics].fillna(method='ffill')
        
        # Loading the 'Cashflow' data for XOM
        cashflow = df_stock.parse('Cashflow')
        
        # Transposing the data for easier integration
        cashflow = cashflow.set_index('Unnamed: 0').transpose()
        cashflow.index = pd.to_datetime(cashflow.index)
        
        # Selecting some of the key cashflow metrics (you can add or remove based on relevance)
        selected_cashflow_metrics = [
            'Operating Cash Flow',
            'Capital Expenditure',
            'Free Cash Flow'
        ]
        
        for metric in selected_cashflow_metrics:
            if metric not in cashflow.columns:
                cashflow[metric] = np.nan
        
        cashflow = cashflow[selected_cashflow_metrics]
        
        # Merging the cashflow data with the existing dataframe
        merged_data = merged_data.join(cashflow, how='left', rsuffix='_cashflow')
        
        # Forward filling the NaN values
        merged_data[selected_cashflow_metrics] = merged_data[selected_cashflow_metrics].fillna(method='ffill')
        
        if 'Ticker' not in merged_data.columns:
            merged_data['Ticker'] = file.split("/")[2].replace(".", "")[:-4]
        
        # Display the updated dataframe with integrated cashflow metrics
        merged_data.iloc[counter : counter + len(merged_data), merged_data.columns.get_loc("Ticker")] = file.split("/")[2].replace(".", "")[:-4]
        
        counter = len(merged_data)

        # Moving Averages
        merged_data['MA_5'] = merged_data['Close'].rolling(window=5).mean()
        merged_data['MA_10'] = merged_data['Close'].rolling(window=10).mean()
        merged_data['MA_30'] = merged_data['Close'].rolling(window=30).mean()
        merged_data['MA_50'] = merged_data['Close'].rolling(window=50).mean()
        
        # RSI
        delta = merged_data['Close'].diff()
        gain = (delta.where(delta > 0, 0)).fillna(0)
        loss = (-delta.where(delta < 0, 0)).fillna(0)
        avg_gain = gain.rolling(window=14).mean()
        avg_loss = loss.rolling(window=14).mean()
        rs = avg_gain / avg_loss
        merged_data['RSI'] = 100 - (100 / (1 + rs))
        
        # MACD
        merged_data['MACD'] = merged_data['Close'].ewm(span=12, adjust=False).mean() - merged_data['Close'].ewm(span=26, adjust=False).mean()
        merged_data['Signal_Line'] = merged_data['MACD'].ewm(span=9, adjust=False).mean()
        
        # Bollinger Bands
        merged_data['Bollinger_Mid_Band'] = merged_data['Close'].rolling(window=20).mean()
        merged_data['Bollinger_Upper_Band']  = merged_data['Bollinger_Mid_Band'] + 1.96*merged_data['Close'].rolling(window=20).std()
        merged_data['Bollinger_Lower_Band']  = merged_data['Bollinger_Mid_Band'] - 1.96*merged_data['Close'].rolling(window=20).std()
        
        # Volatility
        merged_data['Volatility'] = merged_data['Daily_Return'].rolling(window=5).std()
        
        to_drop_na = ['MA_5', 'MA_10', 'MA_30', 'MA_50', 'RSI', 'Volatility']
        
        for column in to_drop_na:
            merged_data[column] = merged_data[column].fillna(0)

        merged_data = merged_data[merged_data.index >= '2020-06-30']
        #indices_to_drop = merged_data.index[merged_data.isna().sum(axis=1) > 3].tolist()
        
        #merged_data.drop(indices_to_drop, inplace=True)
        
        # Export in Excel company data
        if not os.path.exists('./Processed'):
            os.makedirs('./Processed')
        with pd.ExcelWriter(f'./Processed/{file.split("/")[2][:-5]}.xlsx', mode = "w", engine = "openpyxl") as writer:
            merged_data.to_excel(writer, sheet_name="Sheet1")
        # Append to one single dataframe
        df = pd.concat([df, merged_data])

# aggiornamento
dato che alcune aziende hanno delle colonne completamente vuote, eliminare i record con i NaN implcava che l'intera azienda non sarebbe stata considerata.
per questo motivo abbiamo deciso si non rimuovere le aziende, ma bensì non inserire nel dataset quelle colonne; in particola le colonne che devono essere rimosse sono:
- `Normalized EBITDA`
- `Total Unusual Items`
- `Total Unusual Items Excluding Goodwill`
- `Operating Cash Flow`
- `Capital Expenditure`
- `Free Cash Flow`
di tutte queste colonne alcune in realtà possono essere lasciate, bisogna valutare bene quali

In [6]:
# TODO: spiegare perchè togliamo i quarterly
#merged_data.drop(columns=['Normalized EBITDA_quarterly', 'Total Unusual Items_quarterly', 'Total Unusual Items Excluding Goodwill_quarterly'], inplace=True)
merged_data.iloc[1 : counter + len(merged_data), merged_data.columns.get_loc("Ticker")] = file.split("/")[2][:-5]
merged_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily_Return,Target_1day,Target_5days,...,MA_10,MA_30,MA_50,RSI,MACD,Signal_Line,Bollinger_Mid_Band,Bollinger_Upper_Band,Bollinger_Lower_Band,Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-30,17.141609,17.330104,16.970250,17.267273,57979814,0.0,0.0,0.010699,0,1,...,17.115333,17.523739,17.266358,35.285211,-0.130556,-0.053096,17.583143,18.883390,16.282896,0.023715
2020-07-01,17.210154,17.410073,17.050219,17.078779,41007987,0.0,0.0,-0.010916,1,1,...,17.094770,17.541636,17.266700,47.216598,-0.131006,-0.068678,17.537162,18.840489,16.233835,0.019738
2020-07-02,17.198729,17.421497,17.147322,17.181593,49947900,0.0,0.0,0.006020,1,1,...,17.079348,17.550966,17.273669,45.531948,-0.121664,-0.079275,17.488039,18.766713,16.209365,0.019464
2020-07-06,17.444342,17.478614,17.272982,17.415783,44639852,0.0,0.0,0.013630,0,0,...,17.089630,17.564675,17.284979,49.902144,-0.094276,-0.082275,17.422923,18.568780,16.277066,0.014230
2020-07-07,17.307257,17.392937,17.267274,17.318682,36422181,0.0,0.0,-0.005575,1,1,...,17.101625,17.573052,17.291947,45.400056,-0.079490,-0.081718,17.339814,18.232285,16.447342,0.010587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-09,14.470000,14.750000,14.460000,14.730000,30659800,0.0,0.0,0.019377,1,0,...,14.580206,14.557477,14.300160,45.736901,0.044443,0.082328,14.684344,15.194776,14.173913,0.012248
2023-10-10,14.690000,14.980000,14.690000,14.960000,35746800,0.0,0.0,0.015614,0,0,...,14.602363,14.591356,14.314404,43.159902,0.070110,0.079884,14.722898,15.192227,14.253568,0.013216
2023-10-11,14.950000,15.050000,14.680000,14.770000,31793500,0.0,0.0,-0.012701,0,0,...,14.620238,14.600586,14.329165,44.079070,0.074263,0.078760,14.744101,15.180185,14.308016,0.014393
2023-10-12,14.640000,14.670000,14.280000,14.450000,38543700,0.0,0.0,-0.021666,0,0,...,14.592375,14.600456,14.336545,41.485628,0.051143,0.073237,14.727716,15.181968,14.273465,0.017965


## Feature Engineering
- **Medie mobili**: Calcoliamo le medie mobili a breve e lungo termine per il prezzo di chiusura, che sono comuni nel trading algoritmico. Ad esempio, medie mobili a 5, 10, 30 e 50 giorni.
- **RSI (Relative Strength Index)**: Questo è un indicatore di momentum che può aiutare a identificare se un'azione è in condizione di "overbought" o "oversold".
- **MACD (Moving Average Convergence Divergence)**: Un altro indicatore di momentum.
- **Bollinger Bands**: Questi sono basati su medie mobili e possono aiutare a identificare se un prezzo è relativamente alto o basso.
- **Volatilità**: Potremmo calcolare la volatilità come la deviazione standard dei rendimenti giornalieri in una finestra temporale specifica.

In [7]:
# TODO: da fare per ogni stock
# TODO: controllare gpt

# Moving Averages
merged_data['MA_5'] = merged_data['Close'].rolling(window=5).mean()
merged_data['MA_10'] = merged_data['Close'].rolling(window=10).mean()
merged_data['MA_30'] = merged_data['Close'].rolling(window=30).mean()
merged_data['MA_50'] = merged_data['Close'].rolling(window=50).mean()

# RSI
delta = merged_data['Close'].diff()
gain = (delta.where(delta > 0, 0)).fillna(0)
loss = (-delta.where(delta < 0, 0)).fillna(0)
avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()
rs = avg_gain / avg_loss
merged_data['RSI'] = 100 - (100 / (1 + rs))

# MACD
merged_data['MACD'] = merged_data['Close'].ewm(span=12, adjust=False).mean() - merged_data['Close'].ewm(span=26, adjust=False).mean()
merged_data['Signal_Line'] = merged_data['MACD'].ewm(span=9, adjust=False).mean()

# Bollinger Bands
merged_data['Bollinger_Mid_Band'] = merged_data['Close'].rolling(window=20).mean()
merged_data['Bollinger_Upper_Band']  = merged_data['Bollinger_Mid_Band'] + 1.96*merged_data['Close'].rolling(window=20).std()
merged_data['Bollinger_Lower_Band']  = merged_data['Bollinger_Mid_Band'] - 1.96*merged_data['Close'].rolling(window=20).std()

# Volatility
merged_data['Volatility'] = merged_data['Daily_Return'].rolling(window=5).std()

to_drop_na = ['MA_5', 'MA_10', 'MA_30', 'MA_50', 'RSI', 'Volatility']

for column in to_drop_na:
    merged_data[column] = merged_data[column].fillna(0)

# Display the dataset with new features
merged_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily_Return,Target_1day,Target_5days,...,MA_10,MA_30,MA_50,RSI,MACD,Signal_Line,Bollinger_Mid_Band,Bollinger_Upper_Band,Bollinger_Lower_Band,Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-30,17.141609,17.330104,16.970250,17.267273,57979814,0.0,0.0,0.010699,0,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,0.000000
2020-07-01,17.210154,17.410073,17.050219,17.078779,41007987,0.0,0.0,-0.010916,1,1,...,0.000000,0.000000,0.000000,0.000000,-0.015037,-0.003007,,,,0.000000
2020-07-02,17.198729,17.421497,17.147322,17.181593,49947900,0.0,0.0,0.006020,1,1,...,0.000000,0.000000,0.000000,0.000000,-0.018444,-0.006095,,,,0.000000
2020-07-06,17.444342,17.478614,17.272982,17.415783,44639852,0.0,0.0,0.013630,0,0,...,0.000000,0.000000,0.000000,0.000000,-0.002222,-0.005320,,,,0.000000
2020-07-07,17.307257,17.392937,17.267274,17.318682,36422181,0.0,0.0,-0.005575,1,1,...,0.000000,0.000000,0.000000,0.000000,0.002767,-0.003703,,,,0.010587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-09,14.470000,14.750000,14.460000,14.730000,30659800,0.0,0.0,0.019377,1,0,...,14.580206,14.557477,14.300160,45.736901,0.044443,0.082328,14.684344,15.194776,14.173913,0.012248
2023-10-10,14.690000,14.980000,14.690000,14.960000,35746800,0.0,0.0,0.015614,0,0,...,14.602363,14.591356,14.314404,43.159902,0.070110,0.079884,14.722898,15.192227,14.253568,0.013216
2023-10-11,14.950000,15.050000,14.680000,14.770000,31793500,0.0,0.0,-0.012701,0,0,...,14.620238,14.600586,14.329165,44.079070,0.074263,0.078760,14.744101,15.180185,14.308016,0.014393
2023-10-12,14.640000,14.670000,14.280000,14.450000,38543700,0.0,0.0,-0.021666,0,0,...,14.592375,14.600456,14.336545,41.485628,0.051143,0.073237,14.727716,15.181968,14.273465,0.017965


In [8]:
# TODO: spiegare perche tagliamo il numero di record
merged_data = merged_data[merged_data.index >= '2020-06-30']
merged_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily_Return,Target_1day,Target_5days,...,MA_10,MA_30,MA_50,RSI,MACD,Signal_Line,Bollinger_Mid_Band,Bollinger_Upper_Band,Bollinger_Lower_Band,Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-30,17.141609,17.330104,16.970250,17.267273,57979814,0.0,0.0,0.010699,0,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,0.000000
2020-07-01,17.210154,17.410073,17.050219,17.078779,41007987,0.0,0.0,-0.010916,1,1,...,0.000000,0.000000,0.000000,0.000000,-0.015037,-0.003007,,,,0.000000
2020-07-02,17.198729,17.421497,17.147322,17.181593,49947900,0.0,0.0,0.006020,1,1,...,0.000000,0.000000,0.000000,0.000000,-0.018444,-0.006095,,,,0.000000
2020-07-06,17.444342,17.478614,17.272982,17.415783,44639852,0.0,0.0,0.013630,0,0,...,0.000000,0.000000,0.000000,0.000000,-0.002222,-0.005320,,,,0.000000
2020-07-07,17.307257,17.392937,17.267274,17.318682,36422181,0.0,0.0,-0.005575,1,1,...,0.000000,0.000000,0.000000,0.000000,0.002767,-0.003703,,,,0.010587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-09,14.470000,14.750000,14.460000,14.730000,30659800,0.0,0.0,0.019377,1,0,...,14.580206,14.557477,14.300160,45.736901,0.044443,0.082328,14.684344,15.194776,14.173913,0.012248
2023-10-10,14.690000,14.980000,14.690000,14.960000,35746800,0.0,0.0,0.015614,0,0,...,14.602363,14.591356,14.314404,43.159902,0.070110,0.079884,14.722898,15.192227,14.253568,0.013216
2023-10-11,14.950000,15.050000,14.680000,14.770000,31793500,0.0,0.0,-0.012701,0,0,...,14.620238,14.600586,14.329165,44.079070,0.074263,0.078760,14.744101,15.180185,14.308016,0.014393
2023-10-12,14.640000,14.670000,14.280000,14.450000,38543700,0.0,0.0,-0.021666,0,0,...,14.592375,14.600456,14.336545,41.485628,0.051143,0.073237,14.727716,15.181968,14.273465,0.017965


In [9]:
# da fare per ogni stock
output_filepath = "processed_nomedellostock.xlsx"
len(merged_data)
merged_data.to_excel(output_filepath)


In [10]:
# TODO list updated at 2023-10-23
# TODO: esportare in Excel il dataset finale
# TODO (opzionale) : refactor cartella Processed dentro a data 
# TODO: tagliare i dati al 30/06/2020

if not os.path.exists('./Processed'):
    os.makedirs('./Processed')
with pd.ExcelWriter(f'./Processed/{file.split("/")[2][:-5]}.xlsx', mode = "w", engine = "openpyxl") as writer:
    merged_data.to_excel(writer, sheet_name="Sheet1")
    # Append to one single dataframe
df = pd.concat([df, merged_data])