# Data Preprocessing

In [11]:
import pandas as pd

In [12]:
# TODO: da rifare per ogni stock
sheet_names = [
    'Info',
    'Historical',
    'Income Statement',
    'Quarterly Income Statement',
    'Cashflow',
    'Institutional Holders',
    'Mutual Fund Holders',
    'Major Holders'
]
#riempire stocks di tutti i vari codici, fare la retrive di tutti i file e buttarli in df_stock per poi poter lavorare su tutti i dati
stocks = ['./data/MSFT.xlsx']
df_stock = pd.ExcelFile(stocks[0])


## Integrazione dei dati finanziarii
Colonne aggiunte:
- **Daily_Return**: rendimento giornaliero.
- **Target_1day**: indica se il prezzo di chiusura del giorno successivo sarà superiore (1) o inferiore (0) rispetto al prezzo di chiusura del giorno corrente.
- **Target_5days**: indica se il prezzo di chiusura a 5 giorni nel futuro sarà superiore (1) o inferiore (0) rispetto al prezzo di chiusura del giorno corrente.
- **Target_30days**: indica se il prezzo di chiusura a 30 giorni nel futuro sarà superiore (1) o inferiore (0) rispetto al prezzo di chiusura del giorno corrente.

Integrato i vari sheet  "Income Statement", "Quarterly Income Statement" e "Cashflow" in un singolo excel. NB: Dato che questi fogli contengono dati finanziari annuali o trimestrali un approccio comune è portare avanti l'ultimo valore noto per ogni giorno fino a quando non si dispone di un nuovo valore. Per alcunii anni finanziari sarà Nan perché non li abbiamo.

In [17]:
# TODO: da fare per ogni stock

# prevent false postive warnings, reference_ https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
pd.options.mode.chained_assignment = None # default='warn'

# Loading the 'Historical' data stock
historical_data = df_stock.parse('Historical')

# Renaming and setting the Date column
historical_data.rename(columns={'Unnamed: 0': 'Date'}, inplace=True)
historical_data['Date'] = pd.to_datetime(historical_data['Date'])
historical_data.set_index('Date', inplace=True)

# Calculate daily return
historical_data['Daily_Return'] = historical_data['Close'].pct_change()

# Create target variables for next day, next 5 days and next 30 days
historical_data['Target_1day'] = (historical_data['Close'].shift(-1) > historical_data['Close']).astype(int)
historical_data['Target_5days'] = (historical_data['Close'].shift(-5) > historical_data['Close']).astype(int)
historical_data['Target_30days'] = (historical_data['Close'].shift(-30) > historical_data['Close']).astype(int)

# Drop rows with NaN values (will be present due to the shifting for target creation)
historical_data = historical_data.dropna()

# Loading the 'Income Statement' data for XOM
income_statement = df_stock.parse('Income Statement')

# Transposing the data for easier integration
income_statement = income_statement.set_index('Unnamed: 0').transpose()
income_statement.index = pd.to_datetime(income_statement.index)

# Selecting some of the key financial metrics (you can add or remove based on relevance)
selected_metrics = [
    'Normalized EBITDA',
    'Total Unusual Items',
    'Total Unusual Items Excluding Goodwill'
]

income_statement = income_statement[selected_metrics]

# Merging the income statement data with the historical data
merged_data = historical_data.join(income_statement, how='left')

# Forward filling the NaN values
merged_data[selected_metrics] = merged_data[selected_metrics].fillna(method='ffill')

# Loading the 'Quarterly Income Statement' data for XOM
quarterly_income_statement = df_stock.parse('Quarterly Income Statement')

# Transposing the data for easier integration
quarterly_income_statement = quarterly_income_statement.set_index('Unnamed: 0').transpose()
quarterly_income_statement.index = pd.to_datetime(quarterly_income_statement.index)

# Selecting some of the key financial metrics (you can add or remove based on relevance)
selected_metrics_quarterly = [
    'Normalized EBITDA',
    'Total Unusual Items',
    'Total Unusual Items Excluding Goodwill'
]

quarterly_income_statement = quarterly_income_statement[selected_metrics_quarterly]

# Merging the quarterly income statement data with the existing dataframe
merged_data = merged_data.join(quarterly_income_statement, how='left', rsuffix='_quarterly')

# Forward filling the NaN values
merged_data[selected_metrics_quarterly] = merged_data[selected_metrics_quarterly].fillna(method='ffill')

# Loading the 'Cashflow' data for XOM
cashflow = df_stock.parse('Cashflow')

# Transposing the data for easier integration
cashflow = cashflow.set_index('Unnamed: 0').transpose()
cashflow.index = pd.to_datetime(cashflow.index)

# Selecting some of the key cashflow metrics (you can add or remove based on relevance)
selected_cashflow_metrics = [
    'Operating Cash Flow',
    'Capital Expenditure',
    'Free Cash Flow'
]

cashflow = cashflow[selected_cashflow_metrics]

# Merging the cashflow data with the existing dataframe
merged_data = merged_data.join(cashflow, how='left', rsuffix='_cashflow')

# Forward filling the NaN values
merged_data[selected_cashflow_metrics] = merged_data[selected_cashflow_metrics].fillna(method='ffill')

# Display the updated dataframe with integrated cashflow metrics
merged_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily_Return,Target_1day,Target_5days,Target_30days,Normalized EBITDA,Total Unusual Items,Total Unusual Items Excluding Goodwill,Normalized EBITDA_quarterly,Total Unusual Items_quarterly,Total Unusual Items Excluding Goodwill_quarterly,Operating Cash Flow,Capital Expenditure,Free Cash Flow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1986-03-14,0.060396,0.063632,0.060396,0.062553,308160000,0.0,0.0,0.035712,1,0,1,,,,,,,,,
1986-03-17,0.062553,0.064172,0.062553,0.063632,133171200,0.0,0.0,0.017250,0,0,1,,,,,,,,,
1986-03-18,0.063632,0.064172,0.061475,0.062014,67766400,0.0,0.0,-0.025432,0,0,1,,,,,,,,,
1986-03-19,0.062014,0.062553,0.060396,0.060936,47894400,0.0,0.0,-0.017390,0,0,1,,,,,,,,,
1986-03-20,0.060936,0.060936,0.058779,0.059318,58435200,0.0,0.0,-0.026547,0,1,1,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-09,324.750000,330.299988,323.179993,329.820007,19891200,0.0,0.0,0.007823,0,0,0,1.023840e+11,-15000000.0,-15000000.0,,,,8.758200e+10,-2.810700e+10,5.947500e+10
2023-10-10,330.959991,331.100006,327.670013,328.390015,20557100,0.0,0.0,-0.004336,1,0,0,1.023840e+11,-15000000.0,-15000000.0,,,,8.758200e+10,-2.810700e+10,5.947500e+10
2023-10-11,331.209991,332.820007,329.140015,332.420013,20063200,0.0,0.0,0.012272,0,0,0,1.023840e+11,-15000000.0,-15000000.0,,,,8.758200e+10,-2.810700e+10,5.947500e+10
2023-10-12,330.570007,333.630005,328.720001,331.160004,19313100,0.0,0.0,-0.003790,0,0,0,1.023840e+11,-15000000.0,-15000000.0,,,,8.758200e+10,-2.810700e+10,5.947500e+10


## Feature Engineering
- **Medie mobili**: Calcoliamo le medie mobili a breve e lungo termine per il prezzo di chiusura, che sono comuni nel trading algoritmico. Ad esempio, medie mobili a 5, 10, 30 e 50 giorni.
- **RSI (Relative Strength Index)**: Questo è un indicatore di momentum che può aiutare a identificare se un'azione è in condizione di "overbought" o "oversold".
- **MACD (Moving Average Convergence Divergence)**: Un altro indicatore di momentum.
- **Bollinger Bands**: Questi sono basati su medie mobili e possono aiutare a identificare se un prezzo è relativamente alto o basso.
- **Volatilità**: Potremmo calcolare la volatilità come la deviazione standard dei rendimenti giornalieri in una finestra temporale specifica.

In [19]:
# TODO: da fare per ogni stock
# TODO: controllare gpt

# Moving Averages
merged_data['MA_5'] = merged_data['Close'].rolling(window=5).mean()
merged_data['MA_10'] = merged_data['Close'].rolling(window=10).mean()
merged_data['MA_30'] = merged_data['Close'].rolling(window=30).mean()
merged_data['MA_50'] = merged_data['Close'].rolling(window=50).mean()

# RSI
delta = merged_data['Close'].diff()
gain = (delta.where(delta > 0, 0)).fillna(0)
loss = (-delta.where(delta < 0, 0)).fillna(0)
avg_gain = gain.rolling(window=14).mean()
avg_loss = loss.rolling(window=14).mean()
rs = avg_gain / avg_loss
merged_data['RSI'] = 100 - (100 / (1 + rs))

# MACD
merged_data['MACD'] = merged_data['Close'].ewm(span=12, adjust=False).mean() - merged_data['Close'].ewm(span=26, adjust=False).mean()
merged_data['Signal_Line'] = merged_data['MACD'].ewm(span=9, adjust=False).mean()

# Bollinger Bands
merged_data['Bollinger_Mid_Band'] = merged_data['Close'].rolling(window=20).mean()
merged_data['Bollinger_Upper_Band']  = merged_data['Bollinger_Mid_Band'] + 1.96*merged_data['Close'].rolling(window=20).std()
merged_data['Bollinger_Lower_Band']  = merged_data['Bollinger_Mid_Band'] - 1.96*merged_data['Close'].rolling(window=20).std()

# Volatility
merged_data['Volatility'] = merged_data['Daily_Return'].rolling(window=5).std()

to_drop_na = ['MA_5', 'MA_10', 'MA_30', 'MA_50', 'RSI', 'Volatility']

for column in to_drop_na:
    merged_data[column] = merged_data[column].fillna(0)

# Display the dataset with new features
merged_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Daily_Return,Target_1day,Target_5days,...,MA_10,MA_30,MA_50,RSI,MACD,Signal_Line,Bollinger_Mid_Band,Bollinger_Upper_Band,Bollinger_Lower_Band,Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1986-03-14,0.060396,0.063632,0.060396,0.062553,308160000,0.0,0.0,0.035712,1,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,,,0.000000
1986-03-17,0.062553,0.064172,0.062553,0.063632,133171200,0.0,0.0,0.017250,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000086,0.000017,,,,0.000000
1986-03-18,0.063632,0.064172,0.061475,0.062014,67766400,0.0,0.0,-0.025432,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000023,0.000018,,,,0.000000
1986-03-19,0.062014,0.062553,0.060396,0.060936,47894400,0.0,0.0,-0.017390,0,0,...,0.000000,0.000000,0.000000,0.000000,-0.000112,-0.000008,,,,0.000000
1986-03-20,0.060936,0.060936,0.058779,0.059318,58435200,0.0,0.0,-0.026547,0,1,...,0.000000,0.000000,0.000000,0.000000,-0.000346,-0.000075,,,,0.028165
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-10-09,324.750000,330.299988,323.179993,329.820007,19891200,0.0,0.0,0.007823,0,0,...,318.491003,325.336335,324.845659,51.123502,-1.361400,-2.545405,322.711002,338.584228,306.837775,0.019647
2023-10-10,330.959991,331.100006,327.670013,328.390015,20557100,0.0,0.0,-0.004336,1,0,...,320.116003,325.492669,324.709253,58.351646,-0.729112,-2.182147,322.542003,338.091020,306.992986,0.011862
2023-10-11,331.209991,332.820007,329.140015,332.420013,20063200,0.0,0.0,0.012272,0,0,...,322.079004,325.626336,324.645065,63.313404,0.096061,-1.726505,322.360004,337.340635,307.379372,0.011130
2023-10-12,330.570007,333.630005,328.720001,331.160004,19313100,0.0,0.0,-0.003790,0,0,...,323.831003,325.705335,324.732104,65.005317,0.640955,-1.253013,321.983003,335.603516,308.362490,0.012118


In [15]:
# da fare per ogni stock
output_filepath = "processed_nomedellostock.xlsx"
len(merged_data)
merged_data.to_excel(output_filepath)
