In [46]:
import pandas as pd
import glob
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression
import statsmodels.api as sm

In [2]:
csv_files = glob.glob("./*.csv")

In [3]:
csv_files

['.\\AUD_USD_H1.csv',
 '.\\EUR_GBP_H1.csv',
 '.\\EUR_USD_H1.csv',
 '.\\GBP_USD_H1.csv',
 '.\\USD_CAD_H1.csv',
 '.\\USD_CHF_H1.csv',
 '.\\USD_JPY_H1.csv']

In [4]:
dataframes = {} # Crear dict vacío
keys_list = []  # Crear lista vacía

for file in csv_files:
    key = file.split(".\\")[1]
    key = key.replace("_H1.csv", "")  # Eliminar ".csv" del final de la clave
    dataframes[key] = pd.read_csv(file)
    keys_list.append(key)  # Agregar key a la lista

print(keys_list)

['AUD_USD', 'EUR_GBP', 'EUR_USD', 'GBP_USD', 'USD_CAD', 'USD_CHF', 'USD_JPY']


In [5]:
for key in dataframes:
    print(key, dataframes[key].shape)

AUD_USD (50000, 9)
EUR_GBP (50000, 9)
EUR_USD (50000, 9)
GBP_USD (50000, 9)
USD_CAD (50000, 9)
USD_CHF (50000, 9)
USD_JPY (50000, 9)


In [6]:
dataframes

{'AUD_USD':                    time  open_bid  high_bid  low_bid  close_bid  open_ask  \
 0      2015-01-27 08 PM   0.79375   0.79524  0.79347    0.79383   0.79393   
 1      2015-01-27 09 PM   0.79382   0.79382  0.79265    0.79344   0.79405   
 2      2015-01-27 10 PM   0.79338   0.79364  0.79243    0.79250   0.79385   
 3      2015-01-27 11 PM   0.79256   0.79304  0.79184    0.79197   0.79290   
 4            2015-01-28   0.79197   0.80024  0.78991    0.79930   0.79216   
 ...                 ...       ...       ...      ...        ...       ...   
 49995  2023-02-09 09 PM   0.69321   0.69356  0.69317    0.69354   0.69334   
 49996  2023-02-09 10 PM   0.69339   0.69377  0.69299    0.69348   0.69385   
 49997  2023-02-09 11 PM   0.69346   0.69423  0.69339    0.69378   0.69393   
 49998        2023-02-10   0.69379   0.69423  0.69299    0.69419   0.69394   
 49999  2023-02-10 01 AM   0.69419   0.69452  0.69388    0.69445   0.69432   
 
        high_ask  low_ask  close_ask  
 0       0.7

In [7]:
concatenated_df = pd.concat(dataframes.values(), axis=1)

In [8]:
opens = concatenated_df['open_bid']
opens.columns = keys_list

In [9]:
opens

Unnamed: 0,AUD_USD,EUR_GBP,EUR_USD,GBP_USD,USD_CAD,USD_CHF,USD_JPY
0,0.79375,0.74758,1.13343,1.51644,1.25236,0.90455,117.668
1,0.79382,0.74725,1.13335,1.51671,1.25267,0.90435,117.679
2,0.79338,0.74764,1.13399,1.51812,1.25281,0.90724,117.447
3,0.79256,0.74743,1.13358,1.51600,1.25375,0.90299,117.507
4,0.79197,0.74852,1.13544,1.51859,1.25192,0.90129,117.469
...,...,...,...,...,...,...,...
49995,0.69321,0.88578,1.07328,1.21158,1.34602,0.92268,131.613
49996,0.69339,0.88509,1.07345,1.21147,1.34470,0.92169,131.549
49997,0.69346,0.88562,1.07392,1.21173,1.34482,0.92184,131.420
49998,0.69379,0.88628,1.07385,1.21145,1.34477,0.92205,131.509


In [10]:
returns = np.log(opens/opens.shift(1))

# Descartar la primera fila (que contiene valores NaN debido al desplazamiento)
returns = returns.iloc[1:]

# Mostrar los retornos
returns

Unnamed: 0,AUD_USD,EUR_GBP,EUR_USD,GBP_USD,USD_CAD,USD_CHF,USD_JPY
1,0.000088,-0.000442,-0.000071,0.000178,0.000248,-0.000221,0.000093
2,-0.000554,0.000522,0.000565,0.000929,0.000112,0.003191,-0.001973
3,-0.001034,-0.000281,-0.000362,-0.001397,0.000750,-0.004696,0.000511
4,-0.000745,0.001457,0.001639,0.001707,-0.001461,-0.001884,-0.000323
5,0.009138,0.000401,0.001294,0.000375,0.000926,-0.001699,0.000987
...,...,...,...,...,...,...,...
49995,-0.000980,-0.000248,-0.000373,-0.000140,0.000713,0.000434,0.000350
49996,0.000260,-0.000779,0.000158,-0.000091,-0.000981,-0.001074,-0.000486
49997,0.000101,0.000599,0.000438,0.000215,0.000089,0.000163,-0.000981
49998,0.000476,0.000745,-0.000065,-0.000231,-0.000037,0.000228,0.000677


### Escalamos los datos

In [11]:
# Crear objeto Scaler
scaler = StandardScaler()

# Escalar los datos
scaled_returns = scaler.fit_transform(returns)

# Crear un nuevo DataFrame a partir de los datos escalados
scaled_df = pd.DataFrame(scaled_returns, columns=returns.columns, index=returns.index)

# Mostrar el DataFrame escalado
scaled_df

Unnamed: 0,AUD_USD,EUR_GBP,EUR_USD,GBP_USD,USD_CAD,USD_CHF,USD_JPY
1,0.063948,-0.399456,-0.064449,0.140980,0.241184,-0.207460,0.081051
2,-0.388305,0.465416,0.524544,0.721189,0.108136,2.987716,-1.754864
3,-0.725866,-0.255267,-0.334346,-1.075911,0.733723,-4.397908,0.451681
4,-0.522206,1.305321,1.521405,1.321940,-1.433044,-1.765180,-0.289275
5,6.432706,0.356719,1.200852,0.293332,0.906337,-1.591547,0.874727
...,...,...,...,...,...,...,...
49995,-0.688127,-0.226011,-0.344546,-0.104902,0.697886,0.405729,0.308524
49996,0.184603,-0.702701,0.147886,-0.066662,-0.963040,-1.005771,-0.434020
49997,0.072930,0.534416,0.406960,0.169219,0.086064,0.152038,-0.873448
49998,0.336710,0.665797,-0.059440,-0.175034,-0.037838,0.212958,0.599353


In [24]:
y = scaled_df.iloc[:, 0:1]
y

Unnamed: 0,AUD_USD
1,0.063948
2,-0.388305
3,-0.725866
4,-0.522206
5,6.432706
...,...
49995,-0.688127
49996,0.184603
49997,0.072930
49998,0.336710


In [25]:
X = scaled_df.iloc[:, 1:]
X

Unnamed: 0,EUR_GBP,EUR_USD,GBP_USD,USD_CAD,USD_CHF,USD_JPY
1,-0.399456,-0.064449,0.140980,0.241184,-0.207460,0.081051
2,0.465416,0.524544,0.721189,0.108136,2.987716,-1.754864
3,-0.255267,-0.334346,-1.075911,0.733723,-4.397908,0.451681
4,1.305321,1.521405,1.321940,-1.433044,-1.765180,-0.289275
5,0.356719,1.200852,0.293332,0.906337,-1.591547,0.874727
...,...,...,...,...,...,...
49995,-0.226011,-0.344546,-0.104902,0.697886,0.405729,0.308524
49996,-0.702701,0.147886,-0.066662,-0.963040,-1.005771,-0.434020
49997,0.534416,0.406960,0.169219,0.086064,0.152038,-0.873448
49998,0.665797,-0.059440,-0.175034,-0.037838,0.212958,0.599353


In [39]:
# Crear objeto LinearRegression
reg = LinearRegression()

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo con los datos de entrenamiento
reg.fit(X_train, y_train)

# Evaluar el modelo en los datos de prueba
y_pred = reg.predict(X_test)

# Calcular el error cuadrático medio
mse = mean_squared_error(y_test, y_pred)

print('Error cuadrático medio:', mse, "\n")

# Mostrar los coeficientes del modelo
print("Los coeficientes son:",reg.coef_, "\n")
print("el intercepto es:",reg.intercept_)


Error cuadrático medio: 0.9820157151618112 

Los coeficientes son: [[-0.0338215   0.08624399  0.1052874  -0.0729262  -0.04969446 -0.06087178]] 

el intercepto es: [0.0017098]


In [49]:
# Obtener los coeficientes y los p-values
coef = reg.coef_
pvals = f_regression(X_train, y_train)[1]

# Imprimir los coeficientes y los p-values
for i in range(len(coef)):
    print('Variable: {} Coeficiente: {}   p-value: {}'.format(X.columns[i], coef[i], pvals[i]))

Variable: const Coeficiente: [-0.0338215   0.08624399  0.1052874  -0.0729262  -0.04969446 -0.06087178]   p-value: 4.519661194346833e-07


  y = column_or_1d(y, warn=True)


In [47]:
X = sm.add_constant(X)
    
model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                AUD_USD   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                     359.6
Date:                Wed, 08 Mar 2023   Prob (F-statistic):               0.00
Time:                        21:36:21   Log-Likelihood:                -69889.
No. Observations:               49999   AIC:                         1.398e+05
Df Residuals:                   49992   BIC:                         1.399e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       2.575e-19      0.004   5.88e-17      1.0

##### El modelo de regresión lineal ordinaria (OLS) utilizado tiene una R-cuadrada ajustada de 0.041, lo que indica que el modelo explica solo el 4.1% de la variación en la variable dependiente AUD_USD. Los coeficientes de las variables predictoras EUR_GBP, EUR_USD, GBP_USD, USD_CAD, USD_CHF y USD_JPY son significativamente diferentes de cero con valores de p menores que 0.05. Los coeficientes positivos de EUR_USD y GBP_USD indican que un aumento en estas variables se relaciona con un aumento en la variable dependiente AUD_USD, mientras que los coeficientes negativos de EUR_GBP, USD_CAD, USD_CHF y USD_JPY indican que un aumento en estas variables se relaciona con una disminución en AUD_USD. El modelo es significativo según la prueba F con un valor de p cercano a cero. La prueba Omnibus muestra que hay una desviación significativa de la normalidad en los residuos del modelo, pero la prueba Durbin-Watson indica que no hay autocorrelación en los residuos.