# Baseline Model: Linear Regression

### Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import warnings
from functools import reduce

from random import gauss as gs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from statsmodels.stats.outliers_influence import variance_inflation_factor



import itertools
#from pmdarima import auto_arima

#statsmodels
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX

%matplotlib inline
warnings.filterwarnings("ignore")

### Feature Descriptions
|Feature Name|Description|
|------------|-----------|
|active_addresses|The number of unique addresses that were active in the network either as a sender or receiver. Only addresses that were active in successful transactions are counted.|
|circulating_supply|The total amount of all coins ever created/issued, i.e. the circulating supply.|
|exchange_balance|The total amount of coins held on exchange addresses. Note that exchange metrics are based on our labeled data of exchange addresses that we constantly keep updating, as well as data science techniques and statistical information that changes over time. Therefore these metrics are mutable – the data is stable, but especially most recent data points are subject to slight fluctuations as time progresses.|
|exchange_deposits|The total count of transfers to exchange addresses, i.e. the number of on-chain deposits to exchanges. Note that exchange metrics are based on our labeled data of exchange addresses that we constantly keep updating, as well as data science techniques and statistical information that changes over time. Therefore these metrics are mutable – the data is stable, but especially most recent data points are subject to slight fluctuations as time progresses.|
|exchange_withdrawals|The total count of transfers from exchange addresses, i.e. the number of on-chain withdrawals from exchanges. Note that exchange metrics are based on our labeled data of exchange addresses that we constantly keep updating, as well as data science techniques and statistical information that changes over time. Therefore these metrics are mutable – the data is stable, but especially most recent data points are subject to slight fluctuations as time progresses.|
|market_cap|The market capitalization (or network value) is defined as the product of the current supply by the current USD price.|
|price|The asset's closing price in USD.|
|sopr|The Spent Output Profit Ratio (SOPR) is computed by dividing the realized value (in USD) divided by the value at creation (USD) of a spent output. Or simply: price sold / price paid. This metric was created by Renato Shirakashi. For a detailed commentary see this post|
|transaction_count|The total amount of transactions. Only successful transactions are counted.|
|transfer_count|The total amount of transfers. One transaction can trigger one or more transfers. Only successful, non-zero transfers are counted.|

In [2]:
df = pd.read_excel('data/ethereum_data.xlsx')

In [3]:
df.set_index('date', inplace = True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1827 entries, 2015-08-08 to 2020-08-11
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   active_addresses      1827 non-null   int64  
 1   circulating_supply    1827 non-null   float64
 2   exchange_balance      1827 non-null   float64
 3   exchange_deposits     1827 non-null   int64  
 4   exchange_withdrawals  1827 non-null   int64  
 5   market_cap            1827 non-null   float64
 6   price                 1827 non-null   float64
 7   sopr                  1827 non-null   float64
 8   transaction_count     1827 non-null   int64  
 9   transfer_count        1827 non-null   int64  
dtypes: float64(5), int64(5)
memory usage: 157.0 KB


In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler ()

scaled_df = scaler.fit_transform(df)
print(scaled_df)

[[9.67071902e-05 0.00000000e+00 0.00000000e+00 ... 4.88617387e-02
  7.73758201e-04 2.73712687e-04]
 [0.00000000e+00 6.82289474e-04 2.07980583e-02 ... 2.13471515e-02
  0.00000000e+00 0.00000000e+00]
 [3.72813226e-04 1.38062211e-03 5.39909678e-02 ... 0.00000000e+00
  5.36082474e-04 2.48910243e-04]
 ...
 [5.45313626e-01 9.99316549e-01 9.54212211e-01 ... 3.40106327e-01
  8.81382568e-01 4.15623414e-01]
 [5.72517779e-01 9.99658065e-01 9.47039546e-01 ... 3.43872741e-01
  9.25728210e-01 4.47526443e-01]
 [5.63989326e-01 1.00000000e+00 9.46644068e-01 ... 3.37968813e-01
  9.02184817e-01 4.38561246e-01]]


In [8]:
type(scaled_df)

numpy.ndarray

# Model Building

### Baseline Model

In [7]:
# Drop the new 'age' column and our target column 'price' for the independent features
X = scaled_df.drop('price', axis = 1)

# Set our dependent variable as price
y = scaled_df.price
  
# Split up our independent and dependent variables into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_validate

In [None]:
# Trains our model on our baseline values
lm_dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train)

# predicting test prices based on the X_test
y_predict_dummy_mean = lm_dummy_mean.predict(X_test)

# Find the error of our dummy model
rmse_lr0 = mean_squared_error(y_test, y_predict_dummy_mean, squared=False)
print(r2_score(y_test, y_predict_dummy_mean), rmse_lr0)

In [None]:
# Cross validate our model and find the mean of the training scores
scores_simple_1 = cross_validate(
                    lr, X_train, y_train, cv=5, 
                    return_train_score=True
)
print(scores_simple_1)

In [None]:
simple_1_mean = np.mean(scores_simple_1['train_score'])
print(simple_1_mean)
simple_1_mean_test = np.mean(scores_simple_1['test_score'])

In [None]:
simple_1_mean

In [None]:
# Checking the QQ Plot to understand the distribution of residuals
residuals1 = (y_test - pred_lr1)
sm.graphics.qqplot(residuals1, dist=stats.norm, line="45", fit=True);

In [None]:
# Fits the model to our training dataset
lr.fit(X_train, y_train)

### Model Building - Basic Model (Linear Regression)

In [None]:
# Initialize an empty regression model
lr = LinearRegression()

# Fits the model to our training dataset
lr.fit(X_train, y_train)

# Predict price with the trained model
pred_lr2 = lr.predict(X_test)

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaled_df = scaler.fit_transform(df)