In [1]:
import datetime as dt
from datetime import date
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import math
from tqdm import tqdm

# Data Input, EDA & Preprocessing

## Individual Data Input

In [2]:
# Price/volume series that every other source is later merged onto.
price_vol = pd.read_csv('Price & Volume BTC.csv')

# Funding-rate files, one per timestamp suffix (0000 / 0800 / 1600).
funding_rates_0000, funding_rates_0800, funding_rates_1600 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Funding Rates BTCUSDT 0000.csv',
        'Funding Rates BTCUSDT 0800.csv',
        'Funding Rates BTCUSDT 1600.csv',
    )
)

# Sentiment / search-interest sources.
tweets = pd.read_csv('Tweets.csv')
google = pd.read_csv('Google.csv')

# Address and transaction count sources.
active_addresses = pd.read_csv('Active Addresses.csv')
unique_addresses = pd.read_csv('Unique Addresses.csv')
transactions = pd.read_csv('Transactions.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Price & Volume BTC.csv'

## Data Compilation

In [None]:
# Collapse the tweet rows to one row per "Time" value by averaging each attribute.
tweets = tweets.groupby(by="Time").mean().reset_index()

# Join every daily source onto the price/volume frame, one after another,
# matching rows on the shared "Time" column (pd.merge defaults to an inner join).
df = price_vol
for daily_source in (
    funding_rates_0000,
    funding_rates_0800,
    funding_rates_1600,
    google,
    active_addresses,
    unique_addresses,
    transactions,
):
    df = pd.merge(df, daily_source, on=['Time'])

# The outer join keeps rows that have no matching tweet data; their tweet
# columns are left as NaN.
df = pd.merge(df, tweets, how="outer")
df.head()

In [None]:
# Show the dtype pandas assigned to every column of the merged frame.
print("Type of variables: ", "\n", df.dtypes)

# Preprocessing

## Data Type Transformation

In [None]:
def _percent_to_float(value):
    """Turn a percentage string such as '0.01%' into the float 0.01.

    Missing values become NaN. str() keeps the conversion working when the
    cell is re-run on values that are already numeric.
    """
    if pd.isnull(value):
        return float('nan')
    return float(str(value).replace('%', ''))


def _dashed_date_to_int(value):
    """Turn 'YYYY-MM-DD' / 'YYYY-MM' text into an integer key (20200304 / 202003).

    Missing values become NaN: the previous int('nan') raised ValueError,
    because an int cannot represent a missing value.
    """
    if pd.isnull(value):
        return float('nan')
    return int(value.replace("-", ""))


# The three funding-rate columns are read as text ("...%"); make them numeric.
for rate_col in ('Funding Rate 0000', 'Funding Rate 0800', 'Funding Rate 1600'):
    df[rate_col] = df[rate_col].map(_percent_to_float)

# Integer keys: Month from the first 7 characters of "Time", Time2 from the full string.
df['Month'] = df['Time'].str[:7].map(_dashed_date_to_int)
df['Time2'] = df['Time'].map(_dashed_date_to_int)

# Single funding-rate feature: the mean of the three per-timestamp columns.
df['FundingRate'] = (df['Funding Rate 1600'] + df['Funding Rate 0800'] + df['Funding Rate 0000'])/3
df = df.sort_values(by='Time', ascending=True).reset_index(drop=True)
df.head(10)


## Handle Missing Data

In [None]:
# Count the missing values in each column of the merged frame.
print(df.isna().sum())

### Monthly Sentiment Analysis

In [None]:
# Integer month key taken from the first 7 characters of "Time"
# (e.g. '2020-03-04' -> 202003). Missing values become NaN; the previous
# int('nan') raised ValueError.
month_key = tweets['Time'].str[:7].map(
    lambda text: float('nan') if pd.isnull(text) else int(text.replace("-", ""))
)

# Keep "Month" as the second column, but stay re-runnable: DataFrame.insert
# raises if the column already exists.
if 'Month' in tweets.columns:
    tweets['Month'] = month_key
else:
    tweets.insert(1, "Month", month_key)
tweets.head(10)

In [None]:
# Average every numeric tweet attribute per month. numeric_only=True skips the
# string "Time" column, which pandas >= 2.0 would otherwise refuse to average.
tweets = tweets.groupby(by="Month").mean(numeric_only=True).reset_index()

In [None]:
# Preview the first ten rows of the tweets frame.
tweets.head(10)

In [None]:
# Columns carried into the modelling frame, in their final order.
overall_columns = ['Time', 'Month', 'Time2', 'Compound',
                   'Close', 'Volume', 'FundingRate',
                   'Bitcoin_SVI', 'Cryptocurrency_SVI', 'No. of Active Addresses',
                   'No. of Unique Addresses', 'Transactions']

# Compound value of the first tweets row for each month, looked up through each
# row's Month key. Months absent from `tweets` yield NaN instead of raising.
compound_by_month = tweets.drop_duplicates('Month').set_index('Month')['Compound']
monthly_compound = df['Month'].map(compound_by_month)

# Rows whose "Favorites" is missing take the monthly Compound value;
# all other rows keep their own Compound.
has_tweet_data = df['Favorites'].notna()

# Built in one vectorised step: DataFrame.append was removed in pandas 2.0 and
# growing a frame row by row is quadratic.
overall_df = (
    df.assign(Compound=df['Compound'].where(has_tweet_data, monthly_compound))
      [overall_columns]
      .reset_index(drop=True)
)


In [None]:
# The month key was only needed for the sentiment lookup; remove it.
overall_df = overall_df.drop(columns='Month')
overall_df.head(10)

In [None]:
from datetime import datetime

# Parse the 'YYYY-MM-DD' strings in one vectorised call instead of a per-row
# strptime loop. Re-running the cell on an already-parsed column no longer fails.
overall_df['Time'] = pd.to_datetime(overall_df['Time'], format='%Y-%m-%d')


In [None]:
# Remove the integer date key "Time2" from the modelling frame.
overall_df = overall_df.drop(columns=['Time2'])
overall_df.head(10)

In [None]:
# Proportion of the dataset used as the test set and as the cross-validation
# set, respectively.
test_size = cv_size = 0.2

# For a feature at day t, the lags t-1, t-2, ..., t-N are used as features.
N = 5


In [None]:
def get_mov_avg_std(df, col, N):
    """
    Given a dataframe, get mean and std dev at timestep t using values from t-1, t-2, ..., t-N.
    Inputs
        df         : dataframe. Can be of any length, including empty.
        col        : name of the column you want to calculate mean and std dev
        N          : get mean and std dev at timestep t using values from t-1, t-2, ..., t-N
    Outputs
        df_out     : copy of df with two extra columns, col + '_mean' and col + '_std'.
                     The first row of both is NaN (no history); the std is also NaN
                     wherever only one past value is available.
    """
    # min_periods=1 lets the earliest rows use however many past values exist.
    window = df[col].rolling(window=N, min_periods=1)

    # shift(1) moves every statistic one row down so that row t only sees rows
    # t-1 ... t-N. Unlike a manual concatenate, it also works on an empty frame.
    lagged_mean = window.mean().shift(1)
    lagged_std = window.std().shift(1)

    df_out = df.copy()
    # Assign positionally (plain arrays) so the result does not depend on index alignment.
    df_out[col + '_mean'] = lagged_mean.to_numpy()
    df_out[col + '_std'] = lagged_std.to_numpy()

    return df_out

In [None]:
# Create the figure and axes together and hand the axes to pandas. Previously
# plt.figure() opened an empty 20x10 figure while DataFrame.plot drew on a
# second, default-sized one.
fig, ax = plt.subplots(figsize=(20, 10))
overall_df.plot(x='Time', y='Close', style='b-', ax=ax)
ax.set_xlabel('Date')
ax.tick_params(axis='x', labelsize=6)
ax.set_ylabel('Price of BTC (USD)')
ax.set_title("Price of BTC From 4th March 2020 to 3rd March 2022")

In [None]:
# Rolling volatility: standard deviation of the row-to-row percentage change
# of "Close" over a 7-row window.
vol = pd.DataFrame({
    'Date': overall_df['Time'],
    'price': overall_df['Close'],
    "7d_vol": overall_df["Close"].pct_change().rolling(7).std(),
})

ax = vol.plot(x='Date', y='7d_vol', style='b-')
ax.set_xlabel('Date')
plt.xticks(fontsize=6)
ax.set_ylabel('Weekly Volatility of BTC')
ax.set_title("Weekly Volatility of BTC From 4th March 2020 to 3rd March 2022")
plt.savefig('Weekly_volatility.pdf')

In [None]:
# Count the missing values in each column of the modelling frame.
print(overall_df.isna().sum())
# Keep an independent copy of the frame as it is at this point.
overall_df1 = overall_df.copy()

In [None]:
# Every column except the first one gets lagged copies later on.
lag_cols = overall_df.columns[1:].tolist()
# Key used to join the lagged copies back onto the frame.
merging_keys = ['day']

In [None]:
# Display the list of columns that will be lagged.
lag_cols

In [None]:
# Running row counter 0, 1, 2, ... used as the merge key for the lag features.
overall_df['day'] = list(range(len(overall_df)))
overall_df

# Identifying Correlations 

In [None]:
# Correlate every column except the first and the last one.
cols = list(overall_df.columns)[1:-1]
corr = overall_df[cols].astype(float).corr()

# One figure only: the second plt.figure() call used to leave an empty figure
# in the output.
plt.figure(figsize=(20, 8))

# Boolean mask that hides the upper triangle (diagonal included), so each pair
# of columns is shown once.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr, cmap='RdYlGn', vmax=1.0, vmin=-1.0, mask=mask, linewidths=1.5, annot=True)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()


# Outlier Detection

In [None]:
# Columns inspected for outliers, one boxplot each.
(
    feature_Boxplot1,
    feature_Boxplot2,
    feature_Boxplot3,
    feature_Boxplot4,
) = (
    'FundingRate',
    'No. of Active Addresses',
    'No. of Unique Addresses',
    'Transactions',
)


In [None]:
# Horizontal boxplot of the column named by feature_Boxplot1.
ax1 = sns.boxplot(data=overall_df[feature_Boxplot1], orient="h", palette="Set2")


In [None]:
# Horizontal boxplot of the column named by feature_Boxplot2.
ax2 = sns.boxplot(data=overall_df[feature_Boxplot2], orient="h", palette="Set2")


In [None]:
# Horizontal boxplot of the column named by feature_Boxplot3.
ax3 = sns.boxplot(data=overall_df[feature_Boxplot3], orient="h", palette="Set2")


In [None]:
# Horizontal boxplot of the column named by feature_Boxplot4.
ax4 = sns.boxplot(data=overall_df[feature_Boxplot4], orient="h", palette="Set2")

# Winsorization

In [None]:
# FundingRate distribution before winsorization. The series is passed as x=
# explicitly: the meaning of seaborn's first positional argument differs
# between releases.
sns.boxplot(x=overall_df['FundingRate'])

In [None]:
# Summary statistics of FundingRate before winsorization.
overall_df['FundingRate'].describe()

In [None]:
# Winsorization bounds: the 3rd percentile (lower) and 88th percentile (upper)
# of FundingRate, fetched in a single quantile call.
lower_limit, upper_limit = overall_df['FundingRate'].quantile([0.03, 0.88])

print("Highest Allowed: ", upper_limit)
print("Lowest Allowed: ", lower_limit)

In [None]:
# Apply Trimming: keep only the rows whose FundingRate lies within
# [lower_limit, upper_limit]. The previous condition (>= upper OR <= lower) was
# inverted and kept exactly the rows that trimming is meant to remove.
new_df = overall_df[overall_df['FundingRate'].between(lower_limit, upper_limit)]

In [None]:
# FundingRate distribution of the trimmed frame. The series is passed as x=
# explicitly: the meaning of seaborn's first positional argument differs
# between releases.
sns.boxplot(x=new_df['FundingRate'])

In [None]:
# Apply Capping: values at or above the upper limit become the upper limit,
# values at or below the lower limit become the lower limit, and everything
# else is left untouched. The conditions are tested in this order.
funding_rate = overall_df['FundingRate']
overall_df['FundingRate'] = np.select(
    [
        (funding_rate >= upper_limit).to_numpy(),
        (funding_rate <= lower_limit).to_numpy(),
    ],
    [upper_limit, lower_limit],
    default=funding_rate,
)

# After Winsorization
overall_df['FundingRate'].describe()

In [None]:
# FundingRate distribution after capping. The series is passed as x=
# explicitly: the meaning of seaborn's first positional argument differs
# between releases.
sns.boxplot(x=overall_df['FundingRate'])

# Creating Lagging Features (Up to 5 Days)

In [None]:
from tqdm.notebook import tqdm_notebook

# Lags 1, 2, ..., N.
shift_range = list(range(1, N + 1))

for shift in tqdm_notebook(shift_range):
    # Copy the lag columns and move the "day" key forward by `shift`, so a row
    # for day d in the copy holds the values of day d and is matched to day
    # d + shift in the left join below.
    lagged = overall_df[merging_keys + lag_cols].copy()
    lagged['day'] = lagged['day'] + shift
    lagged = lagged.rename(
        columns={col: '{}_lag_{}'.format(col, shift) for col in lag_cols}
    )
    # The debugging print of the whole shifted frame on every iteration was
    # removed; it flooded the cell output.
    overall_df = pd.merge(overall_df, lagged, on=merging_keys, how='left')

# Skip the first N rows of the merged frame.
overall_df = overall_df[N:]

overall_df.head()

In [None]:
# Display the base columns that lag features were generated for.
lag_cols

In [None]:
# Add the 'Close_mean' and 'Close_std' columns computed by get_mov_avg_std
# with window N.
overall_df = get_mov_avg_std(overall_df, 'Close',N)
overall_df.columns

In [None]:
# Display the full column list of the modelling frame.
overall_df.columns

# Train-Test Split


In [None]:
# Chronological split proportions: 60% train, 20% validation, 20% test.
train_size, val_size, test_size = 0.6, 0.2, 0.2

# Number of previous days used as lag features.
N = 5

In [None]:
# Validation and test sizes are rounded down; train takes the remaining rows.
num_val = int(val_size * len(overall_df))
num_test = int(test_size * len(overall_df))
num_train = len(overall_df) - (num_val + num_test)
for label, count in (
    ("num_train = ", num_train),
    ("num_val = ", num_val),
    ("num_test = ", num_test),
):
    print(label + str(count))

# Split into train, cv, and test by row position: train first, then
# validation, then test. train_val is train and validation together.
train_end = num_train
val_end = num_train + num_val
train = overall_df[:train_end]
val = overall_df[train_end:val_end]
train_val = overall_df[:val_end]
test = overall_df[val_end:]
for label, part in (
    ("train.shape = ", train),
    ("cv.shape = ", val),
    ("train_cv.shape = ", train_val),
    ("test.shape = ", test),
):
    print(label + str(part.shape))

In [None]:
# Set the timestamps of every split aside before removing the column.
train_time, test_time = train['Time'], test['Time']
val_time, train_val_time = val['Time'], train_val['Time']

# Remove the timestamp column from every split.
train = train.drop('Time', axis=1)
test = test.drop('Time', axis=1)
val = val.drop('Time', axis=1)
train_val = train_val.drop('Time', axis=1)

# All remaining columns form the initial feature pool; note that the target
# column 'Close' is still part of it at this point.
feature_pool = train.columns

# Name of the target column.
output = 'Close'

### NaN Values

In [None]:
# Missing-value count per column, printed for each split in turn.
for split_frame in (train, test, val, train_val):
    print(split_frame.isna().sum())

In [None]:
# Fit the imputer on the training rows only and reuse it for validation and
# test. Fitting a separate imputer on each held-out split fills its gaps from
# that split's own rows, so the held-out data is not treated the same way as
# the training data.
KNN_miss_filling = KNNImputer(n_neighbors=5).fit(train)
train = pd.DataFrame(KNN_miss_filling.transform(train))
val = pd.DataFrame(KNN_miss_filling.transform(val))
test = pd.DataFrame(KNN_miss_filling.transform(test))

# train_val is a training set in its own right, so it gets its own imputer.
train_val = pd.DataFrame(KNNImputer(n_neighbors=5).fit(train_val).transform(train_val))


In [None]:
# The imputed frames come back with positional column labels; put the original
# column names back on every split.
for imputed_frame in (train, test, train_val, val):
    imputed_frame.columns = feature_pool

## Scale the train, dev and test set

In [None]:
# Columns that get rescaled: the base columns listed here plus every lag
# feature (lags 1..N of each column in lag_cols).
standardized_features = ['Compound','Volume', 'Close', 'Close_mean', 'Close_std', 'Transactions', 'Cryptocurrency_SVI', 'Bitcoin_SVI','No. of Active Addresses', 'No. of Unique Addresses']
standardized_features += [
    base_col + "_lag_" + str(lag)
    for base_col in lag_cols
    for lag in range(1, N + 1)
]

# Everything else is left unscaled. The list is built in train's column order:
# a set difference returned these names in an arbitrary order that could
# change between kernel runs.
scaled_lookup = set(standardized_features)
non_standardized_features = [col for col in train.columns if col not in scaled_lookup]
non_standardized_features



In [None]:
def _scale_split(frame, fitted_scaler):
    """Min-max scale the standardized_features of `frame` with `fitted_scaler`.

    Returns (scaled_part, unscaled_part, combined). `combined` holds the scaled
    columns first, followed by the untouched non_standardized_features. All
    three frames use a fresh 0..n-1 index.
    """
    # transform() returns a numpy array, so the column names are re-attached here.
    scaled_part = pd.DataFrame(
        fitted_scaler.transform(frame[standardized_features]),
        columns=standardized_features,
    )
    unscaled_part = frame[non_standardized_features].reset_index(drop=True)
    combined = pd.concat([scaled_part, unscaled_part], sort=False, axis=1)
    return scaled_part, unscaled_part, combined


# Get the scaler based on train set. It is fitted once; the previous
# fit_transform call refitted the same scaler on the same data.
scaler = preprocessing.MinMaxScaler().fit(train[standardized_features])
train_std, train_nstd, train_scaled = _scale_split(train, scaler)

# Scalers fitted on the other splits. `scaler_val` used to be written as
# `scaler.val`, which set an attribute on the train scaler instead of creating
# a variable.
scaler_val = preprocessing.MinMaxScaler().fit(val[standardized_features])
scaler_trainval = preprocessing.MinMaxScaler().fit(train_val[standardized_features])
scaler_test = preprocessing.MinMaxScaler().fit(test[standardized_features])

# NOTE(review): as before, every split is transformed with the train-fitted
# `scaler`; the three scalers above are fitted but not applied in this cell.
# Confirm whether train_val (and the test set evaluated against it) should use
# scaler_trainval instead.
val_std, val_nstd, val_scaled = _scale_split(val, scaler)
train_val_std, train_val_nstd, train_val_scaled = _scale_split(train_val, scaler)
test_std, test_nstd, test_scaled = _scale_split(test, scaler)


In [None]:
# Rebuild the feature pool from train's columns, leaving out the first ten.
feature_pool = train.columns[10:].tolist()

feature_pool

In [None]:
# Display the name of the target column.
output

In [None]:
# Write every (unscaled) split to its own CSV file, without the index column.
for split_frame, csv_name in (
    (train, 'train.csv'),
    (test, 'test.csv'),
    (train_val, 'train_val.csv'),
    (val, 'val.csv'),
):
    split_frame.to_csv(csv_name, index = False)

In [None]:
# Remove the two moving-statistic columns that get_mov_avg_std appended last.
# Filtering by name keeps the cell re-runnable: the previous feature_pool[:-2]
# slice removed two more features on every execution.
feature_pool = [col for col in feature_pool if col not in ('Close_mean', 'Close_std')]

# X and Y Datasets

In [None]:
# Feature matrix and target vector for every (unscaled) split.
X_train, y_train = train[feature_pool], train[output]
X_val, y_val = val[feature_pool], val[output]
X_train_val, y_train_val = train_val[feature_pool], train_val[output]
X_test, y_test = test[feature_pool], test[output]

# Report the shape of each piece.
for label, part in (
    ("X_train.shape = ", X_train),
    ("y_train.shape = ", y_train),
    ("X_val.shape = ", X_val),
    ("y_val.shape = ", y_val),
    ("X_train_val.shape = ", X_train_val),
    ("y_train_val.shape = ", y_train_val),
    ("X_sample.shape = ", X_test),
    ("y_sample.shape = ", y_test),
):
    print(label + str(part.shape))

In [None]:
# Feature matrix and target vector for every scaled split; the target is the
# scaled 'Close' column.
X_train_scaled, y_train_scaled = train_scaled[feature_pool], train_scaled['Close']
X_val_scaled, y_val_scaled = val_scaled[feature_pool], val_scaled['Close']
X_train_val_scaled, y_train_val_scaled = train_val_scaled[feature_pool], train_val_scaled['Close']
X_test_scaled, y_test_scaled = test_scaled[feature_pool], test_scaled['Close']

# Report the shape of each piece.
for label, part in (
    ("X_train_scaled.shape = ", X_train_scaled),
    ("y_train_scaled.shape = ", y_train_scaled),
    ("X_val_scaled.shape = ", X_val_scaled),
    ("y_val_scaled.shape = ", y_val_scaled),
    ("X_train_val_scaled.shape = ", X_train_val_scaled),
    ("y_train_val_scaled.shape = ", y_train_val_scaled),
    ("X_test_scaled.shape = ", X_test_scaled),
    ("y_test_scaled.shape = ", y_test_scaled),
):
    print(label + str(part.shape))

In [None]:
# Write every feature matrix / target vector to its own CSV file (no index
# column): the unscaled pieces first, then the scaled ones.
for piece, csv_name in (
    (X_train, 'X_train.csv'),
    (y_train, 'y_train.csv'),
    (X_val, 'X_val.csv'),
    (y_val, 'y_val.csv'),
    (X_train_val, 'X_train_val.csv'),
    (y_train_val, 'y_train_val.csv'),
    (X_test, 'X_test.csv'),
    (y_test, 'y_test.csv'),
    (X_train_scaled, 'X_train_scaled.csv'),
    (y_train_scaled, 'y_train_scaled.csv'),
    (X_val_scaled, 'X_val_scaled.csv'),
    (y_val_scaled, 'y_val_scaled.csv'),
    (X_train_val_scaled, 'X_train_val_scaled.csv'),
    (y_train_val_scaled, 'y_train_val_scaled.csv'),
    (X_test_scaled, 'X_test_scaled.csv'),
    (y_test_scaled, 'y_test_scaled.csv'),
):
    piece.to_csv(csv_name, index = False)






# Prediction Data Set

In [None]:
# One-row frame holding the column-wise mean of five rows of the scaled test
# features.
# NOTE(review): the slice -6:-1 covers the sixth-last through second-last rows
# and leaves out the final row; confirm whether the last five rows (-5:) were
# intended.
predict_feature = X_test_scaled.iloc[-6:-1].mean().to_frame().T
predict_feature

In [None]:

# Write the single-row prediction input to CSV without the index column.
predict_feature.to_csv("predict_feature.csv", index = False)