In [111]:
import datetime as dt
from datetime import date
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn import preprocessing

# Data Input, EDA & Preprocessing

## Individual Data Input

In [112]:
price_vol = pd.read_csv('../data/Price & Volume BTC.csv')
tweets = pd.read_csv('../data/Tweets.csv')
funding_rates_1600 = pd.read_csv('../data/Funding Rates BTCUSDT 1600.csv')
funding_rates_0800 = pd.read_csv('../data/Funding Rates BTCUSDT 0800.csv')
funding_rates_0000 = pd.read_csv('../data/Funding Rates BTCUSDT 0000.csv')
google = pd.read_csv('../data/Google.csv')
transactions = pd.read_csv('../data/Transactions.csv')
unique_addresses = pd.read_csv('../data/Unique Addresses.csv')
active_addresses = pd.read_csv('../data/Active Addresses.csv')

## Data Compilation

In [113]:
# get daily ave value for twitter attributes
tweets = tweets.groupby(by="Time").mean()
tweets = tweets.reset_index()

df = pd.merge(price_vol, funding_rates_0000, on=['Time'])
df = pd.merge(df, funding_rates_0800, on=['Time'])
df = pd.merge(df, funding_rates_1600, on=['Time'])
df = pd.merge(df, google, on=['Time'])
df = pd.merge(df, active_addresses, on=['Time'])
df = pd.merge(df, unique_addresses, on=['Time'])
df = pd.merge(df, transactions, on=['Time'])
df = pd.merge(df, tweets, how="outer")
df.head()

Unnamed: 0,Time,Open,Close,High,Low,Volume,Funding Rate 0000,Funding Rate 0800,Funding Rate 1600,Bitcoin_SVI,Cryptocurrency_SVI,No. of Active Addresses,No. of Unique Addresses,Transactions,Favorites,Retweets,Compound,Negative,Positive,Neutral
0,2022-03-03,43896.8,42456.3,44088.0,41832.0,324545814.8,0.01%,0.01%,0.01%,13.84,24.08,1011448,701198.0,268244.0,443.269231,79.576923,0.109327,0.036269,0.063731,0.9
1,2022-03-02,44428.2,43896.9,45367.5,43332.8,340460526.5,0.00%,0.00%,0.01%,15.61,26.33,1091796,731603.0,288657.0,534.2,90.366667,0.068437,0.0371,0.068967,0.893967
2,2022-03-01,43174.8,44428.2,44966.0,42796.9,444858241.2,-0.01%,0.01%,0.01%,17.67,29.34,1065354,722863.0,280634.0,353.514286,61.028571,0.006329,0.062714,0.0702,0.867114
3,2022-02-28,37706.7,43179.8,44210.4,37446.5,564121052.5,0.00%,-0.02%,-0.01%,16.2,24.83,981066,702483.0,270918.0,332.333333,61.861111,0.135383,0.033806,0.073306,0.892861
4,2022-02-27,39117.3,37706.7,39864.0,36977.4,388368905.7,0.01%,0.01%,0.00%,12.96,21.07,768705,583361.0,211164.0,128.0,30.0,0.106867,0.022667,0.055,0.922333


In [114]:
print("Type of variables: ", "\n", df.dtypes)

Type of variables:  
 Time                        object
Open                       float64
Close                      float64
High                       float64
Low                        float64
Volume                     float64
Funding Rate 0000           object
Funding Rate 0800           object
Funding Rate 1600           object
Bitcoin_SVI                float64
Cryptocurrency_SVI         float64
No. of Active Addresses      int64
No. of Unique Addresses    float64
Transactions               float64
Favorites                  float64
Retweets                   float64
Compound                   float64
Negative                   float64
Positive                   float64
Neutral                    float64
dtype: object


# EDA & Preprocessing

In [115]:
# Select features, the target output is'Close'
feature_pool = ['Time', 'Favorites','Retweets','Compound','Negative','Positive',\
                'Neutral','Open', 'High','Low',\
                'Volume','Funding Rate 0000','Funding Rate 0800','Funding Rate 1600','Bitcoin_SVI',\
                'Cryptocurrency_SVI','No. of Active Addresses','No. of Unique Addresses','Transactions']

output = 'Close'

## Data Type Transformation

In [116]:
# Transform object type of "Funding Rate 0000", "Funding Rate 0800" and " Funding Rate 1600" to numerical type
df['Funding Rate 0000'] = df['Funding Rate 0000'].map(lambda x: float('nan') if pd.isnull(x) else float(x.replace('%','')))
df['Funding Rate 0800'] = df['Funding Rate 0800'].map(lambda x: float('nan') if pd.isnull(x) else float(x.replace('%','')))
df['Funding Rate 1600'] = df['Funding Rate 1600'].map(lambda x: float('nan') if pd.isnull(x) else float(x.replace('%','')))

df['Month'] = df['Time'].str[:7]
df['Time'] = df['Time'].map(lambda x: int('nan') if pd.isnull(x) else int(x.replace("-", "")))
df['Month'] = df['Month'].map(lambda x: int('nan') if pd.isnull(x) else int(x.replace("-", "")))

df.head(10)


Unnamed: 0,Time,Open,Close,High,Low,Volume,Funding Rate 0000,Funding Rate 0800,Funding Rate 1600,Bitcoin_SVI,...,No. of Active Addresses,No. of Unique Addresses,Transactions,Favorites,Retweets,Compound,Negative,Positive,Neutral,Month
0,20220303,43896.8,42456.3,44088.0,41832.0,324545814.8,0.01,0.01,0.01,13.84,...,1011448,701198.0,268244.0,443.269231,79.576923,0.109327,0.036269,0.063731,0.9,202203
1,20220302,44428.2,43896.9,45367.5,43332.8,340460526.5,0.0,0.0,0.01,15.61,...,1091796,731603.0,288657.0,534.2,90.366667,0.068437,0.0371,0.068967,0.893967,202203
2,20220301,43174.8,44428.2,44966.0,42796.9,444858241.2,-0.01,0.01,0.01,17.67,...,1065354,722863.0,280634.0,353.514286,61.028571,0.006329,0.062714,0.0702,0.867114,202203
3,20220228,37706.7,43179.8,44210.4,37446.5,564121052.5,0.0,-0.02,-0.01,16.2,...,981066,702483.0,270918.0,332.333333,61.861111,0.135383,0.033806,0.073306,0.892861,202202
4,20220227,39117.3,37706.7,39864.0,36977.4,388368905.7,0.01,0.01,0.0,12.96,...,768705,583361.0,211164.0,128.0,30.0,0.106867,0.022667,0.055,0.922333,202202
5,20220226,39219.6,39119.1,40300.0,38560.3,306410979.9,0.0,0.01,0.01,14.37,...,818914,582374.0,220423.0,863.8,178.6,0.13104,0.07,0.085,0.845,202202
6,20220225,38320.9,39219.6,39695.0,38015.0,386047125.7,-0.01,0.01,0.01,18.61,...,936543,687447.0,260340.0,453.1,83.85,0.1368,0.04195,0.0794,0.87865,202202
7,20220224,37248.2,38320.9,39823.3,34325.0,680985023.6,0.0,0.01,-0.02,27.43,...,958866,676575.0,254139.0,69.958333,22.375,-0.021021,0.074313,0.069271,0.856375,202202
8,20220223,38229.6,37248.1,39285.2,37046.8,331098529.4,0.0,0.0,-0.01,14.37,...,859810,622682.0,251275.0,59.8,19.485714,0.033834,0.051171,0.053629,0.8952,202202
9,20220222,37020.5,38229.6,38448.4,36333.1,324319840.5,0.0,-0.01,0.0,16.98,...,886571,647191.0,260265.0,193.058824,36.411765,0.185906,0.045647,0.086588,0.867706,202202


## Train-Test Split

In [117]:
X, y = df.drop(output, axis=1), df[output]

# Split the data: train set: test set using = 70/30. 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
print(y_train.sum()/y_train.count(),y_test.sum()/y_test.count())

31807.179843444228 32393.359360730592


## Handle Missing Data

In [118]:
# Count missing value
print(df.isna().sum())

# check the consistency of columns with missing value between train set and the whole set
print(df[feature_pool].isnull().any().tolist()==X_train.isnull().any().tolist())
print(df[output].isnull().any().tolist()==y_train.isnull().any().tolist())

Time                         0
Open                         0
Close                        0
High                         0
Low                          0
Volume                       0
Funding Rate 0000            0
Funding Rate 0800            0
Funding Rate 1600            0
Bitcoin_SVI                  0
Cryptocurrency_SVI           0
No. of Active Addresses      0
No. of Unique Addresses      2
Transactions                 0
Favorites                  458
Retweets                   458
Compound                   458
Negative                   458
Positive                   458
Neutral                    458
Month                        0
dtype: int64
False
True


### Monthly Sentiment Analysis

In [119]:
tweets.insert(1, "Month", "NaN")

tweets['Month'] = tweets['Time'].str[:7]
tweets['Month'] = tweets['Month'].map(lambda x: int('nan') if pd.isnull(x) else int(x.replace("-", "")))
tweets.head(10)

Unnamed: 0,Time,Month,Favorites,Retweets,Compound,Negative,Positive,Neutral
0,2020-03-12,202003,5.0,2.0,0.0772,0.091,0.1,0.809
1,2020-03-17,202003,98.0,96.0,0.4019,0.0,0.114,0.886
2,2020-03-20,202003,125.0,95.0,0.4767,0.0,0.146,0.854
3,2020-03-30,202003,3.0,0.0,0.0803,0.061,0.071,0.869
4,2020-03-31,202003,186.0,142.0,0.6597,0.0,0.278,0.722
5,2020-04-01,202004,171.0,120.0,0.5267,0.061,0.184,0.754
6,2020-04-16,202004,419.0,94.0,0.0,0.0,0.0,1.0
7,2020-04-22,202004,73.0,18.0,0.3182,0.0,0.084,0.916
8,2020-04-30,202004,994.0,153.0,0.0,0.0,0.0,1.0
9,2020-05-09,202005,35.0,2.0,0.0,0.0,0.0,1.0


In [120]:
tweets = tweets.groupby(by="Month").mean()
tweets = tweets.reset_index()

In [121]:
df_X_train = pd.DataFrame(columns = ['Time', 'Month', 'Favorites', 'Retweets', 'Compound', \
                                 'Negative', 'Positive', 'Neutral', 'Open', 'High', \
                                 'Low', 'Volume', 'Funding Rate 0000', 'Funding Rate 0800', \
                                 'Funding Rate 1600', 'Bitcoin_SVI', 'Cryptocurrency_SVI',  'No. of Active Addresses', \
                                 'No. of Unique Addresses', 'Transactions'])

for i, row in X_train.iterrows():
    if (pd.isna(X_train.loc[i].at["Favorites"])):
        data = tweets[tweets['Month'] == X_train.loc[i].at["Month"]]
        new_row = {'Time': X_train.loc[i].at["Time"], 'Month': X_train.loc[i].at["Month"], 'Favorites': data.Favorites.values[0], 'Retweets': data.Retweets.values[0], 'Compound': data.Compound.values[0], \
                   'Negative': data.Negative.values[0], 'Positive': data.Positive.values[0], 'Neutral': data.Neutral.values[0], 'Open': X_train.loc[i].at["Open"], \
                   'High': X_train.loc[i].at["High"], 'Low': X_train.loc[i].at["Low"], 'Volume': X_train.loc[i].at["Volume"], 'Funding Rate 0000': X_train.loc[i].at["Funding Rate 0000"], \
                   'Funding Rate 0800': X_train.loc[i].at["Funding Rate 0800"], 'Funding Rate 1600': X_train.loc[i].at["Funding Rate 1600"], 'Bitcoin_SVI': X_train.loc[i].at["Bitcoin_SVI"], \
                   'Cryptocurrency_SVI': X_train.loc[i].at["Cryptocurrency_SVI"], 'No. of Active Addresses': X_train.loc[i].at["No. of Active Addresses"], 'No. of Unique Addresses': X_train.loc[i].at["No. of Unique Addresses"], \
                   'Transactions': X_train.loc[i].at["Transactions"]}
        df_X_train = df_X_train.append(new_row, ignore_index=True)
    else:
        new_row = {'Time': X_train.loc[i].at["Time"], 'Month': X_train.loc[i].at["Month"], 'Favorites': X_train.loc[i].at["Favorites"], 'Retweets': X_train.loc[i].at["Retweets"], 'Compound': X_train.loc[i].at["Compound"], \
                   'Negative': X_train.loc[i].at["Negative"], 'Positive': X_train.loc[i].at["Positive"], 'Neutral': X_train.loc[i].at["Neutral"], 'Open': X_train.loc[i].at["Open"],\
                   'High': X_train.loc[i].at["High"], 'Low': X_train.loc[i].at["Low"], 'Volume': X_train.loc[i].at["Volume"], 'Funding Rate 0000': X_train.loc[i].at["Funding Rate 0000"], \
                   'Funding Rate 0800': X_train.loc[i].at["Funding Rate 0800"], 'Funding Rate 1600': X_train.loc[i].at["Funding Rate 1600"], 'Bitcoin_SVI': X_train.loc[i].at["Bitcoin_SVI"], \
                   'Cryptocurrency_SVI': X_train.loc[i].at["Cryptocurrency_SVI"], 'No. of Active Addresses': X_train.loc[i].at["No. of Active Addresses"], 'No. of Unique Addresses': X_train.loc[i].at["No. of Unique Addresses"], \
                   'Transactions': X_train.loc[i].at["Transactions"]}
        df_X_train = df_X_train.append(new_row, ignore_index=True)


In [122]:
df_X_test = pd.DataFrame(columns = ['Time', 'Month', 'Favorites', 'Retweets', 'Compound', \
                                 'Negative', 'Positive', 'Neutral', 'Open', 'High', \
                                 'Low', 'Volume', 'Funding Rate 0000', 'Funding Rate 0800', \
                                 'Funding Rate 1600', 'Bitcoin_SVI', 'Cryptocurrency_SVI',  'No. of Active Addresses', \
                                 'No. of Unique Addresses', 'Transactions'])

for i, row in X_test.iterrows():
    if (pd.isna(X_test.loc[i].at["Favorites"])):
        data = tweets[tweets['Month'] == X_test.loc[i].at["Month"]]
        new_row = {'Time': X_test.loc[i].at["Time"], 'Month': X_test.loc[i].at["Month"], 'Favorites': data.Favorites.values[0], 'Retweets': data.Retweets.values[0], 'Compound': data.Compound.values[0], \
                   'Negative': data.Negative.values[0], 'Positive': data.Positive.values[0], 'Neutral': data.Neutral.values[0], 'Open': X_test.loc[i].at["Open"], \
                   'High': X_test.loc[i].at["High"], 'Low': X_test.loc[i].at["Low"], 'Volume': X_test.loc[i].at["Volume"], 'Funding Rate 0000': X_test.loc[i].at["Funding Rate 0000"], \
                   'Funding Rate 0800': X_test.loc[i].at["Funding Rate 0800"], 'Funding Rate 1600': X_test.loc[i].at["Funding Rate 1600"], 'Bitcoin_SVI': X_test.loc[i].at["Bitcoin_SVI"], \
                   'Cryptocurrency_SVI': X_test.loc[i].at["Cryptocurrency_SVI"], 'No. of Active Addresses': X_test.loc[i].at["No. of Active Addresses"], 'No. of Unique Addresses': X_test.loc[i].at["No. of Unique Addresses"], \
                   'Transactions': X_test.loc[i].at["Transactions"]}
        df_X_test = df_X_test.append(new_row, ignore_index=True)
    else:
        new_row = {'Time': X_test.loc[i].at["Time"], 'Month': X_test.loc[i].at["Month"], 'Favorites': X_test.loc[i].at["Favorites"], 'Retweets': X_test.loc[i].at["Retweets"], 'Compound': X_test.loc[i].at["Compound"], \
                   'Negative': X_test.loc[i].at["Negative"], 'Positive': X_test.loc[i].at["Positive"], 'Neutral': X_test.loc[i].at["Neutral"], 'Open': X_test.loc[i].at["Open"], \
                   'High': X_test.loc[i].at["High"], 'Low': X_test.loc[i].at["Low"], 'Volume': X_test.loc[i].at["Volume"], 'Funding Rate 0000': X_test.loc[i].at["Funding Rate 0000"], \
                   'Funding Rate 0800': X_test.loc[i].at["Funding Rate 0800"], 'Funding Rate 1600': X_test.loc[i].at["Funding Rate 1600"], 'Bitcoin_SVI': X_test.loc[i].at["Bitcoin_SVI"], \
                   'Cryptocurrency_SVI': X_test.loc[i].at["Cryptocurrency_SVI"], 'No. of Active Addresses': X_test.loc[i].at["No. of Active Addresses"], 'No. of Unique Addresses': X_test.loc[i].at["No. of Unique Addresses"], \
                   'Transactions': X_test.loc[i].at["Transactions"]}
        df_X_test = df_X_test.append(new_row, ignore_index=True)


In [123]:
X_test = df_X_test
X_test = X_test.drop(['Month'], axis=1)
X_test

Unnamed: 0,Time,Favorites,Retweets,Compound,Negative,Positive,Neutral,Open,High,Low,Volume,Funding Rate 0000,Funding Rate 0800,Funding Rate 1600,Bitcoin_SVI,Cryptocurrency_SVI,No. of Active Addresses,No. of Unique Addresses,Transactions
0,20211129.0,74.000000,14.000000,0.616600,0.000000,0.212000,0.788000,57275.5,58882.3,56664.5,4.083367e+08,0.01,0.01,0.01,14.25,43.47,993217.0,727580.0,301297.0
1,20200411.0,414.250000,96.250000,0.211225,0.015250,0.067000,0.917500,6859.0,6940.1,6762.4,1.055993e+07,0.01,0.01,0.01,7.11,4.88,634944.0,474445.0,247852.0
2,20210215.0,74.400000,21.400000,0.070720,0.063800,0.078800,0.857400,48587.2,49003.6,42841.6,1.889901e+08,0.15,0.12,0.04,26.28,24.80,1140491.0,769869.0,308496.0
3,20210612.0,1111.875000,148.500000,0.328937,0.067875,0.160125,0.772000,37332.6,37453.7,34625.5,2.198889e+08,0.00,-0.01,0.01,14.31,21.11,1013611.0,584529.0,222774.0
4,20200704.0,303.750000,79.000000,0.227450,0.000000,0.065250,0.934750,9059.0,9187.6,9044.3,1.136593e+07,0.01,0.01,0.01,3.81,3.64,799989.0,561108.0,288647.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,20201216.0,487.428571,56.571429,0.133693,0.058571,0.110143,0.831286,19425.5,21550.0,19283.7,5.499847e+07,0.01,0.01,0.01,24.48,9.40,1098912.0,751162.0,315217.0
215,20211124.0,1829.777778,203.222222,0.451078,0.000000,0.141778,0.858222,57540.2,57722.3,55868.0,4.577194e+08,0.01,0.01,0.01,16.20,58.24,1071921.0,737069.0,298909.0
216,20201105.0,614.000000,118.000000,0.296000,0.000000,0.109000,0.891000,14141.3,15746.2,14095.4,7.171100e+07,-0.02,0.01,0.01,2.66,0.37,1071518.0,729209.0,318668.0
217,20200924.0,63.500000,17.000000,0.121117,0.000000,0.035000,0.965000,10241.7,10792.5,10194.2,2.988144e+07,-0.01,0.00,0.01,4.32,4.33,943668.0,684217.0,316552.0


In [124]:
X_train = df_X_train
X_train = X_train.drop(['Month'], axis=1)
X_train

Unnamed: 0,Time,Favorites,Retweets,Compound,Negative,Positive,Neutral,Open,High,Low,Volume,Funding Rate 0000,Funding Rate 0800,Funding Rate 1600,Bitcoin_SVI,Cryptocurrency_SVI,No. of Active Addresses,No. of Unique Addresses,Transactions
0,20210117.0,1769.357143,184.857143,0.241271,0.002214,0.134714,0.863143,35992.7,36843.6,33804.9,1.223727e+08,0.07,0.03,0.03,19.08,13.01,1002014.0,642730.0,271874.0
1,20210627.0,1111.875000,148.500000,0.328937,0.067875,0.160125,0.772000,32283.7,34752.4,32001.2,2.055358e+08,-0.01,-0.02,-0.05,12.96,17.35,518219.0,379066.0,124640.0
2,20211005.0,106.000000,32.000000,-0.401900,0.145000,0.068000,0.787000,49223.7,51887.0,49034.7,4.332737e+08,0.01,0.01,0.00,15.61,33.58,1037679.0,729159.0,312538.0
3,20210822.0,3277.407576,582.525521,0.321660,0.036112,0.140890,0.822999,48821.3,49490.0,48034.0,2.785088e+08,0.02,0.01,0.01,11.88,23.54,715639.0,474281.0,190631.0
4,20200430.0,994.000000,153.000000,0.000000,0.000000,0.000000,1.000000,8777.5,9455.9,8388.0,7.696775e+07,0.01,0.01,0.10,13.20,5.22,947585.0,687395.0,321972.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,20211110.0,1124.000000,152.833333,0.333000,0.020167,0.119500,0.860333,66943.0,69028.0,62540.0,6.084604e+08,0.07,0.04,0.07,20.88,46.85,1022756.0,743397.0,301025.0
507,20210919.0,1870.122024,278.058036,0.376858,0.009379,0.122107,0.868536,48294.0,48366.5,46816.2,1.873398e+08,0.01,0.01,0.01,11.88,22.92,714326.0,501580.0,200516.0
508,20220219.0,352.000000,53.000000,-0.542300,0.238000,0.139000,0.623000,39976.6,40454.0,39632.0,1.500925e+08,0.00,0.01,0.00,11.40,20.63,839236.0,573700.0,220673.0
509,20211206.0,751.500000,32.000000,0.681350,0.000000,0.277500,0.722500,49398.5,50900.0,47120.3,6.261578e+08,0.01,0.01,0.01,13.98,33.38,1078373.0,701524.0,284467.0


### Unique Addresses

In [125]:
KNN_miss_filling = KNNImputer(n_neighbors=2).fit(X_train)
X_train = pd.DataFrame(KNN_miss_filling.transform(X_train), columns=feature_pool)
X_test = pd.DataFrame(KNN_miss_filling.transform(X_test), columns=feature_pool)

In [126]:
print(X_train.isna().sum())
print(X_test.isna().sum())

Time                       0
Favorites                  0
Retweets                   0
Compound                   0
Negative                   0
Positive                   0
Neutral                    0
Open                       0
High                       0
Low                        0
Volume                     0
Funding Rate 0000          0
Funding Rate 0800          0
Funding Rate 1600          0
Bitcoin_SVI                0
Cryptocurrency_SVI         0
No. of Active Addresses    0
No. of Unique Addresses    0
Transactions               0
dtype: int64
Time                       0
Favorites                  0
Retweets                   0
Compound                   0
Negative                   0
Positive                   0
Neutral                    0
Open                       0
High                       0
Low                        0
Volume                     0
Funding Rate 0000          0
Funding Rate 0800          0
Funding Rate 1600          0
Bitcoin_SVI                0
C

In [127]:
X_test.to_csv('../data/X_test.csv', index=False)
X_train.to_csv('../data/X_train.csv', index=False)
y_train.to_csv('../data/Y_train.csv', index=False)
y_test.to_csv('../data/Y_test.csv', index=False)

## Preprocessing

In [128]:
# Standardized features:
standardized_features = ['Favorites','Retweets','Compound','Negative','Positive','Neutral','Bitcoin_SVI']     
non_standardized_features = list(set(feature_pool)-set(standardized_features))
    
# Get the scaler based on train set
scaler = preprocessing.StandardScaler().fit(X_train[standardized_features])
print('The mean and standard deviation of training set:', scaler.mean_, scaler.scale_)

# Standardize the features of train and test sets by the scaler of training set
X_train_std=pd.DataFrame(scaler.transform(X_train[standardized_features]))  # transform() return 'numpy.ndarray', not 'DataFrame' or 'Series'
X_train_nstd=pd.DataFrame(X_train[non_standardized_features])

X_test_std=pd.DataFrame(scaler.transform(X_test[standardized_features]))  # return 'numpy.ndarray', not 'DataFrame' or 'Series'
X_test_nstd=pd.DataFrame(X_test[non_standardized_features])

# Rename the columns of X_train_std and X_test_std
X_train_std.columns = X_train_std.columns.map(lambda x: standardized_features[x])
X_test_std.columns = X_test_std.columns.map(lambda x: standardized_features[x])

# Re-combine the normalized and non-normalized data together for both train ans test sets (3 pts)
X_train_std.reset_index(drop=True, inplace=True)
X_train_nstd.reset_index(drop=True, inplace=True)
X_train = pd.concat([X_train_std,X_train_nstd], sort=False,axis=1)
X_test_std.reset_index(drop=True, inplace=True)
X_test_nstd.reset_index(drop=True, inplace=True)
X_test = pd.concat([X_test_std,X_test_nstd], sort=False,axis=1)

# Update the order within feature pool
feature_pool = list(X_train.columns)

The mean and standard deviation of training set: [1.12319708e+03 1.78854921e+02 2.71157625e-01 2.46952783e-02
 1.15790902e-01 8.59519204e-01 1.31478278e+01] [1.33102968e+03 2.10454468e+02 2.22077480e-01 3.59520821e-02
 6.64394072e-02 7.48734588e-02 7.79264349e+00]


In [129]:
X_test.to_csv('../data/X_test_scaled.csv', index=False)
X_train.to_csv('../data/X_train_scaled.csv', index=False)
y_train.to_csv('../data/Y_train_scaled.csv', index=False)
y_test.to_csv('../data/Y_test_scaled.csv', index=False)