# Predicting Change in Daily Bitcoin Prices using Social Media Interest Measures

Here, we train a classifier to use Daily twitter sentiment scores, daily reddit sentiment scores, and daily google trend scores to categorize the change in bitcoin prices into one of 5 categories: highly negative (-2), moderately negative (-1), no commendable change (0), moderately positive (1), highly negative (2)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Prepare Data

In [None]:
#Import Data

bitcoin = pd.read_csv('categorical_bitcoin_price.csv', sep='\t', index_col=0)
reddit = pd.read_csv('redditDailySentiment.csv', sep='\t', index_col=None)
trends= pd.read_csv('trends_daily_score.csv', sep='\t', index_col=0)
twitter = pd.read_csv('twitterSentiment.csv', sep=',', index_col=None)

In [None]:
bitcoin.head()

Unnamed: 0,Date,open,high,low,close,volume,Diff_Open,Diff_Category
0,2021-01-01,28923.63,29600.0,28624.57,29331.69,54182.925011,,0
1,2021-01-02,29331.7,33300.0,28946.53,32178.33,129993.873362,408.07,0
2,2021-01-03,32176.45,34778.11,31962.99,33000.05,120957.56675,2844.75,1
3,2021-01-04,33000.05,33600.0,28130.0,31988.71,140899.88569,823.6,1
4,2021-01-05,31989.75,34360.0,29900.0,33949.53,116049.997038,-1010.3,-1


In [None]:
reddit.head()

Unnamed: 0,Day,neg,neu,pos,compound
0,2021-01-01,0.05239,0.771753,0.172513,0.238452
1,2021-01-02,0.057414,0.811156,0.121446,0.135624
2,2021-01-03,0.063241,0.78381,0.148659,0.152421
3,2021-01-04,0.061042,0.815152,0.121297,0.167221
4,2021-01-05,0.06179,0.801428,0.135118,0.147205


In [None]:
trends.head()

Unnamed: 0,date,avg_score,weighted_score
0,2021-01-01,12.660625,14.447561
1,2021-01-02,34.144375,37.360744
2,2021-01-03,48.853125,53.128124
3,2021-01-04,32.544375,35.352064
4,2021-01-05,24.34125,26.472194


In [None]:
twitter.head()

Unnamed: 0,Day,neg,neu,pos,compound
0,2021-02-05,0.030948,0.892471,0.076579,0.125674
1,2021-02-06,0.028127,0.896185,0.07568,0.129964
2,2021-02-07,0.028718,0.890963,0.080317,0.13965
3,2021-02-08,0.025719,0.887855,0.086424,0.157384
4,2021-02-09,0.027866,0.890614,0.081516,0.149618


In [None]:
#Rename merging column ("Date")
reddit = reddit.rename(columns={"Day": "Date"})
trends = trends.rename(columns={"date": "Date"})
twitter = twitter.rename(columns={"Day": "Date"})

In [None]:
#Combine Data into dataframe

data = pd.DataFrame()
data["Date"] = bitcoin["Date"]
data = data.merge(twitter[['Date','compound']], on='Date',how='outer')
data = data.merge(reddit[['Date','compound']], on='Date', how='outer')
data = data.merge(trends[['Date', 'avg_score']], on='Date',how='outer')
data = data.merge(bitcoin[['Date','Diff_Category']], on='Date',how='outer')
data.head()

Unnamed: 0,Date,compound_x,compound_y,avg_score,Diff_Category
0,2021-01-01,,0.238452,12.660625,0.0
1,2021-01-02,,0.135624,34.144375,0.0
2,2021-01-03,,0.152421,48.853125,1.0
3,2021-01-04,,0.167221,32.544375,1.0
4,2021-01-05,,0.147205,24.34125,-1.0


In [None]:
#Rename columns, drop unnecessary columns
'''
data = data.drop(['weighted_score', 'high', 'low','close','volume'],axis=1)
data = data.rename(columns={'neg_x':'neg_twitter', 'neu_x':'neu_twitter','pos_x':'pos_twitter','compound_x':'compound_twitter','neg_y':'neg_reddit', 'neu_y':'neu_reddit','pos_y':'pos_reddit','compound_y':'compound_reddit','avg_score':'avg_trend','Diff_Open':'diff_open','Diff_Category':'label'})
data.head()
'''

"\ndata = data.drop(['weighted_score', 'high', 'low','close','volume'],axis=1)\ndata = data.rename(columns={'neg_x':'neg_twitter', 'neu_x':'neu_twitter','pos_x':'pos_twitter','compound_x':'compound_twitter','neg_y':'neg_reddit', 'neu_y':'neu_reddit','pos_y':'pos_reddit','compound_y':'compound_reddit','avg_score':'avg_trend','Diff_Open':'diff_open','Diff_Category':'label'})\ndata.head()\n"

In [None]:
data['Diff_Category'].value_counts()

-1.0    88
 1.0    84
 0.0    59
 2.0     8
-2.0     4
Name: Diff_Category, dtype: int64

In [None]:
#Drop all values for which we dont have the bitcoin price category
data = data.dropna(subset=['Diff_Category'])
#drop all values for which both twitter and reddit scores are missing
data = data.dropna(thresh=4)
#Fill NAN values with the mean
data = data.fillna(data.mean())
data.tail()

Unnamed: 0,Date,compound_x,compound_y,avg_score,Diff_Category
232,2021-08-21,0.263556,0.119886,18.431875,1.0
234,2021-08-23,0.231269,0.119886,27.978125,0.0
235,2021-08-24,0.283053,0.119886,19.946875,0.0
236,2021-08-25,0.383225,0.119886,15.628125,-1.0
237,2021-08-26,0.321619,0.119886,13.03125,1.0


In [None]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound_x','compound_y','avg_score']]
y = data[['Diff_Category']]

#we need to use the y value for the NEXT day as the classification variable
X = X.iloc[:-1,:]
y= y.iloc[1:,:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state=7)

## Training the Random Forest Classifier

Reference: https://www.geeksforgeeks.org/random-forest-classifier-using-scikit-learn/

In [None]:
#Train Model
clf = RandomForestClassifier(n_estimators = 10) 
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
#Reference: https://scikit-learn.org/stable/modules/model_evaluation.html

#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.5454545454545454
F1-Score:  0.5454545454545454
Precision:  0.7532467532467532
Recall:  0.5454545454545454


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Training a Support Vector Classifier

Reference: https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

In [None]:
#Train Model
clf = SVC(kernel="linear", C=0.025)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [None]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.2727272727272727
F1-Score:  0.11688311688311687
Precision:  0.07438016528925619
Recall:  0.2727272727272727


  _warn_prf(average, modifier, msg_start, len(result))


## Decision Tree Classifier

In [158]:
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [159]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.45454545454545453
F1-Score:  0.4646464646464646
Precision:  0.7727272727272727
Recall:  0.45454545454545453


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Hyperparamter Tuning on Random Forest Classifier

In [161]:
#Train Model
clf = RandomForestClassifier(n_estimators = 10) 
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.5454545454545454
F1-Score:  0.5454545454545454
Precision:  0.6000000000000001
Recall:  0.5454545454545454


  This is separate from the ipykernel package so we can avoid doing imports until
  _warn_prf(average, modifier, msg_start, len(result))


In [162]:
# Current parameters in use
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(clf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [163]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [False, True],
    'max_depth': [10,50,5],
    'max_features': [2, 3, 'auto'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2,3, 5, 7],
    'n_estimators': [10,20,30]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [164]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [False, True], 'max_depth': [10, 50, 5],
                         'max_features': [2, 3, 'auto'],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 3, 5, 7],
                         'n_estimators': [10, 20, 30]},
             verbose=2)

In [165]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 10,
 'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 7,
 'n_estimators': 10}

In [169]:
#Train Model
clf = RandomForestClassifier(bootstrap=False, max_depth=10, max_features=3 , min_samples_leaf=1, min_samples_split=7, n_estimators = 10) 
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy: 0.56625636363636365
F1-Score: 0.48038961038961036
Precision: 0.5890606060606060
Recall: 0.41373636363636365


# 3-way Classification

Here, we attempt to categorize change in bitcoin prices to one of 3 categories: negative change (-1), no change (0), positive change (1)

In [None]:
data.head()

Unnamed: 0,Date,compound_x,compound_y,avg_score,Diff_Category
0,2021-01-01,0.208399,0.238452,12.660625,0.0
1,2021-01-02,0.208399,0.135624,34.144375,0.0
2,2021-01-03,0.208399,0.152421,48.853125,1.0
3,2021-01-04,0.208399,0.167221,32.544375,1.0
4,2021-01-05,0.208399,0.147205,24.34125,-1.0


In [None]:
#Transform Label
data.loc[data['Diff_Category'] >0, 'label_3'] = 1  
data.loc[data['Diff_Category'] <0, 'label_3'] = -1  
data.loc[data['Diff_Category'] ==0, 'label_3'] = 0

data.head()

Unnamed: 0,Date,compound_x,compound_y,avg_score,Diff_Category,label_3
0,2021-01-01,0.208399,0.238452,12.660625,0.0,0.0
1,2021-01-02,0.208399,0.135624,34.144375,0.0,0.0
2,2021-01-03,0.208399,0.152421,48.853125,1.0,1.0
3,2021-01-04,0.208399,0.167221,32.544375,1.0,1.0
4,2021-01-05,0.208399,0.147205,24.34125,-1.0,-1.0


In [None]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound_x','compound_y','avg_score']]
y = data[['label_3']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05)

In [None]:
#Train Decision Tree
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [None]:
#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.2727272727272727
F1-Score:  0.25757575757575757
Precision:  0.36363636363636365
Recall:  0.2727272727272727



# Predicting Change in Hourly Bitcoin Prices using Social Media Interest Measures
Here, we train a classifier to use Daily hourly sentiment scores, hourly reddit sentiment scores, and hourly google trend scores to categorize the change in bitcoin prices into one of 3 categories: negative (-1), no commendable change (0), positive (1)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Prepare Data

In [None]:
#Import Data

bitcoin = pd.read_csv('categorical_bitcoin_hourly.csv', sep='\t', index_col=None)
reddit = pd.read_csv('redditHourlySentiment.csv', sep='\t', index_col=None)
trends= pd.read_csv('trends_hourly_score.csv', sep='\t', index_col=None)
twitter = pd.read_csv('twitterSentiment_hourly.csv', sep=',', index_col=None)

In [None]:
bitcoin.tail()

Unnamed: 0,date,open,high,low,close,Volume BTC,Volume USD,Diff_Open,Diff_Category
5827,2021-01-01 04:00:00,29351.85,29458.27,29121.8,29290.38,325.5657,9535943.0,57.27,0
5828,2021-01-01 03:00:00,29249.33,29414.48,29216.61,29345.63,284.146568,8338460.0,-102.52,0
5829,2021-01-01 02:00:00,29484.39,29535.95,29189.34,29249.33,290.242637,8489403.0,235.06,1
5830,2021-01-01 01:00:00,29070.66,29543.93,29010.36,29479.12,648.597652,19120090.0,-413.73,-1
5831,2021-01-01 00:00:00,28999.63,29099.0,28774.64,29056.94,397.964775,11563640.0,-71.03,0


In [None]:
reddit = reddit.rename(columns={'Hour': 'date'})
reddit.tail()

Unnamed: 0,date,neg,neu,pos,compound
1547,2021-08-30 23:00:00,0.019,0.862,0.119,0.7579
1548,2021-08-31 00:00:00,0.019,0.862,0.119,0.7579
1549,2021-08-31 01:00:00,0.019,0.862,0.119,0.7579
1550,2021-08-31 02:00:00,0.019,0.862,0.119,0.7579
1551,2021-08-31 03:00:00,0.019,0.862,0.119,0.7579


In [None]:
trends.tail()

Unnamed: 0,date,bitcoin,btc,bitcoin price,bitcoin kurs,bitcoin usd,bitcoin stock,bitcoin dollar,bitcoin euro,buy bitcoin,buy btc,btc usd,btc inr,price btc,btc stock,btc coin,btc euro,avg_score
5827,2021-08-31 19:00:00,12.07,14.07,9.9,8.36,10.88,8.1,15.96,21.76,12.6,5.89,11.76,3.8,6.24,7.81,9.01,28.86,11.691875
5828,2021-08-31 20:00:00,12.07,14.07,9.9,8.36,10.88,8.1,15.96,21.76,12.6,5.89,11.76,3.8,6.24,7.81,9.01,28.86,11.691875
5829,2021-08-31 21:00:00,12.07,14.07,9.9,8.36,10.88,8.1,15.96,21.76,12.6,5.89,11.76,3.8,6.24,7.81,9.01,28.86,11.691875
5830,2021-08-31 22:00:00,12.07,14.07,9.9,8.36,10.88,8.1,15.96,21.76,12.6,5.89,11.76,3.8,6.24,7.81,9.01,28.86,11.691875
5831,2021-08-31 23:00:00,12.07,14.07,9.9,8.36,10.88,8.1,15.96,21.76,12.6,5.89,11.76,3.8,6.24,7.81,9.01,28.86,11.691875


In [None]:
twitter.tail(20)

Unnamed: 0,Hour,neg,neu,pos,compound
1971,2021-11-19 15:00:00,0.032943,0.864558,0.102496,0.213237
1972,2021-11-19 16:00:00,0.026469,0.872561,0.100972,0.236397
1973,2021-11-19 17:00:00,0.032144,0.852734,0.115126,0.258893
1974,2021-11-19 18:00:00,0.040772,0.854975,0.10425,0.179953
1975,2021-11-19 19:00:00,0.031683,0.865458,0.102859,0.233449
1976,2021-11-19 20:00:00,0.028347,0.871458,0.100195,0.241766
1977,2021-11-19 21:00:00,0.028735,0.879008,0.092264,0.215446
1978,2021-11-19 22:00:00,0.033254,0.865578,0.101166,0.244083
1979,2021-11-19 23:00:00,0.026742,0.875645,0.097604,0.228104
1980,"['Airdrop', 'Airdrops', 'Airdropinspector', 'B...",0.0,1.0,0.0,0.0


In [None]:
twitter = twitter.iloc[:1980,:]
twitter = twitter.rename(columns={'Hour':'date'})

In [None]:
twitter.tail()

Unnamed: 0,date,neg,neu,pos,compound
1975,2021-11-19 19:00:00,0.031683,0.865458,0.102859,0.233449
1976,2021-11-19 20:00:00,0.028347,0.871458,0.100195,0.241766
1977,2021-11-19 21:00:00,0.028735,0.879008,0.092264,0.215446
1978,2021-11-19 22:00:00,0.033254,0.865578,0.101166,0.244083
1979,2021-11-19 23:00:00,0.026742,0.875645,0.097604,0.228104


In [None]:
print(bitcoin.shape[0])
print(trends.shape[0])
print(reddit.shape[0])
print(twitter.shape[0])

5832
5832
1552
1980


In [None]:
#Combine Data into dataframe using outer join


data = pd.DataFrame()
data["date"] = bitcoin["date"]
data = data.merge(bitcoin[['date','Diff_Category']], on='date')
data = data.merge(trends[['date','avg_score']], on='date', how='outer')
data = data.merge(reddit[['date','compound']], on='date', how='outer')
data = data.merge(twitter[['date','compound']], on='date', how='outer')

data.tail()

Unnamed: 0,date,Diff_Category,avg_score,compound_x,compound_y
6179,2021-11-19 19:00:00,,,,0.233449
6180,2021-11-19 20:00:00,,,,0.241766
6181,2021-11-19 21:00:00,,,,0.215446
6182,2021-11-19 22:00:00,,,,0.244083
6183,2021-11-19 23:00:00,,,,0.228104


In [None]:
#Drop all values for which we dont have the bitcoin price category
data = data.dropna(subset=['Diff_Category'])
data.tail()

Unnamed: 0,date,Diff_Category,avg_score,compound_x,compound_y
5827,2021-01-01 04:00:00,0.0,12.660625,,
5828,2021-01-01 03:00:00,0.0,12.660625,,
5829,2021-01-01 02:00:00,1.0,12.660625,,
5830,2021-01-01 01:00:00,-1.0,12.660625,,
5831,2021-01-01 00:00:00,0.0,12.660625,,


In [None]:
#drop all values for which both twitter and reddit scores are missing
data = data.dropna(thresh=4)
data.tail()

Unnamed: 0,date,Diff_Category,avg_score,compound_x,compound_y
5795,2021-01-02 12:00:00,-1.0,34.144375,0.135536,
5796,2021-01-02 11:00:00,0.0,34.144375,0.265852,
5803,2021-01-02 04:00:00,0.0,34.144375,0.087468,
5811,2021-01-01 20:00:00,-1.0,12.660625,0.232856,
5819,2021-01-01 12:00:00,0.0,12.660625,0.249644,


In [None]:
#Fill NAN values with the mean
data = data.fillna(data.mean())
data.tail()

Unnamed: 0,date,Diff_Category,avg_score,compound_x,compound_y
5795,2021-01-02 12:00:00,-1.0,34.144375,0.135536,0.208859
5796,2021-01-02 11:00:00,0.0,34.144375,0.265852,0.208859
5803,2021-01-02 04:00:00,0.0,34.144375,0.087468,0.208859
5811,2021-01-01 20:00:00,-1.0,12.660625,0.232856,0.208859
5819,2021-01-01 12:00:00,0.0,12.660625,0.249644,0.208859


In [None]:
#get training and test data
#Using only compound sentiment score, and average google trend score
X = data[['compound_x','compound_y','avg_score']]
y = data[['Diff_Category']]

#we need to use the y value for the NEXT hour as the classification variable
X = X.iloc[:-1,:]
y= y.iloc[1:,:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state=7)

In [None]:
#trying with all attributes
#get training and test data
'''
cols = data.columns
X = data[cols[2:]]
y = data[['Diff_Category']]

#we need to use the y value for the NEXT hour as the classification variable
X = X.iloc[:-1,:]
y= y.iloc[1:,:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=7)
'''

"\ncols = data.columns\nX = data[cols[2:]]\ny = data[['Diff_Category']]\n\n#we need to use the y value for the NEXT hour as the classification variable\nX = X.iloc[:-1,:]\ny= y.iloc[1:,:]\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=7)\n"

## Training a Decision Tree Classifier

In [None]:
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

In [None]:
#Evaluate Model
print("Train Accuracy: ",clf.score(X_train, y_train) )
print("Test Accuracy: ", metrics.accuracy_score(y_test.values, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Train Accuracy:  0.46709470304975925
Test Accuracy:  0.5378787878787878
F1-Score:  0.4898744824016563
Precision:  0.4933833217081298
Recall:  0.5378787878787878


## Training a Random Forrest Classifier

In [None]:
#Train Model
clf = RandomForestClassifier(n_estimators = 5) 
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.4090909090909091
F1-Score:  0.41714956583377627
Precision:  0.4310430353057081
Recall:  0.4090909090909091


  This is separate from the ipykernel package so we can avoid doing imports until


## Training an SVM Classifier

In [None]:
#Train Model
clf = SVC(kernel="linear", C=0.025)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)


#Evaluate Model
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

  y = column_or_1d(y, warn=True)


Accuracy:  0.5
F1-Score:  0.37482399755127027
Precision:  0.3537401863673049
Recall:  0.5


  _warn_prf(average, modifier, msg_start, len(result))


## Hyperparamter Tuning for Decision Tree Model

In [None]:
#Train Model
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

#Evaluate Model
print("Train Accuracy: ",clf.score(X_train, y_train) )
print("Test Accuracy: ", metrics.accuracy_score(y_test.values, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Train Accuracy:  0.46709470304975925
Test Accuracy:  0.5378787878787878
F1-Score:  0.4898744824016563
Precision:  0.4933833217081298
Recall:  0.5378787878787878


In [None]:
# Current parameters in use
from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(clf.get_params())

Parameters currently in use:

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}


In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [5,10,15],
    'max_features': [None, 2, 3],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2,3,5],
    'splitter' : ['best','random']
}
# Create a based model
clf = DecisionTreeClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(max_depth=5), n_jobs=-1,
             param_grid={'max_depth': [5, 10, 15], 'max_features': [None, 2, 3],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 3, 5],
                         'splitter': ['best', 'random']},
             verbose=2)

In [None]:
grid_search.best_params_

{'max_depth': 5,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 5,
 'splitter': 'random'}

In [None]:
#Train Model
clf = DecisionTreeClassifier(max_depth=5, max_features=2, min_samples_leaf=3, min_samples_split=5, splitter='random')
clf.fit(X_train, y_train)

#Predict
y_pred = clf.predict(X_test)

#Evaluate Model
print("Train Accuracy: ",clf.score(X_train, y_train) )
print("Test Accuracy: ", metrics.accuracy_score(y_test.values, y_pred))
print("F1-Score: ", metrics.f1_score(y_test, y_pred, average='weighted'))
print("Precision: ", metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall: ", metrics.recall_score(y_test, y_pred, average='weighted'))

Train Accuracy: 0.4886035313001605
Test Accuracy: 0.5878787878787878
F1-Score: 0.4956976396415797
Precision: 0.43320335011557585
Recall: 0.5578787878787878
