<a href="https://colab.research.google.com/github/alisdghnia/Crypto-Whitepaper-Research/blob/main/BERT_Classification_%26_Conv1D_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import normalize

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import SGDRegressor

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

import re
import os
import tqdm
import string

In [3]:
try:
    from google.colab import drive
    drive.mount('/content/drive/')
    ROOT_PATH = '/content/drive/MyDrive/'
except ImportError:
    ROOT_PATH = '../'
DATA_FOLDER = os.path.join(ROOT_PATH, 'data/public')

Mounted at /content/drive/


In [4]:
research_path = DATA_FOLDER + '/CryptoData/'

In [5]:
text_data = pd.read_hdf(os.path.join(research_path,'all text.h5'))
meta_data = pd.read_csv(os.path.join(research_path,'Metadata 1838 Whitepapers.csv'))
success = pd.read_csv(os.path.join(research_path, 'Coin-Price-Match 0-1838 - ALL.csv'))

In [6]:
text_data.reset_index(inplace=True)
text_data.rename(columns= {'index' : 'Name'}, inplace=True)

In [7]:
wordCount = []
for i in range(len(text_data)):
    wordCount.append(len(text_data.test[i]))

meta_data['Word Count'] = wordCount
meta_data.Name = meta_data.Name.apply(lambda x: x.replace('.pdf', ''))

In [8]:
df_meta_text = pd.merge(meta_data, text_data, left_on= 'Name', right_on= 'Name')

In [9]:
df_meta_text = df_meta_text[df_meta_text['Flesch Reading Ease'] > 0]
df_meta_text = df_meta_text[df_meta_text['Flesch Reading Ease'] < 100]

In [10]:
df_meta_text = df_meta_text[df_meta_text['New Dale-Chall'] > 0]
df_meta_text = df_meta_text[df_meta_text['New Dale-Chall'] < 10]

In [11]:
df_meta_text.reset_index(inplace=True)
df_meta_text.drop(columns = ['Unnamed: 0'], inplace=True)

In [12]:
df_meta_text.drop(columns = ['index'], inplace=True)

In [13]:
success.drop(columns= 'Unnamed: 0', inplace=True)
success.coin_whitepaper_name = success.coin_whitepaper_name.apply(lambda x: x.replace('.pdf', ''))

In [14]:
def ConvertInt(x):
    try:
        if type(x) == str:
            x = x.replace('$','')
            x = x.replace(',','')
            x = x.replace('Nan', '0')
            x = x.replace("- -", '0')
            x = x.strip()
            x = float(x)
    except Exception:
        x = 0

    return x

In [15]:
success_copy = success.copy()

In [16]:
success_copy.market_cap = success_copy.market_cap.apply(lambda x: ConvertInt(x))
success_copy.market_cap.fillna(value=0, inplace=True)
success_copy.market_cap = pd.to_numeric(success_copy.market_cap)

In [17]:
success_copy.volume = success_copy.volume.apply(lambda x: ConvertInt(x))
success_copy.volume.fillna(value=0, inplace=True)
success_copy.volume = pd.to_numeric(success_copy.volume)

In [18]:
success_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1838 entries, 0 to 1837
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   coin_whitepaper_name      1838 non-null   object 
 1   coin_name_on_web          1820 non-null   object 
 2   coin_symbol               1820 non-null   object 
 3   coin_price                1838 non-null   object 
 4   market_cap                1838 non-null   float64
 5   volume                    1838 non-null   float64
 6   max_supply                1286 non-null   object 
 7   total_supply              1286 non-null   object 
 8   fully_diluted_market_cap  1298 non-null   object 
dtypes: float64(2), object(7)
memory usage: 129.4+ KB


In [19]:
all_df = pd.merge(df_meta_text, success_copy, left_on = 'Name', right_on = 'coin_whitepaper_name')

In [20]:
mc_median = np.median(all_df.market_cap)
mc_median

168140.5

In [21]:
volume_median = np.median(all_df.volume)
volume_median

80.5

In [22]:
all_df['success_marketcap_median'] = np.where(all_df.market_cap < mc_median, 0, 1)
# all_df['success_price_median'] = np.where(all_df.coin_price <= price_median, 0, 1)
all_df['success_volume_median'] = np.where(all_df.volume < volume_median, 0, 1)

In [23]:
df = all_df.loc[:, ['test', 'success_marketcap_median']]

In [24]:
df

Unnamed: 0,test,success_marketcap_median
0,​v bitbase​ ​whitepaper​ ​ new​ ​escrow​ ​bloc...,0
1,table of contents abstract introduction nrp to...,0
2,idena concept paper idena concept paper draft ...,1
3,w t p a p r november version new generation we...,0
4,chain decentralizing storage saswata basu tom ...,1
...,...,...
1695,version the core of cartesi augusto teixeira d...,1
1696,bitcoen white paper contents short description...,1
1697,social betting networkpowered by blockchain ta...,0
1698,white paper introduccidon blockchain review ma...,0


In [25]:
df.groupby('success_marketcap_median').describe()

Unnamed: 0_level_0,test,test,test,test
Unnamed: 0_level_1,count,unique,top,freq
success_marketcap_median,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,850,792,white paper v abstract in the white paper we a...,3
1,850,789,bitcoin a peertopeer electronic cash system sa...,6


In [26]:
df['success_marketcap_median'].value_counts()

0    850
1    850
Name: success_marketcap_median, dtype: int64

In [93]:
X_train, X_test, y_train, y_test = train_test_split(df['test'],df['success_marketcap_median'])

In [47]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")



In [48]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [49]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [50]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [51]:
model.fit(X_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd72e8041c0>

In [52]:
model.evaluate(X_test, y_test)



[0.6861155033111572, 0.522352933883667, 0.5096418857574463, 0.8809523582458496]

In [31]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score

In [54]:
y_predicted = model.predict(X_test)
y_predicted = y_predicted.flatten()
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted
cm = confusion_matrix(y_test, y_predicted)
cm



array([[ 37, 178],
       [ 25, 185]])

In [61]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.34      0.10      0.16       204
           1       0.50      0.81      0.62       221

    accuracy                           0.47       425
   macro avg       0.42      0.46      0.39       425
weighted avg       0.42      0.47      0.40       425



In [30]:
from keras.layers import Dense, Reshape, Input, Concatenate, BatchNormalization, Dropout, Conv1D, Flatten, MaxPooling1D, Activation, LeakyReLU, LSTM, Bidirectional
from keras.models import Model
from tqdm import tqdm

In [62]:
tfidf = TfidfVectorizer(max_df=5, min_df=5, lowercase= True, stop_words='english')
features = tfidf.fit_transform(df_meta_text['test']).toarray()
words = tfidf.get_feature_names()
# labels = df.category_id
print(features.shape)

tfidf_df = pd.DataFrame(features, columns=words)

(1700, 2690)




In [94]:
X = tfidf_df

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X,df['success_marketcap_median'])

In [81]:
features = features.reshape(features.shape[0], features.shape[1], 1)

In [83]:
features.shape

(1700, 2690, 1)

In [110]:
model = tf.keras.Sequential()

model.add(Conv1D(32, kernel_size = 3, activation = 'relu', input_shape = (2690,1)))
model.add(Dropout(0.2))

model.add(Conv1D(64, kernel_size = 3, activation = 'relu', input_shape = (1,)))
model.add(MaxPooling1D())
model.add(Dropout(0.2))

model.add(Conv1D(32, kernel_size = 3, activation = 'relu', input_shape = (1,)))
model.add(Dense(16, activation = 'relu'))
model.add(MaxPooling1D())
model.add(Dropout(0.2))

model.add(Flatten())

model.add(Dense(1, activation= 'sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics= METRICS)

In [111]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_26 (Conv1D)          (None, 2688, 32)          128       
                                                                 
 dropout_3 (Dropout)         (None, 2688, 32)          0         
                                                                 
 conv1d_27 (Conv1D)          (None, 2686, 64)          6208      
                                                                 
 max_pooling1d_13 (MaxPoolin  (None, 1343, 64)         0         
 g1D)                                                            
                                                                 
 dropout_4 (Dropout)         (None, 1343, 64)          0         
                                                                 
 conv1d_28 (Conv1D)          (None, 1341, 32)          6176      
                                                     

In [122]:
model.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd72382a580>

In [123]:
model.evaluate(X_test, y_test)



[0.509105384349823, 0.7835294008255005, 0.8061224222183228, 0.7452830076217651]

In [124]:
y_predicted = model.predict(X_test)
y_predicted = y_predicted.flatten()
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted
cm = confusion_matrix(y_test, y_predicted)
cm



array([[175,  38],
       [ 54, 158]])

In [125]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.76      0.82      0.79       213
           1       0.81      0.75      0.77       212

    accuracy                           0.78       425
   macro avg       0.79      0.78      0.78       425
weighted avg       0.79      0.78      0.78       425



In [136]:
X_train, X_test, y_train, y_test = train_test_split(X.iloc[:1200, :],df.loc[:1199, 'success_marketcap_median'])

In [137]:
model = tf.keras.Sequential()

model.add(Conv1D(32, kernel_size = 3, activation = 'relu', input_shape = (2690,1)))
model.add(Dropout(0.2))

model.add(Conv1D(64, kernel_size = 3, activation = 'relu', input_shape = (1,)))
model.add(MaxPooling1D())
model.add(Dropout(0.2))

model.add(Conv1D(32, kernel_size = 3, activation = 'relu', input_shape = (1,)))
model.add(Dense(16, activation = 'relu'))
model.add(MaxPooling1D())
model.add(Dropout(0.2))

model.add(Flatten())

model.add(Dense(1, activation= 'sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics= METRICS)

In [138]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_29 (Conv1D)          (None, 2688, 32)          128       
                                                                 
 dropout_6 (Dropout)         (None, 2688, 32)          0         
                                                                 
 conv1d_30 (Conv1D)          (None, 2686, 64)          6208      
                                                                 
 max_pooling1d_15 (MaxPoolin  (None, 1343, 64)         0         
 g1D)                                                            
                                                                 
 dropout_7 (Dropout)         (None, 1343, 64)          0         
                                                                 
 conv1d_31 (Conv1D)          (None, 1341, 32)          6176      
                                                     

In [139]:
for i in range(3):
    X_train, X_test, y_train, y_test = train_test_split(X.iloc[:1200, :],df.loc[:1199, 'success_marketcap_median'])
    model.fit(X_train, y_train, epochs=5)
    model.evaluate(X_test, y_test)
    i+=1

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [146]:
y_predicted = model.predict(X.iloc[1200:, :])
y_predicted = y_predicted.flatten()
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted
cm = confusion_matrix(df.loc[1200:, 'success_marketcap_median'], y_predicted)
cm



array([[133, 121],
       [135, 111]])

In [148]:
print(classification_report(df.loc[1200:, 'success_marketcap_median'], y_predicted))

              precision    recall  f1-score   support

           0       0.50      0.52      0.51       254
           1       0.48      0.45      0.46       246

    accuracy                           0.49       500
   macro avg       0.49      0.49      0.49       500
weighted avg       0.49      0.49      0.49       500



In [27]:
all_df

Unnamed: 0,Name,File Size (MB),Number of Pages,Word Count,Flesch Reading Ease,New Dale-Chall,Abstract Section,Summary Section,Reference Section,test,...,coin_name_on_web,coin_symbol,coin_price,market_cap,volume,max_supply,total_supply,fully_diluted_market_cap,success_marketcap_median,success_volume_median
0,Bitbase - Whitepaper,0.471933,10,11043,62.88,7.35,False,False,False,​v bitbase​ ​whitepaper​ ​ new​ ​escrow​ ​bloc...,...,BitBase Token,BTBS,$0.18,0.000000e+00,1.420400e+04,100000000,--,"$18,152,576",0,1
1,Neural Protocol - Whitepaper,0.900510,15,6505,52.90,7.93,False,False,False,table of contents abstract introduction nrp to...,...,Neural Protocol,NRP,$0.00,1.074150e+05,0.000000e+00,--,9500000000,"$107,437",0,0
2,Idena - Whitepaper,0.839485,14,16993,42.61,8.20,True,False,False,idena concept paper idena concept paper draft ...,...,Idena,IDNA,$0.01,7.561350e+05,2.432000e+04,--,92458524,"$1,121,725",1,1
3,LiveStars - Whitepaper,9.600061,25,31355,53.10,7.40,False,False,False,w t p a p r november version new generation we...,...,Live Stars,LIVE,$0.00,0.000000e+00,0.000000e+00,--,54722996,- -,0,0
4,0chain - Whitepaper,3.299664,19,66676,49.04,6.88,True,False,False,chain decentralizing storage saswata basu tom ...,...,0Chain,ZCN,$0.18,8.702623e+06,9.034700e+04,400000000,200000000,"$71,921,043",1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,Cartesi - Whitepaper,0.491359,16,86751,55.13,6.96,True,False,True,version the core of cartesi augusto teixeira d...,...,Cartesi,CTSI,$0.10,6.410724e+07,4.180223e+06,1000000000,1000000000,"$99,072,300",1,1
1696,BitCoen - Whitepaper,0.236475,27,38798,44.03,7.58,False,False,False,bitcoen white paper contents short description...,...,Bitcoin,BTC,"$16,174.04",3.107533e+11,2.723213e+10,21000000,19213087,"$339,654,858,021",1,1
1697,Marginless - Whitepaper,4.778409,36,47039,50.57,7.02,False,False,False,social betting networkpowered by blockchain ta...,...,Marginless,MRS,Nan,0.000000e+00,0.000000e+00,,,,0,0
1698,Cryptonex - Whitepaper,1.225157,17,31894,53.71,7.86,False,False,False,white paper introduccidon blockchain review ma...,...,Cryptonex,CNX,Nan,0.000000e+00,0.000000e+00,,,,0,0


In [84]:
metadata = all_df.iloc[:, 1:9]
metadata.iloc[:, 5:] = metadata.iloc[:, 5:]*1

In [53]:
X_train, X_test, y_train, y_test = train_test_split(metadata.iloc[:1200, :],all_df.loc[:1199, 'success_marketcap_median'])

In [54]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

In [89]:
model = tf.keras.Sequential()

model.add(Conv1D(32, kernel_size = 3, activation = 'relu', input_shape = (8,1)))
model.add(Dropout(0.2))

model.add(Conv1D(64, kernel_size = 3, activation = 'relu', input_shape = (1,)))
model.add(MaxPooling1D())
model.add(Dropout(0.2))

# model.add(Conv1D(32, kernel_size = 3, activation = 'relu', input_shape = (1,)))
model.add(Dense(16, activation = 'relu'))
model.add(MaxPooling1D())
model.add(Dropout(0.2))

model.add(Flatten())

model.add(Dense(1, activation= 'sigmoid'))

model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics= METRICS)

In [90]:
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_21 (Conv1D)          (None, 6, 32)             128       
                                                                 
 dropout_25 (Dropout)        (None, 6, 32)             0         
                                                                 
 conv1d_22 (Conv1D)          (None, 4, 64)             6208      
                                                                 
 max_pooling1d_14 (MaxPoolin  (None, 2, 64)            0         
 g1D)                                                            
                                                                 
 dropout_26 (Dropout)        (None, 2, 64)             0         
                                                                 
 dense_15 (Dense)            (None, 2, 16)             1040      
                                                     

In [91]:
metadata

Unnamed: 0,File Size (MB),Number of Pages,Word Count,Flesch Reading Ease,New Dale-Chall,Abstract Section,Summary Section,Reference Section
0,0.471933,10,11043,62.88,7.35,0,0,0
1,0.900510,15,6505,52.90,7.93,0,0,0
2,0.839485,14,16993,42.61,8.20,1,0,0
3,9.600061,25,31355,53.10,7.40,0,0,0
4,3.299664,19,66676,49.04,6.88,1,0,0
...,...,...,...,...,...,...,...,...
1695,0.491359,16,86751,55.13,6.96,1,0,1
1696,0.236475,27,38798,44.03,7.58,0,0,0
1697,4.778409,36,47039,50.57,7.02,0,0,0
1698,1.225157,17,31894,53.71,7.86,0,0,0


In [82]:
metadata = np.expand_dims(metadata, axis=2)

In [101]:
for i in range(3):
    X_train, X_test, y_train, y_test = train_test_split(metadata.iloc[:1200, :],all_df.loc[:1199, 'success_marketcap_median'])
    model.fit(X_train, y_train, epochs=3)
    model.evaluate(X_test, y_test)
    i+=1

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [102]:
y_predicted = model.predict(np.array(metadata.iloc[1200:, :]))
y_predicted = y_predicted.flatten()
y_predicted = np.where(y_predicted > 0.5, 1, 0)
y_predicted
cm = confusion_matrix(df.loc[1200:, 'success_marketcap_median'], y_predicted)
cm



array([[254,   0],
       [246,   0]])

In [103]:
print(classification_report(df.loc[1200:, 'success_marketcap_median'], y_predicted))

              precision    recall  f1-score   support

           0       0.51      1.00      0.67       254
           1       0.00      0.00      0.00       246

    accuracy                           0.51       500
   macro avg       0.25      0.50      0.34       500
weighted avg       0.26      0.51      0.34       500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
