In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
%matplotlib inline
import warnings
from collections import Counter
from imblearn.over_sampling import ADASYN
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("labelled_crypto_data.csv")

In [3]:
df.head()

Unnamed: 0,news,date,Sentiment
0,bitcoin priced usd mt goxabove gox price pm gm...,2017-08-15,Neutral
1,bitcoin price fib level dmas ok last week like...,2017-05-12,Bullish
2,national australia bank loom melbourne citizen...,2017-09-19,Neutral
3,recent report china indicate may bitcoin ban m...,2014-10-03,Bullish
4,opened st bitcoin trade position price current...,2016-01-15,Neutral


In [4]:
#label encoding
df['label'] = df['Sentiment'].map({
    'Bullish' : 0, 
    'Neutral': 1, 
    'Bearish': 2, 
})
df.sample(5)

Unnamed: 0,news,date,Sentiment,label
22210,new research found large majority u k company ...,2017-11-23,Neutral,1
4183,sberbank may household name west happens third...,2017-12-05,Bullish,0
8355,according national security human right lawyer...,2017-09-25,Neutral,1
21346,earlier today december bitcoin price achieved ...,2015-06-03,Bullish,0
35560,bitcoin altcoins trend turning bearish,2020-01-10,Neutral,1


In [5]:
df = df.drop(["date","Sentiment"],axis = 1)

In [6]:
df.head()

Unnamed: 0,news,label
0,bitcoin priced usd mt goxabove gox price pm gm...,1
1,bitcoin price fib level dmas ok last week like...,0
2,national australia bank loom melbourne citizen...,1
3,recent report china indicate may bitcoin ban m...,0
4,opened st bitcoin trade position price current...,1


### Tfidf vectorizer & SMOTE is used on all models 

In [11]:
# Check the class distribution before oversampling
print("Class distribution before oversampling:", Counter(df['label']))

X = df['news']
y = df['label']

tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(X)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(tfidf_features, y)

# Check the class distribution after oversampling
print("Class distribution after oversampling:", Counter(y_resampled))

Class distribution before oversampling: Counter({1: 22276, 0: 14586, 2: 651})
Class distribution after oversampling: Counter({1: 22276, 0: 22276, 2: 22276})


In [13]:
# Train_Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=y_resampled
)

# Decision Tree

In [14]:
# Create a Decision tree model and printing classification report
decision_tree = DecisionTreeClassifier()

decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

classification_rep = classification_report(y_test, y_pred)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.73      0.72      0.72      4455
           1       0.72      0.70      0.71      4456
           2       0.95      0.98      0.96      4455

    accuracy                           0.80     13366
   macro avg       0.80      0.80      0.80     13366
weighted avg       0.80      0.80      0.80     13366



# Random Forest 

In [15]:
# Create a random forest model and printing classification report
random_forest = RandomForestClassifier()

random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

classification_rep = classification_report(y_test, y_pred)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.74      0.91      0.82      4455
           1       0.88      0.69      0.77      4456
           2       1.00      1.00      1.00      4455

    accuracy                           0.86     13366
   macro avg       0.87      0.86      0.86     13366
weighted avg       0.87      0.86      0.86     13366



# K nearest neighbour

In [16]:
# Create a KNN classifier and printing classification report
knn_classifier = KNeighborsClassifier()

knn_classifier.fit(X_train, y_train)
y_pred = knn_classifier.predict(X_test)

classification_rep = classification_report(y_test, y_pred)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.67      0.88      0.76      4455
           1       0.92      0.34      0.49      4456
           2       0.76      1.00      0.86      4455

    accuracy                           0.74     13366
   macro avg       0.78      0.74      0.70     13366
weighted avg       0.78      0.74      0.70     13366



# AdaBoostClassifier

In [17]:
# Create a AdaBoostClassifier and printing classification report
ada_boost = AdaBoostClassifier()

ada_boost.fit(X_train, y_train)
y_pred = ada_boost.predict(X_test)

classification_rep = classification_report(y_test, y_pred)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.65      0.66      0.66      4455
           1       0.53      0.61      0.57      4456
           2       0.81      0.68      0.74      4455

    accuracy                           0.65     13366
   macro avg       0.66      0.65      0.65     13366
weighted avg       0.66      0.65      0.65     13366



# GradientBoostingClassifier

In [18]:
# Create a GradientBoostingClassifier and printing classification report
gradient = GradientBoostingClassifier()

gradient.fit(X_train, y_train)
y_pred = gradient.predict(X_test)

classification_rep = classification_report(y_test, y_pred)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.72      0.75      0.74      4455
           1       0.70      0.66      0.68      4456
           2       0.89      0.90      0.90      4455

    accuracy                           0.77     13366
   macro avg       0.77      0.77      0.77     13366
weighted avg       0.77      0.77      0.77     13366

