In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
import warnings
import spacy
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from collections import Counter
from sklearn.preprocessing import Normalizer
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("labelled_crypto_data.csv")
#label encoding
df['label'] = df['Sentiment'].map({
    'Bullish' : 0, 
    'Neutral': 1, 
    'Bearish': 2, 
})
df = df.drop(["date","Sentiment"],axis = 1)
df.head()

Unnamed: 0,news,label
0,bitcoin priced usd mt goxabove gox price pm gm...,1
1,bitcoin price fib level dmas ok last week like...,0
2,national australia bank loom melbourne citizen...,1
3,recent report china indicate may bitcoin ban m...,0
4,opened st bitcoin trade position price current...,1


In [3]:
# Vectorisation using Word2Vec
nlp = spacy.load("en_core_web_lg")
df['vector'] = df['news'].apply(lambda text: nlp(text).vector)

In [5]:
df.head()
df.to_csv("vectorised_output.csv",index = False)

In [6]:
# Oversampling using smote
X = np.stack(df['vector'])
y = df['label']
# Check the class distribution before oversampling
print("Class distribution before oversampling:", Counter(df["label"]))

smote = SMOTE(random_state=42)
X_re,Y_re = smote.fit_resample(X, y)

# Check the class distribution after oversampling
print("Class distribution after oversampling:", Counter(Y_re))

Class distribution before oversampling: Counter({1: 22276, 0: 14586, 2: 651})
Class distribution after oversampling: Counter({1: 22276, 0: 22276, 2: 22276})


In [7]:
# Hyperparameter tuning and using cross validation
model = RandomForestClassifier()
params = {
    'n_estimators': [10, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],  
}
grid_search = GridSearchCV(model, params, cv=5, return_train_score=False,n_jobs=-1)
grid_search.fit(X_re, Y_re)
print('params',grid_search.best_params_)
print('score',grid_search.best_score_)

params {'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}
score 0.8365975257364686


In [8]:
# saving the best model to pickle
import pickle
model = grid_search.best_estimator_
model_filename = 'crypto_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)