# Feature Engineering

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
import spacy
import lightgbm as lgb
import numpy as np
from collections import Counter

In [3]:
df = pd.read_csv("labelled_crypto_data.csv")
df.head()

Unnamed: 0,news,date,Sentiment
0,bitcoin priced usd mt goxabove gox price pm gm...,2017-08-15,Neutral
1,bitcoin price fib level dmas ok last week like...,2017-05-12,Bullish
2,national australia bank loom melbourne citizen...,2017-09-19,Neutral
3,recent report china indicate may bitcoin ban m...,2014-10-03,Bullish
4,opened st bitcoin trade position price current...,2016-01-15,Neutral


In [4]:
missing_values = df.isnull().sum()

# Display missing values count for each column
print(missing_values)

news         0
date         0
Sentiment    0
dtype: int64


In [5]:
df['label'] = df['Sentiment'].map({
    'Bullish' : 0, 
    'Neutral': 1, 
    'Bearish': 2, 
})

#checking the results 
df.head(5)

Unnamed: 0,news,date,Sentiment,label
0,bitcoin priced usd mt goxabove gox price pm gm...,2017-08-15,Neutral,1
1,bitcoin price fib level dmas ok last week like...,2017-05-12,Bullish,0
2,national australia bank loom melbourne citizen...,2017-09-19,Neutral,1
3,recent report china indicate may bitcoin ban m...,2014-10-03,Bullish,0
4,opened st bitcoin trade position price current...,2016-01-15,Neutral,1


In [6]:
#check the distribution of labels 
df['label'].value_counts()

1    22276
0    14586
2      651
Name: label, dtype: int64

# TF-IDF Vectorisation

In [7]:
# Check the class distribution before oversampling
print("Class distribution before oversampling:", Counter(df['label']))

X = df['news']
y = df['label']

tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(X)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(tfidf_features, y)

# Check the class distribution after oversampling
print("Class distribution after oversampling:", Counter(y_resampled))

Class distribution before oversampling: Counter({1: 22276, 0: 14586, 2: 651})
Class distribution after oversampling: Counter({1: 22276, 0: 22276, 2: 22276})


In [8]:
# Train_Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=y_resampled
)

In [9]:
# Create a random forest model and printing classification report
random_forest = RandomForestClassifier()

random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

classification_rep = classification_report(y_test, y_pred)
print(classification_rep)

              precision    recall  f1-score   support

           0       0.74      0.90      0.81      4455
           1       0.87      0.68      0.77      4456
           2       1.00      1.00      1.00      4455

    accuracy                           0.86     13366
   macro avg       0.87      0.86      0.86     13366
weighted avg       0.87      0.86      0.86     13366



# Spacy Word2vec

In [6]:
nlp = spacy.load("en_core_web_lg")

In [7]:
df['vector'] = df['news'].apply(lambda text: nlp(text).vector)  

In [8]:
X = np.stack(df['vector'])
y = df['label']

In [9]:
df.head()

Unnamed: 0,news,date,Sentiment,label,vector
0,bitcoin priced usd mt goxabove gox price pm gm...,2017-08-15,Neutral,1,"[0.16266258, 0.648594, -1.0985134, 0.7211839, ..."
1,bitcoin price fib level dmas ok last week like...,2017-05-12,Bullish,0,"[0.06778961, 1.2555737, -1.7561576, 0.39234045..."
2,national australia bank loom melbourne citizen...,2017-09-19,Neutral,1,"[-0.6519221, 0.14509854, -1.2072718, 0.0738156..."
3,recent report china indicate may bitcoin ban m...,2014-10-03,Bullish,0,"[-0.23611419, 0.38005897, -1.9225764, 0.769365..."
4,opened st bitcoin trade position price current...,2016-01-15,Neutral,1,"[-0.15200873, 0.6458844, -1.0060012, 0.5816615..."


In [14]:
# Check the class distribution before oversampling
print("Class distribution before oversampling:", Counter(y))

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the class distribution after oversampling
print("Class distribution after oversampling:", Counter(y_resampled))

Class distribution before oversampling: Counter({1: 22276, 0: 14586, 2: 651})
Class distribution after oversampling: Counter({1: 22276, 0: 22276, 2: 22276})


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=2022
)

In [16]:
# Using Min-max scaler
scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train)
scaled_test_embed = scaler.transform(X_test)

clf = RandomForestClassifier()
clf.fit(scaled_train_embed, y_train)

In [17]:
y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.86      0.82      4444
           1       0.85      0.75      0.79      4495
           2       0.98      1.00      0.99      4427

    accuracy                           0.87     13366
   macro avg       0.87      0.87      0.87     13366
weighted avg       0.87      0.87      0.87     13366



# Gensim

In [18]:
import gensim.downloader as api

In [20]:
wv = api.load('word2vec-google-news-300')

In [21]:
def vectorize(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        filtered_tokens.append(token.lemma_)  
    return wv.get_mean_vector(filtered_tokens)

df['vector'] = df['news'].apply(lambda text: vectorize(text))

In [22]:
X = np.stack(df['vector'])
y = df['label']

In [23]:
# Check the class distribution before oversampling
print("Class distribution before oversampling:", Counter(y))

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the class distribution after oversampling
print("Class distribution after oversampling:", Counter(y_resampled))

Class distribution before oversampling: Counter({1: 22276, 0: 14586, 2: 651})
Class distribution after oversampling: Counter({1: 22276, 0: 22276, 2: 22276})


In [24]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, 
    y_resampled, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=y_resampled
)

In [25]:
# Using Min-max scaler
scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train)
scaled_test_embed = scaler.transform(X_test)

clf = RandomForestClassifier()
clf.fit(scaled_train_embed, y_train)

y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.87      0.82      4455
           1       0.85      0.74      0.79      4456
           2       0.99      1.00      0.99      4455

    accuracy                           0.87     13366
   macro avg       0.87      0.87      0.87     13366
weighted avg       0.87      0.87      0.87     13366

