# Feature Engineering

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import lightgbm as lgb
import numpy as np

In [9]:
df = pd.read_csv("labelled_crypto_data.csv")
df.head()

Unnamed: 0,news,date,Sentiment
0,bitcoin priced usd mt goxabove gox price pm gm...,2017-08-15,Neutral
1,bitcoin price fib level dmas ok last week like...,2017-05-12,Bullish
2,national australia bank loom melbourne citizen...,2017-09-19,Neutral
3,recent report china indicate may bitcoin ban m...,2014-10-03,Bullish
4,opened st bitcoin trade position price current...,2016-01-15,Neutral


In [10]:
# Assuming you have a DataFrame named df
missing_values = df.isnull().sum()

# Display missing values count for each column
print(missing_values)

news         0
date         0
Sentiment    0
dtype: int64


In [11]:
df['label'] = df['Sentiment'].map({
    'Bullish' : 0, 
    'Neutral': 1, 
    'Bearish': 2, 
})

#checking the results 
df.head(5)

Unnamed: 0,news,date,Sentiment,label
0,bitcoin priced usd mt goxabove gox price pm gm...,2017-08-15,Neutral,1
1,bitcoin price fib level dmas ok last week like...,2017-05-12,Bullish,0
2,national australia bank loom melbourne citizen...,2017-09-19,Neutral,1
3,recent report china indicate may bitcoin ban m...,2014-10-03,Bullish,0
4,opened st bitcoin trade position price current...,2016-01-15,Neutral,1


In [12]:
#check the distribution of labels 
df['label'].value_counts()

1    22276
0    14586
2      651
Name: label, dtype: int64

<h3>Train test split</h3>

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    df.news, 
    df.label, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.label
)

In [14]:
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (30010,)
Shape of X_test:  (7503,)


# TF-IDF Vectorisation

In [15]:
#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('KNN', KNeighborsClassifier())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.67      0.66      2917
           1       0.77      0.77      0.77      4456
           2       0.47      0.14      0.21       130

    accuracy                           0.72      7503
   macro avg       0.63      0.53      0.55      7503
weighted avg       0.72      0.72      0.72      7503



# Spacy Word2vec

In [16]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [17]:
#This will take some time(nearly 15 minutes)
df['vector'] = df['news'].apply(lambda text: nlp(text).vector)  

In [18]:
df.head()

Unnamed: 0,news,date,Sentiment,label,vector
0,bitcoin priced usd mt goxabove gox price pm gm...,2017-08-15,Neutral,1,"[0.16266258, 0.648594, -1.0985134, 0.7211839, ..."
1,bitcoin price fib level dmas ok last week like...,2017-05-12,Bullish,0,"[0.06778961, 1.2555737, -1.7561576, 0.39234045..."
2,national australia bank loom melbourne citizen...,2017-09-19,Neutral,1,"[-0.6519221, 0.14509854, -1.2072718, 0.0738156..."
3,recent report china indicate may bitcoin ban m...,2014-10-03,Bullish,0,"[-0.23611419, 0.38005897, -1.9225764, 0.769365..."
4,opened st bitcoin trade position price current...,2016-01-15,Neutral,1,"[-0.15200873, 0.6458844, -1.0060012, 0.5816615..."


In [19]:

X_train, X_test, y_train, y_test = train_test_split(
    df.vector.values,
    df.label,
    test_size=0.2,
    random_state=2022
)

In [20]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)


clf = KNeighborsClassifier()
clf.fit(scaled_train_embed, y_train)

In [22]:
y_pred = clf.predict(scaled_test_embed)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.68      0.63      2917
           1       0.75      0.70      0.73      4452
           2       0.67      0.09      0.16       134

    accuracy                           0.68      7503
   macro avg       0.67      0.49      0.50      7503
weighted avg       0.69      0.68      0.68      7503



# Gensim

In [23]:
import gensim.downloader as api

In [24]:
wv = api.load('word2vec-google-news-300')

In [None]:
def vectorize(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        filtered_tokens.append(token.lemma_)  
    return wv.get_mean_vector(filtered_tokens)

df['vector'] = df['news'].apply(lambda text: vectorize(text))

In [None]:
df.head()

In [None]:

#Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test = train_test_split(
    df.vector.values, 
    df.label, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.label
)

In [None]:
print("Shape of X_train before reshaping: ", X_train.shape)
print("Shape of X_test before reshaping: ", X_test.shape)


X_train_2d = np.stack(X_train)
X_test_2d =  np.stack(X_test)

print("Shape of X_train after reshaping: ", X_train_2d.shape)
print("Shape of X_test after reshaping: ", X_test_2d.shape)

In [None]:
#1. creating a GradientBoosting model object
clf = KNeighborsClassifier()

#2. fit with all_train_embeddings and y_train
clf.fit(X_train_2d, y_train)


#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = clf.predict(X_test_2d)


#4. print the classfication report
print(classification_report(y_test, y_pred))