In [1]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install --upgrade scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Data Manipulation Libraries
import pandas as pd
import numpy as np

# Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.cluster import KMeans, AgglomerativeClustering

# Natural Language Processing Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob

# Deep Learning Libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import torch
import torch.nn as nn

# Additional Utilities
from scipy.stats import pearsonr, spearmanr

# Fetch NLTK's 'popular' resource collection (corpora, tokenizers, taggers);
# packages that are already present locally are simply reported as up-to-date.
nltk.download('popular')

# Apply seaborn's "whitegrid" style to the plots drawn later in the notebook
sns.set(style="whitegrid")

# Reaching this line means none of the imports above raised
print("All libraries imported successfully!")


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/datahub/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/datahub/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/datahub/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/datahub/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/datahub/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/datahub/nltk_data...
[nltk_data]    |   Package movie_reviews is already

All libraries imported successfully!


[nltk_data]    |   Package punkt is already up-to-date!
[nltk_data]    | Downloading package snowball_data to
[nltk_data]    |     /home/datahub/nltk_data...
[nltk_data]    |   Package snowball_data is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/datahub/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | 
[nltk_data]  Done downloading collection popular


In [6]:
# Directory holding the input CSV files. File names are appended to it with plain
# string concatenation, so the trailing slash is required.
base_path = '/files/Project/'

def load_crypto_data(file_path):
    """Read one cryptocurrency price CSV into a DataFrame.

    The file is parsed as semicolon-separated text with a comma as the decimal
    mark. Its first line is discarded, the following line is used as the header,
    and the first two columns are then relabelled 'DateTime' and 'Close'.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table whose first two columns are named 'DateTime' and 'Close'.
    """
    prices = pd.read_csv(file_path, delimiter=';', decimal=',', skiprows=1)
    new_names = {prices.columns[0]: 'DateTime', prices.columns[1]: 'Close'}
    return prices.rename(columns=new_names)

def load_telegram_data(file_path):
    """Read the Telegram sentiment CSV and discard its leading column.

    The file is parsed as semicolon-separated text with a comma as the decimal
    mark; whatever column comes first in the file is removed from the result.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table without its first column.
    """
    messages = pd.read_csv(file_path, delimiter=';', decimal=',')
    leading_column = messages.columns[0]
    return messages.drop(columns=[leading_column])

def combine_head_tail(df, num_rows=5):
    """Build a compact preview from the first and last rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to preview.
    num_rows : int, default 5
        Number of rows taken from each end.

    Returns
    -------
    pandas.DataFrame
        The first ``num_rows`` rows followed by the last ``num_rows`` rows. When
        the frame has at most ``2 * num_rows`` rows, a copy of the whole frame is
        returned so that no row appears twice.
    """
    # If the head and tail slices would touch or overlap, concatenating them
    # repeats rows; the complete frame is then the correct preview.
    if num_rows > 0 and len(df) <= 2 * num_rows:
        return df.copy()
    return pd.concat([df.head(num_rows), df.tail(num_rows)])

try:
    # Read every source file first, so a failed load aborts before anything is shown
    ada_df = load_crypto_data(base_path + 'ADA_Cardano.csv')
    bch_df = load_crypto_data(base_path + 'BCH_Bitcoin_cash.csv')
    btc_df = load_crypto_data(base_path + 'BTC_Bitcoin.csv')
    eth_df = load_crypto_data(base_path + 'ETH_Ethereum.csv')
    ltc_df = load_crypto_data(base_path + 'LTC_Litecoin.csv')
    xrp_df = load_crypto_data(base_path + 'XRP_Ripple.csv')
    telegram_df = load_telegram_data(base_path + 'Telegram_sentiment.csv')

    # Heading / frame pairs, in the order they are previewed:
    # the six cryptocurrencies first, then the Telegram sentiment data
    overviews = [
        ("ADA Cardano - Data overview:", ada_df),
        ("\nBCH Bitcoin Cash - Data overview:", bch_df),
        ("\nBTC Bitcoin - Data overview:", btc_df),
        ("\nETH Ethereum - Data overview:", eth_df),
        ("\nLTC Litecoin - Data overview:", ltc_df),
        ("\nXRP Ripple - Data overview:", xrp_df),
        ("\nTelegram Sentiment - Data overview:", telegram_df),
    ]
    for heading, frame in overviews:
        print(heading)
        display(combine_head_tail(frame))

except pd.errors.ParserError as e:
    print("A parsing error occurred:", e)
except FileNotFoundError as e:
    print("File not found:", e)
except Exception as e:
    print("An unexpected error occurred:", e)

ADA Cardano - Data overview:


Unnamed: 0,DateTime,Close
0,11/04/2024 13:00,0.5835
1,11/04/2024 12:00,0.5862
2,11/04/2024 11:00,0.5813
3,11/04/2024 10:00,0.5871
4,11/04/2024 09:00,0.5884
8779,11/04/2023 17:00,0.402
8780,11/04/2023 16:00,0.4003
8781,11/04/2023 15:00,0.4055
8782,11/04/2023 14:00,0.404
8783,11/04/2023 13:00,0.4084



BCH Bitcoin Cash - Data overview:


Unnamed: 0,DateTime,Close
0,11/04/2024 13:00,602.88
1,11/04/2024 12:00,609.78
2,11/04/2024 11:00,605.63
3,11/04/2024 10:00,611.0
4,11/04/2024 09:00,614.02
8289,11/04/2023 18:00,129.63
8290,11/04/2023 17:00,129.36
8291,11/04/2023 16:00,129.73
8292,11/04/2023 15:00,128.97
8293,11/04/2023 14:00,128.33



BTC Bitcoin - Data overview:


Unnamed: 0,DateTime,Close
0,11/04/2024 13:00,70729.985
1,11/04/2024 12:00,70856.45
2,11/04/2024 11:00,70000.29
3,11/04/2024 10:00,70438.005
4,11/04/2024 09:00,70597.405
8753,11/04/2023 17:00,30234.4575
8754,11/04/2023 16:00,30154.848
8755,11/04/2023 15:00,30225.8525
8756,11/04/2023 14:00,30086.3725
8757,11/04/2023 13:00,30268.725



ETH Ethereum - Data overview:


Unnamed: 0,DateTime,Close
0,11/04/2024 13:00,3556.0
1,11/04/2024 12:00,3557.6
2,11/04/2024 11:00,3522.23
3,11/04/2024 10:00,3560.9
4,11/04/2024 09:00,3578.1
8763,11/04/2023 17:00,1907.8
8764,11/04/2023 16:00,1903.45
8765,11/04/2023 15:00,1915.31
8766,11/04/2023 14:00,1911.7
8767,11/04/2023 13:00,1921.8



LTC Litecoin - Data overview:


Unnamed: 0,DateTime,Close
0,11/04/2024 13:00,95.54
1,11/04/2024 12:00,96.75
2,11/04/2024 11:00,95.96
3,11/04/2024 10:00,96.6
4,11/04/2024 09:00,96.97
8764,11/04/2023 17:00,94.92
8765,11/04/2023 16:00,94.93
8766,11/04/2023 15:00,95.3
8767,11/04/2023 14:00,94.83
8768,11/04/2023 13:00,95.25



XRP Ripple - Data overview:


Unnamed: 0,DateTime,Close
0,11/04/2024 13:00,0.60726
1,11/04/2024 12:00,0.61397
2,11/04/2024 11:00,0.60916
3,11/04/2024 10:00,0.61388
4,11/04/2024 09:00,0.61599
8763,11/04/2023 17:00,0.51457
8764,11/04/2023 16:00,0.51409
8765,11/04/2023 15:00,0.51959
8766,11/04/2023 14:00,0.51857
8767,11/04/2023 13:00,0.52324



Telegram Sentiment - Data overview:


Unnamed: 0,channel,id,text,date,views,scores,compound,sentiment_type
0,binancesignals,1382,bitcoin market cap surpasses 13 trillion,04/03/2024 16:33,6319,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,NEUTRAL
1,binancesignals,1381,update atausdt long leverage cross x10 smashed...,04/03/2024 11:48,6847,"{'neg': 0.0, 'neu': 0.751, 'pos': 0.249, 'comp...",0.5106,POSITIVE
2,binancesignals,1380,altcoins started make move bitcoin total marke...,03/03/2024 19:53,8066,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,NEUTRAL
3,binancesignals,1379,coin maticusdt timeframe 1d observation broken...,03/03/2024 17:52,7740,"{'neg': 0.129, 'neu': 0.871, 'pos': 0.0, 'comp...",-0.4767,NEGATIVE
4,binancesignals,1378,coin aptusdt timeframe 1d observation broken m...,03/03/2024 17:50,7668,"{'neg': 0.154, 'neu': 0.846, 'pos': 0.0, 'comp...",-0.4767,NEGATIVE
14707,wolfoftrading,1184,ethusdt 1hr could possibly see continuation do...,04/12/2022 19:32,8927,"{'neg': 0.159, 'neu': 0.771, 'pos': 0.07, 'com...",-0.4215,NEGATIVE
14708,wolfoftrading,1183,egldusdt 1hr reaching first target broke raisi...,04/12/2022 19:14,8597,"{'neg': 0.241, 'neu': 0.603, 'pos': 0.155, 'co...",-0.25,NEGATIVE
14709,wolfoftrading,1182,first target reached 40 x20 leverage,03/12/2022 17:01,8540,"{'neg': 0.0, 'neu': 0.781, 'pos': 0.219, 'comp...",0.1027,POSITIVE
14710,wolfoftrading,1180,egldusdt formed adam eve bullish pattern plus ...,03/12/2022 12:03,8450,"{'neg': 0.0, 'neu': 0.884, 'pos': 0.116, 'comp...",0.4404,POSITIVE
14711,wolfoftrading,1179,hello wolf first december week went profitable...,02/12/2022 14:16,8310,"{'neg': 0.0, 'neu': 0.674, 'pos': 0.326, 'comp...",0.4404,POSITIVE


In [7]:
#Step 1: Setup and Data Preprocessing

In [8]:
!pip install transformers torch nltk pandas numpy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig
import torch
from torch.utils.data import Dataset, DataLoader

# Fetch the NLTK resources needed for text cleaning: the stop-word lists and
# the WordNet data that WordNetLemmatizer relies on
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/datahub/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/datahub/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
#Step 2: Data Cleaning

In [12]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def clean_text(df, text_field):
    """Normalise the free-text column ``text_field`` of ``df`` in place.

    Steps, in order:
      1. missing values become empty strings and every entry is cast to ``str``;
      2. the text is lower-cased;
      3. whitespace-separated tokens found in NLTK's English stop-word list are
         dropped and the remaining tokens are lemmatized with ``WordNetLemmatizer``;
      4. every character that is neither a word character nor whitespace is removed.

    Stop-word filtering runs before punctuation is stripped, so a token that
    still carries punctuation (for example ``"the,"``) is not treated as a stop word.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    text_field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned linearly for every single token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def _filter_and_lemmatize(sentence):
        # Drop stop words, lemmatize what is left, and rebuild the sentence
        kept = (lemmatizer.lemmatize(token) for token in sentence.split() if token not in stop_words)
        return ' '.join(kept)

    # Convert all entries to string (NaN -> ''), then lowercase
    cleaned = df[text_field].fillna('').astype(str).str.lower()
    cleaned = cleaned.apply(_filter_and_lemmatize)

    # Raw string literal: in a plain literal '\w' and '\s' are invalid escape
    # sequences, which recent Python versions warn about.
    df[text_field] = cleaned.str.replace(r'[^\w\s]', '', regex=True)

    return df

# Clean the Telegram messages. clean_text rewrites the 'text' column of the frame it
# receives and returns that same frame, so the name stays bound to the cleaned data.
telegram_df = clean_text(telegram_df, 'text')  # Using the actual name of the text column


In [13]:
#Step 3: Sentiment Analysis and Aspect Extraction

In [15]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [16]:
# Load the pre-trained sentiment checkpoint and its matching tokenizer.
# NOTE(review): the classifier's output indices are later mapped as
# 0 = NEGATIVE, 1 = NEUTRAL, 2 = POSITIVE — confirm against the model card.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [17]:
def sentiment_analysis(text):
    """Classify one piece of text with the notebook-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str
        Text to classify. Inputs longer than 512 tokens are truncated.

    Returns
    -------
    int
        Index of the highest-scoring class in the model's output logits.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Inference only: with autograd disabled no gradient graph is built, which
    # saves memory and time on every call (this function is applied row by row).
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits.argmax(-1).item()

In [18]:
# Score every message individually; the model runs once per row, so this is slow on large frames
telegram_df['sentiment'] = telegram_df['text'].apply(sentiment_analysis)

In [19]:
# Plain-text preview of the first rows, which now include the 'sentiment' column
print(telegram_df.head())

          channel    id                                               text  \
0  binancesignals  1382           bitcoin market cap surpasses 13 trillion   
1  binancesignals  1381  update atausdt long leverage cross x10 smashed...   
2  binancesignals  1380  altcoins started make move bitcoin total marke...   
3  binancesignals  1379  coin maticusdt timeframe 1d observation broken...   
4  binancesignals  1378  coin aptusdt timeframe 1d observation broken m...   

               date  views                                             scores  \
0  04/03/2024 16:33   6319  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...   
1  04/03/2024 11:48   6847  {'neg': 0.0, 'neu': 0.751, 'pos': 0.249, 'comp...   
2  03/03/2024 19:53   8066  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...   
3  03/03/2024 17:52   7740  {'neg': 0.129, 'neu': 0.871, 'pos': 0.0, 'comp...   
4  03/03/2024 17:50   7668  {'neg': 0.154, 'neu': 0.846, 'pos': 0.0, 'comp...   

   compound sentiment_type  sentiment  
0   

In [25]:
# We define a dictionary to map the numeric class indices to textual labels
sentiment_labels = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}

# Next, we use 'map' with this dictionary to create the new column;
# any 'sentiment' value that is not a key of the dictionary becomes NaN
telegram_df['sentiment_label'] = telegram_df['sentiment'].map(sentiment_labels)

print(telegram_df.head())

# Save the labelled frame, without its index, to the current working directory
telegram_df.to_csv("sentiment_analysis_with_labels.csv", index=False)


          channel    id                                               text  \
0  binancesignals  1382           bitcoin market cap surpasses 13 trillion   
1  binancesignals  1381  update atausdt long leverage cross x10 smashed...   
2  binancesignals  1380  altcoins started make move bitcoin total marke...   
3  binancesignals  1379  coin maticusdt timeframe 1d observation broken...   
4  binancesignals  1378  coin aptusdt timeframe 1d observation broken m...   

               date  views                                             scores  \
0  04/03/2024 16:33   6319  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...   
1  04/03/2024 11:48   6847  {'neg': 0.0, 'neu': 0.751, 'pos': 0.249, 'comp...   
2  03/03/2024 19:53   8066  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...   
3  03/03/2024 17:52   7740  {'neg': 0.129, 'neu': 0.871, 'pos': 0.0, 'comp...   
4  03/03/2024 17:50   7668  {'neg': 0.154, 'neu': 0.846, 'pos': 0.0, 'comp...   

   compound sentiment_type  sentiment  misma

In [26]:
# Create a 'mismatch' column that is True when there is a discrepancy between the two columns
# ('ne' is an element-wise "not equal"; a missing label on either side counts as a mismatch)
telegram_df['mismatch'] = telegram_df['sentiment_type'].ne(telegram_df['sentiment_label'])

# Count the total number of mismatches (True values sum as 1)
mismatches = telegram_df['mismatch'].sum()

# Calculate the percentage of mismatches
# NOTE: total_entries is also read by the later "major errors" cell
total_entries = len(telegram_df)
percentage_mismatches = (mismatches / total_entries) * 100

# Display the percentage of mismatches
print(f"The percentage of mismatches is {percentage_mismatches:.2f}%.")



The percentage of mismatches is 58.64%.


In [27]:
# A "major error" is a polarity flip: one label says POSITIVE while the other says NEGATIVE
existing_label = telegram_df['sentiment_type']
model_label = telegram_df['sentiment_label']

positive_flipped = (existing_label == "POSITIVE") & (model_label == "NEGATIVE")
negative_flipped = (existing_label == "NEGATIVE") & (model_label == "POSITIVE")

# Keep only the rows where either kind of flip occurred
major_errors_df = telegram_df[positive_flipped | negative_flipped]

# Count the flips and express them as a share of all entries
# (total_entries comes from the mismatch cell above)
number_major_errors = len(major_errors_df)
percentage_major_errors = (number_major_errors / total_entries) * 100

# Report the share of "major errors"
print(f"The percentage of 'major errors' is {percentage_major_errors:.2f}%.")


The percentage of 'major errors' is 1.78%.
