In [1]:
import os
print(os.getcwd())

c:\Users\aj281\OneDrive\Desktop\crypto sentiment analysis\crypto_sentiment_trading\src


In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("../data/chunked_data/tweets_part_1.csv")
print(df.shape)
print(df.head())

(50000, 13)
                                  user_name    user_location  \
0                             DeSota Wilson      Atlanta, GA   
1                                  CryptoND              NaN   
2                                 Tdlmatias  London, England   
3                      Crypto is the future              NaN   
4  Alex Kirchmaier 🇦🇹🇸🇪 #FactsSuperspreader           Europa   

                                    user_description         user_created  \
0  Biz Consultant, real estate, fintech, startups...  2009-04-26 20:05:09   
1  😎 BITCOINLIVE is a Dutch platform aimed at inf...  2019-10-17 20:12:10   
2  IM Academy : The best #forex, #SelfEducation, ...  2014-11-10 10:50:37   
3  I will post a lot of buying signals for BTC tr...  2019-09-28 16:48:12   
4  Co-founder @RENJERJerky | Forbes 30Under30 | I...  2016-02-03 13:15:55   

   user_followers  user_friends  user_favourites  user_verified  \
0          8534.0          7605             4838          False   
1     

In [4]:
print("Columns in the dataset")
print(df.columns)
columns_to_drop = [
    'user_name', 'user_location', 'user_description', 'user_created',
    'user_followers', 'user_friends', 'user_favourites', 'user_verified',
    'source'
] 
#remove the columns which are not needed for the analysis

df = df.drop(columns=columns_to_drop)
print("Columns after dropping some columns")
print(df.columns)
print("New shape of the dataset ",df.shape)

Columns in the dataset
Index(['user_name', 'user_location', 'user_description', 'user_created',
       'user_followers', 'user_friends', 'user_favourites', 'user_verified',
       'date', 'text', 'hashtags', 'source', 'is_retweet'],
      dtype='object')
Columns after dropping some columns
Index(['date', 'text', 'hashtags', 'is_retweet'], dtype='object')
New shape of the dataset  (50000, 4)


In [5]:
df = df[df['is_retweet'] == False].reset_index(drop=True)
print(df.shape)

# Deleting the duplicate tweets

(50000, 4)


In [6]:
print(df.isnull().sum())

date              0
text              0
hashtags      10571
is_retweet        0
dtype: int64


## Cleaning the data for VADER


In [7]:
import re

def preprocess_for_vader(text):
    # Remove URLs
    text = re.sub(r"http\S+|www.\S+", "", text)
    # Remove @mentions
    text = re.sub(r"@\w+", "", text)
    # Remove hashtags 
    text = re.sub(r"#", "", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [8]:
df['text'] = df['text'].apply(preprocess_for_vader)

print(df['text'].iloc[:10])

0    Blue Ridge Bank shares halted by NYSE after bi...
1    😎 Today, that's this Thursday, we will do a "🎬...
2    Guys evening, I have read this article about B...
3    $BTC A big chance in a billion! Price: \487264...
4    This network is secured by 9 508 nodes as of t...
5    💹 Trade Crypto on Binance 📌 Enjoy Cashback 10%...
6        &lt;'fire' &amp; 'man'&gt; Bitcoin Crypto BTC
7    🔄 Prices update in $EUR (1 hour): $BTC - 37082...
8    BTC Bitcoin Ethereum ETH Crypto cryptotrading ...
9    .’s bitcoin investment is revolutionary for cr...
Name: text, dtype: object


In [9]:
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Ensure reproducibility
DetectorFactory.seed = 0

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'

# Apply language detection
df['lang'] = df['text'].astype(str).apply(detect_language)

# Show all unique languages detected
unique_languages = df['lang'].unique()

print("Unique languages detected in the dataset:")
print(unique_languages)


Unique languages detected in the dataset:
['en' 'tl' 'pt' 'it' 'es' 'fr' 'hu' 'nl' 'de' 'vi' 'ca' 'no' 'so' 'sv'
 'ro' 'th' 'fi' 'sl' 'af' 'tr' 'unknown' 'cy' 'id' 'da' 'sw' 'pl' 'hr'
 'sk' 'ja' 'sq' 'ru']


In [10]:
def count_language_frequencies(df, lang_column='lang'):
    return df[lang_column].value_counts()

# Assuming df['lang'] has language codes
lang_counts = count_language_frequencies(df)

print("Language frequencies in the dataset:")
print(lang_counts)


Language frequencies in the dataset:
lang
en         49246
de           149
it            76
nl            73
unknown       66
tl            56
fr            55
ca            35
vi            34
ro            25
pt            22
so            21
cy            20
hu            17
da            15
no            12
es            11
sv            11
af            11
tr            11
fi             8
id             8
sl             5
th             2
sw             2
pl             2
hr             2
sq             2
sk             1
ja             1
ru             1
Name: count, dtype: int64


In [11]:
def keep_only_english_tweets(df, lang_column='lang'):
    return df[df[lang_column] == 'en'].copy()

# Filtered DataFrame with only English tweets
english_df = keep_only_english_tweets(df)

# check how many remain
print(f"Number of English tweets: {len(english_df)}")


Number of English tweets: 49246


In [12]:
english_df.to_csv('filtered_tweets_1.csv', index=False)


## Assigning Labels to tweets Using VADER

In [14]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia=SentimentIntensityAnalyzer()

def get_sentiment_label(text):
    # Get the sentiment scores
    sentiment_scores = sia.polarity_scores(text)
    
    # Determine the sentiment label based on the compound score
    if sentiment_scores['compound'] >= 0.05:
        return 'positive'
    elif sentiment_scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aj281\AppData\Roaming\nltk_data...


In [15]:
df['sentiment'] = df['text'].apply(get_sentiment_label)
# Count the number of tweets in each sentiment category 
sentiment_counts = df['sentiment'].value_counts()  
print("Sentiment counts:")
print(sentiment_counts)

# Save the DataFrame with sentiment labels to a CSV file
df.to_csv('tweets_with_sentiment_1.csv', index=False)

Sentiment counts:
sentiment
neutral     21604
positive    21374
negative     7022
Name: count, dtype: int64
