In [None]:
# ! pip install vaderSentiment fastparquet spacy textacy pandas

In [4]:
# ! python3 -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import numpy as np

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textacy import preprocessing
import spacy

from datetime import datetime
import json
import re
from multiprocessing import Pool

In [None]:
nlp = spacy.load("en_core_web_sm")

In [6]:
data = pd.read_csv("./Reddit_scripts/filtered_reddit.csv")
reddit_data = data.to_parquet("./reddit_hour_raw.parquet.gzip")
data

Unnamed: 0,score,time,title,author,url,body,upvote,num_comments
0,0,2021-01-01 00:00:08,First Time Saved/ Made Money,u/[deleted],https://www.reddit.com/r/Bitcoin/comments/ko10...,[deleted],0.0,6.0
1,0,2021-01-01 00:02:58,Btc dip to 20k?,u/randum-guy,https://www.reddit.com/r/Bitcoin/comments/ko12...,Is it possible for Bitcoin to dip to 20k? My b...,0.0,19.0
2,119,2021-01-01 00:07:47,BTC just had the monthly and yearly close! 202...,u/Mari0805,https://www.reddit.com/r/Bitcoin/comments/ko15...,Let's see what 2021 brings us. I predict 2021 ...,119.0,29.0
3,0,2021-01-01 00:10:20,I believe in Bitcoin.,u/[deleted],https://www.reddit.com/r/Bitcoin/comments/ko17...,[deleted],0.0,5.0
4,1,2021-01-01 00:12:15,Please help me find a solution with my BTC wal...,u/[deleted],https://www.reddit.com/r/Bitcoin/comments/ko18...,[deleted],1.0,28.0
...,...,...,...,...,...,...,...,...
387271,1,2024-12-31 23:42:29,BTC Noob Here Looking for a Pep Talk,u/ExplorerNo3464,https://www.reddit.com/r/Bitcoin/comments/1hqq...,Never really considered crypto as an investmen...,1.0,13.0
387272,1,2024-12-31 23:47:15,Why can't I find a consistent All Time High li...,u/broccolihead,https://www.reddit.com/r/Bitcoin/comments/1hqr...,My title sums up my question. Why can't I fin...,1.0,5.0
387273,2,2024-12-31 23:49:33,Bitcoin node,u/woreoutmachinist,https://www.reddit.com/r/Bitcoin/comments/1hqr...,So I am running a node. I have one issue. I f...,2.0,9.0
387274,1,2024-12-31 23:57:17,I need your opinion,u/SupremeLuis_,https://www.reddit.com/r/Bitcoin/comments/1hqr...,If you had to sell your Bitcoin to max out you...,1.0,6.0


In [7]:
## Preprocessing
def text_preprocessing(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'www\S+', '', text, flags=re.MULTILINE)
    text = preprocessing.normalize.unicode(text, form="NFKD")

    # Standardize quotation marks and dashes
    text = preprocessing.normalize.quotation_marks(text)
    text = preprocessing.normalize.hyphenated_words(text)

    # # Convert to ASCII (removes non-ASCII characters)
    # text = preprocessing.remove.accents(text)
    doc = nlp(text)

    tokens = [token.text for token in doc]
    return tokens

In [None]:
REDDIT_PATH = "./reddit_hour_raw.parquet.gzip"

In [27]:
''''
reddit data:
"index": int,"date": str, "title": str,"author": str,"link": str, "content": str,

twitter:
"user_name": str,"user_location": str, "date": str,"text": str ,"hashtags": str
'''

sentiment_analyzer = SentimentIntensityAnalyzer()

def parquet_to_df(file_path):
    df = pd.read_parquet(file_path, engine='pyarrow')
    return df

def df_to_parquet(df,name):
    parquet = df.to_parquet(f'{name}.parquet.gzip',compression='gzip')

def getSentiment(text):
    return sentiment_analyzer.polarity_scores(text)['compound']

def vader_sentimentscore(df,media,flag):

    if media == "reddit":
        df['body'] = df['body'].replace(np.nan, '', regex=True)
        df['title'] = df['title'].replace(np.nan, '', regex=True)

        df["text_data"] = df["title"] + df["body"]
        if flag:
          with Pool() as pool:
            df['text_data'] = pool.map(text_preprocessing, df['text_data'])

        with Pool() as pool:
            df['sentiment'] = pool.map(getSentiment, df['text_data'])

        # df['sentiment'] = df['text_data'].apply(lambda x: sentiment_analyzer.polarity_scores(x)['compound'])
        print(f"Vader Overall Sentiment score for Reddit has been calculated")
    elif media == "twitter":
        if flag:
          with Pool() as pool:
            df['text_data'] = pool.map(text_preprocessing, df['text'])

        with Pool() as pool:
            df['sentiment'] = pool.map(getSentiment, df['text'])
        print(f"Vader Overall Sentiment score for Twitter has been calculated")
    return df

def reduce_to_date(value):
    return value.date()


def groupby_hour(df):
    # Create a new DataFrame with the hour extracted from the 'time' column
    new_df = pd.DataFrame()
    new_df["date"] = pd.to_datetime(df['time']).dt.strftime("%Y-%m-%d %H:00:00")
    new_df['sentiment'] = df['sentiment']
    
    # Group by hour and calculate average and total sentiment
    result = new_df.groupby('date').agg(
        total_sentiment=('sentiment', 'sum'),
        average_sentiment=('sentiment', 'mean')
    ).reset_index()
    
    return result

def convert_to_parquet(file):

    if file == "filtered_reddits.csv":
        data_type = {"index": int,"date": str , "title": str,"author": str,"link": str, "content": str}
        df = pd.read_csv(file,dtype=data_type)

    else:
        data_type = {"user_name": str,"user_location": str, "date": str,"text": str ,"hashtags": str}
        df = pd.read_csv(file,dtype=data_type)
        df = df.drop(columns=["user_description","user_created","user_followers", "user_friends", "user_favourites","user_verified","source","is_retweet" ])
    parquet = df.to_parquet(f'./{file[:-4]}.parquet.gzip',compression='gzip')
    return

# btc_dataframe = parquet_to_df("./BTCUSDT.parquet.gzip")


In [10]:
reddit_dataframe = parquet_to_df(REDDIT_PATH)
reddit_dataframe = vader_sentimentscore(reddit_dataframe,"reddit",True)


Vader Overall Sentiment score for Reddit has been calculated


In [28]:
result_reddit_df = groupby_hour(reddit_dataframe)
df_to_parquet(result_reddit_df,"reddit_sentiment_processed")
print(result_reddit_df)

                      date  total_sentiment  average_sentiment
0      2021-01-01 00:00:00           0.2966           0.015611
1      2021-01-01 01:00:00           1.6673           0.049038
2      2021-01-01 02:00:00           0.4588           0.021848
3      2021-01-01 03:00:00          -0.4329          -0.020614
4      2021-01-01 04:00:00           0.0000           0.000000
...                    ...              ...                ...
34857  2024-12-31 19:00:00           0.0000           0.000000
34858  2024-12-31 20:00:00           0.0000           0.000000
34859  2024-12-31 21:00:00           0.0000           0.000000
34860  2024-12-31 22:00:00           0.6312           0.028691
34861  2024-12-31 23:00:00           0.0000           0.000000

[34862 rows x 3 columns]


In [26]:
result_reddit_df

Unnamed: 0,date,total_sentiment,average_sentiment
0,2021-01-01 00,0.2966,0.015611
1,2021-01-01 01,1.6673,0.049038
2,2021-01-01 02,0.4588,0.021848
3,2021-01-01 03,-0.4329,-0.020614
4,2021-01-01 04,0.0000,0.000000
...,...,...,...
34857,2024-12-31 19,0.0000,0.000000
34858,2024-12-31 20,0.0000,0.000000
34859,2024-12-31 21,0.0000,0.000000
34860,2024-12-31 22,0.6312,0.028691
