In [17]:
import requests
import pandas as pd
from scipy.stats import pearsonr

In [10]:
# Load the labelled sentiment data set
df = pd.read_csv('bitcoin_sentiments_21_24_cleaned_benchmarked.csv')

# Keep only the calendar date (drop any time-of-day component)
df['Date'] = pd.to_datetime(df['Date']).dt.date

# Column holding the sentiment label codes; change this to your column name
column_name = 'Sentiment_Category'

# Label code -> name of the per-day count column.
# Coding used throughout this notebook: 0 = positive, 1 = negative, 2 = neutral.
sentiment_count_columns = {0: 'num_positive', 1: 'num_negative', 2: 'num_neutral'}

# Fail loudly on label codes we do not know how to name, instead of
# silently dropping or mislabelling them.
unexpected_codes = set(df[column_name].dropna().unique()) - set(sentiment_count_columns)
if unexpected_codes:
    raise ValueError(f"Unexpected sentiment codes in {column_name!r}: {sorted(unexpected_codes)}")

# Print total counts for each label
print("\nTotal counts for each label:")
print(df[column_name].value_counts())

# Count labels per date, one column per label code.
# - reindex guarantees all three columns exist (filled with 0) even if a label
#   never occurs, and fixes their order explicitly;
# - rename maps each code to its name by key rather than by position;
# - rename_axis drops the leftover column-index name so the printed frame
#   has plain column headers.
daily_counts = (
    df.groupby('Date')[column_name]
    .value_counts()
    .unstack(fill_value=0)
    .reindex(columns=list(sentiment_count_columns), fill_value=0)
    .rename(columns=sentiment_count_columns)
    .rename_axis(None, axis='columns')
    .reset_index()  # make Date a regular column
)

# Display first few rows
print("\nDaily counts:")
print(daily_counts.head())


Total counts for each label:
2    4292
0    3873
1    2840
Name: Sentiment_Category, dtype: int64

Daily counts:
         Date  num_positive  num_negative  num_neutral
0  2021-11-05             2             0            3
1  2021-11-06             0             0            1
2  2021-11-08             0             1            3
3  2021-11-09             1             0            3
4  2021-11-10             1             2            1


In [14]:
def aggregate_all_sentiments(column_name, csv_path='bitcoin_sentiments_21_24_cleaned_benchmarked.csv'):
    """
    Load the sentiment CSV and aggregate the label column into per-date metrics.

    Label codes are interpreted as 0 = positive, 1 = negative, 2 = neutral.

    Parameters:
    column_name (str): Name of the column holding the sentiment label codes
    csv_path (str): Path of the CSV file to read. It must contain a 'Date'
        column and the column named by `column_name`. Defaults to the
        benchmarked sentiment file used in this notebook.

    Returns:
    DataFrame: One row per calendar date with the columns
        'Date', 'majority_sentiment' (most frequent code that day),
        'percent_positive', 'percent_negative', 'percent_neutral'
        (share of rows with code 0 / 1 / 2, in percent) and
        'sentiment_score' (number of positive rows minus number of negative rows).
    """
    # Import the CSV file
    df = pd.read_csv(csv_path)

    # Convert Date column to date only (removing time)
    df['Date'] = pd.to_datetime(df['Date']).dt.date

    # Named aggregation: each keyword becomes an output column directly, so no
    # MultiIndex flattening is needed afterwards.
    daily_metrics = df.groupby('Date')[column_name].agg(
        # Most frequent code; mode() can return several values on a tie, [0] takes the first
        majority_sentiment=lambda x: x.mode()[0],
        # Percentage of positive (0)
        percent_positive=lambda x: (x == 0).mean() * 100,
        # Percentage of negative (1)
        percent_negative=lambda x: (x == 1).mean() * 100,
        # Percentage of neutral (2)
        percent_neutral=lambda x: (x == 2).mean() * 100,
        # +1 per positive row, -1 per negative row, neutral rows contribute 0
        sentiment_score=lambda x: (x == 0).sum() - (x == 1).sum(),
    )

    # Make Date a regular column
    return daily_metrics.reset_index()

# Build the per-date sentiment metrics table (one row per date) from the
# 'Sentiment_Category' label column
daily_sentiment_metrics = aggregate_all_sentiments('Sentiment_Category')

In [15]:
# Show the per-date metrics table as the cell output
daily_sentiment_metrics

Unnamed: 0,Date,majority_sentiment,percent_positive,percent_negative,percent_neutral,sentiment_score
0,2021-11-05,2,40.000000,0.000000,60.000000,2
1,2021-11-06,2,0.000000,0.000000,100.000000,0
2,2021-11-08,2,0.000000,25.000000,75.000000,-1
3,2021-11-09,2,25.000000,0.000000,75.000000,1
4,2021-11-10,1,25.000000,50.000000,25.000000,-1
...,...,...,...,...,...,...
881,2024-09-08,1,33.333333,44.444444,22.222222,-1
882,2024-09-09,0,50.000000,20.833333,29.166667,7
883,2024-09-10,0,72.727273,9.090909,18.181818,14
884,2024-09-11,1,22.222222,62.962963,14.814815,-11


In [43]:
# Load Bitcoin prices
bitcoin_prices = pd.read_csv('bitcoin_data.csv')

# Parse the dates and put the rows in chronological order. The forward returns
# below use shift(-N), which only means "N rows into the future" when the rows
# are sorted by ascending date; the CSV's own ordering is not relied upon.
bitcoin_prices['Date'] = pd.to_datetime(bitcoin_prices['Date'])
bitcoin_prices = bitcoin_prices.sort_values('Date', kind='stable').reset_index(drop=True)

# Convert the date column to date instead of datetime
bitcoin_prices['Date'] = bitcoin_prices['Date'].dt.date

# Forward simple returns: price N rows ahead relative to the current price.
# NOTE(review): the offsets are row counts, so they equal calendar days only if
# the file has exactly one row per day with no gaps - confirm against the data.
return_horizons = {
    'Return_D1': 1,
    'Return_W1': 7,
    'Return_M1': 30,
    'Return_Q1': 90,
    'Return_Y1': 365,
}
for return_column, rows_ahead in return_horizons.items():
    # The last `rows_ahead` rows have no future price and therefore stay NaN
    bitcoin_prices[return_column] = (
        bitcoin_prices['Price'].shift(-rows_ahead) / bitcoin_prices['Price'] - 1
    )

In [44]:
# Classify each forward return by its sign: 1 = up, -1 = down, 0 = unchanged.
return_columns = ['Return_D1', 'Return_W1', 'Return_M1', 'Return_Q1', 'Return_Y1']
forward_returns = bitcoin_prices[return_columns]

# Vectorised sign computation (replaces the deprecated element-wise applymap).
# Rows whose forward return is missing (no future price available) are kept as
# NaN rather than being reported as 0 ("unchanged"); because of those NaNs the
# resulting columns are floats.
bitcoin_returns_classified = (
    forward_returns.gt(0).astype(int) - forward_returns.lt(0).astype(int)
).where(forward_returns.notna())

bitcoin_returns_classified["Date"] = bitcoin_prices["Date"]

In [48]:
def analyze_bitcoin_sentiment_correlation(dates, sentiment_scores, prices=None):
    """
    Performs Pearson correlation analysis between a daily sentiment metric and Bitcoin returns.

    The sentiment values are joined to the price table on calendar date (inner
    join), then correlated with each forward-return column separately, using
    only the rows where that return is available. The results are printed and
    also returned.

    Parameters:
    - dates (sequence): Dates as 'YYYY-MM-DD' strings, date or datetime objects
      (anything pandas.to_datetime accepts). Only the calendar date is used.
    - sentiment_scores (sequence of numbers): Metric values, paired with `dates`
      by position.
    - prices (DataFrame, optional): Price table with a 'Date' column and the
      columns 'Return_D1', 'Return_W1', 'Return_M1', 'Return_Q1', 'Return_Y1'.
      Defaults to the notebook-level `bitcoin_prices` DataFrame.

    Returns:
    - dict: Maps each return column name to its Pearson correlation coefficient
      (float), or to None when fewer than two paired observations exist.
      Returns None (after printing a message) if the inputs differ in length or
      no dates overlap with the price table.
    """
    if len(dates) != len(sentiment_scores):
        print("Error: The number of dates and sentiment scores must be equal.")
        return None

    if prices is None:
        # Fall back to the price table prepared earlier in the notebook
        prices = bitcoin_prices

    # Normalise both sides of the join to datetime.date so that string dates,
    # Timestamps and date objects all match each other.
    sentiment_df = pd.DataFrame({
        'Date': pd.to_datetime(pd.Series(list(dates), dtype=object)).dt.date,
        'Sentiment': list(sentiment_scores)
    })
    # assign() returns a copy, so the caller's price table is not modified
    prices = prices.assign(Date=pd.to_datetime(prices['Date']).dt.date)

    # Merge with the price table on Date
    merged_df = pd.merge(sentiment_df, prices, on='Date', how='inner')

    if merged_df.empty:
        print("No overlapping data between sentiment and bitcoin prices.")
        return None

    return_periods = {
        'Return_D1': 'Next Day',
        'Return_W1': 'Next Week',
        'Return_M1': 'Next Month',
        'Return_Q1': 'Next Quarter',
        'Return_Y1': 'Next Year'
    }

    # Compute Pearson correlation for each return period, dropping NaNs per
    # period rather than globally so short horizons keep all their rows.
    correlations = {}
    for period in return_periods:
        df_period = merged_df[['Sentiment', period]].dropna()
        # pearsonr needs at least two observations; with fewer it raises
        if len(df_period) >= 2:
            corr, _ = pearsonr(df_period['Sentiment'], df_period[period])
            correlations[period] = float(corr)
        else:
            correlations[period] = None

    # Print the results
    print("Correlation Results:")
    for period, label in return_periods.items():
        corr_value = correlations[period]
        if corr_value is not None:
            print(f"{label}: {corr_value:.4f}")
        else:
            print(f"{label}: No data")

    return correlations
    
# Example usage:
# dates = ["2021-05-11", "2021-05-12", ...]
# sentiment_scores = [1, -1, ...]
# analyze_bitcoin_sentiment_correlation(dates, sentiment_scores)

In [49]:
# Correlate the daily percentage of positive-labelled rows with the forward Bitcoin returns
correlation_results = analyze_bitcoin_sentiment_correlation(daily_sentiment_metrics['Date'], daily_sentiment_metrics['percent_positive'])

Correlation Results:
Next Day: 0.0311
Next Week: 0.0551
Next Month: 0.1485
Next Quarter: 0.1365
Next Year: 0.2197


In [31]:
# Correlate the daily percentage of negative-labelled rows with the forward Bitcoin returns
correlation_results = analyze_bitcoin_sentiment_correlation(daily_sentiment_metrics['Date'], daily_sentiment_metrics['percent_negative'])

Correlation Results:
Next Day: -0.0068
Next Week: 0.0545
Next Month: 0.0502
Next Quarter: 0.0860
Next Year: 0.0882


In [45]:
# Correlate the daily net sentiment score (positive count minus negative count) with the forward Bitcoin returns
correlation_results = analyze_bitcoin_sentiment_correlation(daily_sentiment_metrics['Date'], daily_sentiment_metrics['sentiment_score'])

Correlation Results:
Next Day: -0.0379
Next Week: -0.0143
Next Month: 0.0309
Next Quarter: 0.0511
Next Year: 0.0872


In [46]:
# Correlate the daily majority label code with the forward Bitcoin returns.
# NOTE(review): majority_sentiment is a category code (0 = positive, 1 = negative,
# 2 = neutral), not an ordered scale - confirm a Pearson correlation on it is meaningful.
correlation_results = analyze_bitcoin_sentiment_correlation(daily_sentiment_metrics['Date'], daily_sentiment_metrics['majority_sentiment'])

Correlation Results:
Next Day: 0.0090
Next Week: -0.0253
Next Month: -0.0991
Next Quarter: -0.1675
Next Year: -0.1863
