<a href="https://colab.research.google.com/github/caruanajoey/RedditMemeCoinPredictorProject/blob/main/ECON420FinalProjectRedditMemeCoinPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Final Project For ECON420 Machine Learning by Giuseppe Caruana 261115024

# Installs

In [None]:
!pip install praw transformers torch tensorflow scikit-learn python-dotenv pycoingecko requests pandas numpy matplotlib seaborn langdetect ta


# Imports

In [None]:
import os
import re
import time
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from langdetect import detect

# For Reddit
import praw

# For CoinGecko Market Data
from pycoingecko import CoinGeckoAPI

# For Sentiment Analysis
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ML + Deep Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Technical Indicators
import ta

# Disable unnecessary logging
import logging
logging.basicConfig(level=logging.ERROR)


# Dictionary for CoinGeckoID

In [None]:
# Supported meme-coin ticker symbols mapped to the coin id passed to the
# CoinGecko client when market data is requested.
TICKER_MAP = dict(
    DOGE="dogecoin",
    SHIB="shiba-inu",
    FLOKI="floki",
    ELON="dogelon-mars",
    PEPE="pepe",
    BONK="bonk",
    WIF="dogwifcoin",
    PENGU="pudgy-penguins",
)



# User Input for Coin Choice

In [None]:
def get_user_ticker_choice():
    """Ask the user for a ticker symbol and return it if it is supported.

    The supported symbols (the keys of ``TICKER_MAP``) are listed first. The
    answer is stripped and upper-cased before it is looked up.

    Returns:
        The normalised ticker symbol, guaranteed to be a key of ``TICKER_MAP``.

    Raises:
        SystemExit: if the entered symbol is not a key of ``TICKER_MAP``.
    """
    print("Available Tickers:")
    for symbol in TICKER_MAP:
        print(f"- {symbol}")

    answer = input("\nEnter the meme coin ticker (e.g., DOGE, SHIB): ")
    coin_ticker = answer.strip().upper()

    # Happy path first; anything else stops the run.
    if coin_ticker in TICKER_MAP:
        return coin_ticker

    print(f"\nTicker '{coin_ticker}' not found in TICKER_MAP. Please update or try another.")
    raise SystemExit



# Reddit API info

In [None]:
# Reddit API credentials are read from the environment instead of being
# committed with the notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET
# (and optionally REDDIT_USER_AGENT) before running; the id/secret are None
# when the variable is not set.
# NOTE(review): the client id and secret that used to be hard-coded here were
# published with this notebook and should be treated as compromised - rotate
# them in the Reddit app settings.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET")
REDDIT_USER_AGENT = os.environ.get("REDDIT_USER_AGENT", "JoeysApp/0.0.1")

def get_reddit_client(client_id, client_secret, user_agent):
    """Create and return a ``praw.Reddit`` instance for the given credentials.

    Args:
        client_id: value forwarded as PRAW's ``client_id``.
        client_secret: value forwarded as PRAW's ``client_secret``.
        user_agent: value forwarded as PRAW's ``user_agent``.
    """
    credentials = {
        "client_id": client_id,
        "client_secret": client_secret,
        "user_agent": user_agent,
    }
    return praw.Reddit(**credentials)


#Fetching Data

Reddit data

In [None]:
def fetch_comments_for_submission(submission, max_comments=20):
    """Join the bodies of the first ``max_comments`` comments of a submission.

    Only the objects yielded by iterating ``submission.comments`` are used;
    replies nested under them are not walked here.

    Args:
        submission: object exposing ``comments`` (iterable, with a
            ``replace_more`` method) whose items have a ``body`` attribute.
        max_comments: upper bound on how many comment bodies are collected.

    Returns:
        The collected bodies joined by single spaces ("" when there are none).
    """
    # Called with limit=0 exactly as before, so no extra comment pages are
    # requested before iterating.
    submission.comments.replace_more(limit=0)

    # zip() against a bounded range stops after max_comments items.
    bodies = [
        comment.body
        for _, comment in zip(range(max_comments), submission.comments)
    ]
    return " ".join(bodies)

def fetch_reddit_data_enhanced(
    subreddit_name="memecoins",
    total_posts=200,
    categories=("hot", "new", "top"),
    max_comments=20,
    reddit_client=None,
):
    """Collect submissions, with some of their comments, from a subreddit.

    For every listing name in ``categories`` up to ``total_posts`` submissions
    are fetched in batches of at most 100, using the fullname of the last
    submission seen as the ``after`` pagination pointer. The same submission
    can therefore appear once per category it is listed in.

    Args:
        subreddit_name: subreddit to read from.
        total_posts: maximum number of submissions fetched *per category*.
        categories: names of listing methods on the subreddit object
            (looked up with ``getattr``).
        max_comments: forwarded to ``fetch_comments_for_submission``.
        reddit_client: PRAW client used for all requests (required).

    Returns:
        DataFrame with columns ``timestamp`` (naive UTC datetime), ``title``,
        ``text`` (selftext + "\\n" + joined comments), ``upvotes``,
        ``num_comments`` and ``category``. The columns are present even when
        no submission could be fetched.

    Raises:
        ValueError: if ``reddit_client`` is None.
    """
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    if reddit_client is None:
        raise ValueError("A valid PRAW Reddit client must be provided.")

    columns = ["timestamp", "title", "text", "upvotes", "num_comments", "category"]
    all_posts = []
    subreddit = reddit_client.subreddit(subreddit_name)

    for category in categories:
        print(f"\n--- Fetching category='{category}' from r/{subreddit_name} with pagination ---")
        fetched_posts = 0
        after_fullname = None

        while fetched_posts < total_posts:
            batch_limit = min(100, total_posts - fetched_posts)
            params = {"after": after_fullname} if after_fullname else {}

            listing = getattr(subreddit, category)(limit=batch_limit, params=params)

            count_this_batch = 0
            try:
                for submission in listing:
                    # Combine post text + up to 'max_comments' from the submission
                    post_text = submission.selftext or ""
                    comments_text = fetch_comments_for_submission(submission, max_comments=max_comments)
                    combined_text = post_text + "\n" + comments_text

                    # Same naive-UTC value utcfromtimestamp() produced, without
                    # relying on that deprecated constructor.
                    created = datetime.fromtimestamp(
                        submission.created_utc, tz=timezone.utc
                    ).replace(tzinfo=None)

                    all_posts.append({
                        "timestamp": created,
                        "title": submission.title,
                        "text": combined_text,
                        "upvotes": submission.score,
                        "num_comments": submission.num_comments,
                        "category": category,
                    })

                    after_fullname = submission.fullname  # pagination pointer
                    fetched_posts += 1
                    count_this_batch += 1

                    if fetched_posts >= total_posts:
                        break
            except Exception as e:
                # Best effort: keep what was collected so far and move on to
                # the next category.
                print(f"Error fetching {category} posts: {e}")
                break

            if count_this_batch == 0:
                # No more posts returned, break
                break

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(all_posts, columns=columns)



CoinGecko data

In [None]:
def fetch_coingecko_data(coin_id, vs_currency="usd", days=7, interval="hourly"):
    """Download OHLC candles and volumes for a coin and join them on time.

    Args:
        coin_id: CoinGecko coin id (e.g. a value of ``TICKER_MAP``).
        vs_currency: quote currency passed to both API calls.
        days: look-back window passed to both API calls.
        interval: accepted for compatibility; it is not referenced in the
            body, so the candle spacing is whatever the API returns.

    Returns:
        DataFrame with ``timestamp``, ``open``, ``high``, ``low``, ``close``
        and ``volume`` columns, or an empty DataFrame if anything in the
        download/merge step raises (the error is printed).
    """
    client = CoinGeckoAPI()
    try:
        # Price candles
        candles = client.get_coin_ohlc_by_id(id=coin_id, vs_currency=vs_currency, days=days)
        ohlc_frame = pd.DataFrame(candles, columns=['timestamp', 'open', 'high', 'low', 'close'])
        ohlc_frame['timestamp'] = pd.to_datetime(ohlc_frame['timestamp'], unit='ms')

        # Traded volume comes from the market-chart endpoint
        chart = client.get_coin_market_chart_by_id(id=coin_id, vs_currency=vs_currency, days=days)
        volume_frame = pd.DataFrame(chart.get('total_volumes', []), columns=['timestamp', 'volume'])
        volume_frame['timestamp'] = pd.to_datetime(volume_frame['timestamp'], unit='ms')

        # merge_asof with its default direction: each candle takes the most
        # recent volume reading at or before its timestamp.
        return pd.merge_asof(
            ohlc_frame.sort_values('timestamp'),
            volume_frame.sort_values('timestamp'),
            on='timestamp',
        )
    except Exception as err:
        print(f"Error fetching from CoinGecko: {err}")
        return pd.DataFrame()



Cleaning up the data

In [None]:
def clean_text(text):
    """Normalise raw post text before language detection and sentiment scoring.

    Removes ``http...`` links, then every character that is not an ASCII
    letter, digit, whitespace, ``#`` or ``@``; finally lower-cases the result
    and trims surrounding whitespace.
    """
    # Order matters: links go first, while their punctuation is still intact.
    for pattern in (r'http\S+', r'[^A-Za-z0-9\s#@]'):
        text = re.sub(pattern, '', text)
    return text.lower().strip()

def is_english(text):
    """Return True when ``detect`` classifies ``text`` as English ('en').

    Any error raised by the detector is treated as "not English" so that a
    single bad row cannot abort the filtering step.
    """
    try:
        return detect(text) == 'en'
    except Exception:
        # Deliberately broad, but no longer a bare `except:` - Ctrl-C and
        # SystemExit now propagate instead of being reported as False.
        return False



#sentiment score

In [None]:
# Checkpoint used for sentiment scoring. The tokenizer and model are loaded
# once, when this cell runs, and reused for every scored text.
SENTIMENT_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_MODEL_NAME)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL_NAME)

def get_sentiment_score(text):
    """Score ``text`` with the module-level ``tokenizer`` / ``sentiment_model``.

    The score is the softmax probability of class index 1 minus that of class
    index 0, so it lies in [-1, 1].
    NOTE(review): index 1 is presumably the positive label for this
    checkpoint - confirm against ``sentiment_model.config.id2label``.

    Args:
        text: string to score; input is truncated to 256 tokens.

    Returns:
        A Python ``float``. 0.0 is returned for blank text and when scoring
        raises (the error is printed).
    """
    if not text.strip():
        return 0.0
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256, padding=True)
        with torch.no_grad():  # inference only
            logits = sentiment_model(**inputs).logits
        probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
        # Convert the numpy scalar so every code path returns a plain float.
        return float(probs[1] - probs[0])
    except Exception as e:
        print(f"Sentiment error: {e}")
        return 0.0


# Processing and Merging Data

In [None]:
def process_and_merge_data(social_df, market_df):
    """Aggregate social and market data to hourly bars and join them.

    Neither input frame is modified; the function works on copies of the
    columns it needs.

    Args:
        social_df: frame with ``timestamp``, ``sentiment_score``, ``upvotes``
            and ``num_comments`` columns (extra columns are ignored).
        market_df: frame with ``timestamp``, ``open``, ``high``, ``low``,
            ``close`` and ``volume`` columns.

    Returns:
        One row per social hour that has both a sentiment value and a market
        bar at or before it, with columns ``timestamp``,
        ``avg_sentiment_score``, ``total_upvotes``, ``total_comments``,
        ``open``, ``high``, ``low``, ``close`` and ``volume``.
    """
    # Copy only the needed columns so the caller's frames keep their
    # "timestamp" column and index (the function can be called repeatedly).
    social = social_df[["timestamp", "sentiment_score", "upvotes", "num_comments"]].copy()
    social["timestamp"] = pd.to_datetime(social["timestamp"])

    # "h" is the hourly alias; the upper-case "H" spelling is deprecated in
    # recent pandas releases.
    social_hourly = (
        social.set_index("timestamp")
        .resample("h")
        .agg({
            "sentiment_score": "mean",
            "upvotes": "sum",
            "num_comments": "sum",
        })
        .reset_index()
        .rename(columns={
            "sentiment_score": "avg_sentiment_score",
            "upvotes": "total_upvotes",
            "num_comments": "total_comments",
        })
    )

    market = market_df[["timestamp", "open", "high", "low", "close", "volume"]].copy()
    market["timestamp"] = pd.to_datetime(market["timestamp"])

    # Hours without any candle have NaN prices and are removed by dropna().
    market_hourly = (
        market.sort_values("timestamp")
        .set_index("timestamp")
        .resample("h")
        .agg({
            "open": "first",
            "high": "max",
            "low": "min",
            "close": "last",
            "volume": "sum",
        })
        .dropna()
        .reset_index()
    )

    # Default merge_asof direction: each social hour takes the latest market
    # bar at or before it.
    merged_df = pd.merge_asof(
        social_hourly.sort_values("timestamp"),
        market_hourly.sort_values("timestamp"),
        on="timestamp",
    )
    # Drops hours with no posts (NaN mean sentiment) or no earlier market bar.
    return merged_df.dropna()




extra features

In [None]:
def features(merged_df):
    """Add model features and the prediction target to ``merged_df`` in place.

    Columns added: ``lagged_sentiment`` (previous row's average sentiment),
    ``close_sma_3`` (3-row rolling mean of close), ``rsi`` (14-period RSI from
    the ``ta`` package), ``hour_of_day``, ``day_of_week`` and ``future_close``
    (the next remaining row's close, used as the target).

    Rows containing NaN are dropped twice: once after the feature columns are
    added, and again after ``future_close`` is added. The frame passed in is
    the one that is modified and returned.
    """
    close_prices = merged_df["close"]
    stamps = merged_df["timestamp"]

    merged_df["lagged_sentiment"] = merged_df["avg_sentiment_score"].shift(1)
    merged_df["close_sma_3"] = close_prices.rolling(3).mean()

    # Relative Strength Index
    rsi_indicator = ta.momentum.RSIIndicator(close=close_prices, window=14)
    merged_df["rsi"] = rsi_indicator.rsi()

    # Calendar features taken from the bar timestamp
    merged_df["hour_of_day"] = stamps.dt.hour
    merged_df["day_of_week"] = stamps.dt.dayofweek

    merged_df.dropna(inplace=True)

    # Target: the close of the following row, computed on the rows that are
    # left after the first dropna.
    merged_df["future_close"] = merged_df["close"].shift(-1)
    merged_df.dropna(inplace=True)

    return merged_df


Window Sequence

In [None]:
def create_window_sequences(X, y, window_size=3):
    """Slice ``X`` into overlapping windows paired with the following ``y``.

    Window ``i`` is ``X[i:i + window_size]`` and its target is
    ``y[i + window_size]``, i.e. the ``y`` entry just past the window.
    ``len(X) - window_size`` pairs are produced (none if that is not positive).

    Returns:
        Tuple ``(X_seq, y_seq)`` of numpy arrays.
    """
    starts = range(len(X) - window_size)
    windows = [X[start:start + window_size] for start in starts]
    targets = [y[start + window_size] for start in starts]
    return np.array(windows), np.array(targets)



24 hour forecast

In [None]:
def multi_step_forecast_24hrs(model, scaler, last_data_scaled, window_size=3, price_feature_index=-1):
    """Roll the model forward 24 steps, feeding each prediction back in.

    At every step the model predicts from the current window; a synthetic row
    is then appended in which only one column is set (to the scaled
    prediction) and every other feature is 0, and the oldest row is dropped.

    Args:
        model: object whose ``predict`` accepts a ``(1, rows, features)``
            array and returns the scaled prediction.
        scaler: fitted scaler exposing ``n_features_in_`` and
            ``inverse_transform``; the target is assumed to be its last
            column, matching how the predictions are placed below.
        last_data_scaled: 2-D array of scaled feature rows used as the
            starting window. It is not modified.
        window_size: kept for interface compatibility; the window length is
            taken from ``last_data_scaled`` itself.
        price_feature_index: column of the synthetic row that receives the
            scaled prediction. The default (-1, the last feature column)
            preserves the previous behavior.
            NOTE(review): for the window built in ``main`` the last feature
            column is ``day_of_week``; passing the index of ``close`` is
            probably what is intended - confirm.

    Returns:
        List of 24 predictions mapped back to the original scale.
    """
    horizon = 24
    predictions_scaled = []
    current_window = last_data_scaled.copy()

    for _ in range(horizon):
        batch = np.expand_dims(current_window, axis=0)
        pred_scaled = model.predict(batch).flatten()[0]
        predictions_scaled.append(pred_scaled)

        new_row = np.zeros((current_window.shape[1],))
        new_row[price_feature_index] = pred_scaled
        current_window = np.vstack([current_window[1:], new_row])

    # Undo the scaling for all predictions in one call: the target occupies
    # the scaler's last column, the other columns are filler.
    placeholder = np.zeros((len(predictions_scaled), scaler.n_features_in_))
    placeholder[:, -1] = predictions_scaled
    inversed = scaler.inverse_transform(placeholder)
    return list(inversed[:, -1])



# Main

In [None]:
def main():
    """Run the whole pipeline for one user-selected meme coin.

    Steps: ask for a ticker, download Reddit posts and CoinGecko market data,
    clean and sentiment-score the text, merge both sources hourly, engineer
    features, train an LSTM on the first 80% of the rows, evaluate it on the
    rest, plot diagnostics and print a 24-step forecast.

    The function returns early (after printing the reason) whenever a stage
    leaves too little data to continue.
    """
    coin_ticker = get_user_ticker_choice()
    coingecko_id = TICKER_MAP[coin_ticker]

    print("\nFetching data from r/memecoins with multiple categories, pagination, and comments ...")
    reddit_client = get_reddit_client(REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT)

    # total_posts=300 per category, categories = hot,new,top
    reddit_df = fetch_reddit_data_enhanced(
        subreddit_name="memecoins",
        total_posts=300,
        categories=("hot", "new", "top"),
        max_comments=20,
        reddit_client=reddit_client
    )
    if reddit_df.empty:
        print("No data from r/memecoins. Exiting.")
        return

    print(f"\nFetching market data for {coin_ticker} from CoinGecko (id='{coingecko_id}') ...")
    market_df = fetch_coingecko_data(coin_id=coingecko_id, vs_currency="usd", days=7, interval="hourly")
    if market_df.empty:
        print("No market data from CoinGecko. Exiting.")
        return

    # Preprocess
    print("\nCleaning & filtering r/memecoins data (English only)...")
    reddit_df.dropna(subset=["text"], inplace=True)
    reddit_df["cleaned_text"] = reddit_df["text"].apply(clean_text)
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a filtered slice.
    reddit_df = reddit_df[reddit_df["cleaned_text"].apply(is_english)].copy()
    if reddit_df.empty:
        # Checked before scoring so no model inference is attempted on nothing.
        print("No posts left after cleaning/filtering. Exiting.")
        return

    # Sentiment
    print("Performing sentiment analysis (post + comments text) ...")
    reddit_df["sentiment_score"] = reddit_df["cleaned_text"].apply(get_sentiment_score)

    # Merge
    print("Merging social data with market data ...")
    merged_df = process_and_merge_data(reddit_df, market_df)
    if merged_df.shape[0] < 10:
        print("Merged dataset too small for training. Exiting.")
        return

    # Feature engineering (drops rows, so sizes are re-checked below)
    merged_df = features(merged_df)
    print(f"\nFinal dataset shape: {merged_df.shape}")
    display(merged_df.head())

    FEATURES = [
        "avg_sentiment_score",
        "lagged_sentiment",
        "close",
        "close_sma_3",
        "volume",
        "rsi",
        "hour_of_day",
        "day_of_week"
    ]
    TARGET = "future_close"
    WINDOW_SIZE = 3

    data_np = merged_df[FEATURES + [TARGET]].copy().values

    # Chronological train/test split
    n = len(data_np)
    split_index = int(n * 0.8)

    # Feature engineering may have removed most rows. Require a couple of
    # training windows (one of them goes to the validation split) and at
    # least one test window before fitting anything.
    n_train_windows = split_index - WINDOW_SIZE
    n_test_windows = (n - split_index) - WINDOW_SIZE
    if n_train_windows < 2 or n_test_windows < 1:
        print("Merged dataset too small for training. Exiting.")
        return

    # Scale features + target. The scaler is fit on the training rows only so
    # that test-set minima/maxima do not leak into training.
    scaler = MinMaxScaler()
    scaler.fit(data_np[:split_index])
    data_scaled = scaler.transform(data_np)

    X_scaled = data_scaled[:, :-1]  # all but last col
    y_scaled = data_scaled[:, -1]   # last col

    X_train_scaled, X_test_scaled = X_scaled[:split_index], X_scaled[split_index:]
    y_train_scaled, y_test_scaled = y_scaled[:split_index], y_scaled[split_index:]

    # Create window sequences
    X_train_seq, y_train_seq = create_window_sequences(X_train_scaled, y_train_scaled, window_size=WINDOW_SIZE)
    X_test_seq, y_test_seq = create_window_sequences(X_test_scaled, y_test_scaled, window_size=WINDOW_SIZE)

    print(f"\nTrain sequences shape: {X_train_seq.shape}, {y_train_seq.shape}")
    print(f"Test sequences shape:  {X_test_seq.shape}, {y_test_seq.shape}")

    # Build LSTM
    model = keras.Sequential([
        layers.LSTM(64, return_sequences=True, input_shape=(WINDOW_SIZE, len(FEATURES))),
        layers.Dropout(0.2),
        layers.LSTM(32),
        layers.Dropout(0.2),
        layers.Dense(16, activation='relu'),
        layers.Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.summary()

    EPOCHS = 15
    BATCH_SIZE = 16

    # Train
    print("\nTraining the LSTM model ...")
    history = model.fit(
        X_train_seq, y_train_seq,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_split=0.1,
        verbose=1
    )

    print("\nEvaluating on test set ...")
    y_pred_scaled = model.predict(X_test_seq).flatten()

    def inverse_transform_predictions(scaled_array):
        # The scaler was fit on features + target, so the scaled target is
        # placed in the last column of a zero-filled array, inverted, and
        # read back from that column.
        n_test = len(scaled_array)
        placeholder = np.zeros((n_test, data_scaled.shape[1]))
        placeholder[:, -1] = scaled_array
        inv_data = scaler.inverse_transform(placeholder)
        return inv_data[:, -1]

    y_pred_inversed = inverse_transform_predictions(y_pred_scaled)
    y_test_inversed = inverse_transform_predictions(y_test_seq)

    # Eval
    mse = mean_squared_error(y_test_inversed, y_pred_inversed)
    rmse = float(np.sqrt(mse))
    print(f"\nTest RMSE: {rmse:.5f}")

    # Visuals

    # Plot Actual vs Predicted
    plt.figure(figsize=(10, 5))
    plt.plot(range(len(y_test_inversed)), y_test_inversed, label='Actual Price', color='blue')
    plt.plot(range(len(y_pred_inversed)), y_pred_inversed, label='Predicted Price', color='red')
    plt.title(f"{coin_ticker} Price Prediction (Test Set) - Inverse Transformed")
    plt.xlabel("Test Steps")
    plt.ylabel("Price (USD)")
    plt.legend()
    plt.show()

    # Plot training & validation loss
    plt.figure(figsize=(10, 5))
    plt.plot(history.history['loss'], label='Training Loss', color='blue')
    plt.plot(history.history['val_loss'], label='Validation Loss', color='orange')
    plt.title("Model Loss During Training")
    plt.xlabel("Epoch")
    plt.ylabel("Loss (MSE)")
    plt.legend()
    plt.show()

    # Plot error distribution
    errors = y_test_inversed - y_pred_inversed
    plt.figure(figsize=(8, 5))
    sns.histplot(errors, kde=True, color='purple')
    plt.title("Distribution of Prediction Errors (Test Set)")
    plt.xlabel("Error (Actual - Predicted)")
    plt.show()

    # Forecast from the most recent WINDOW_SIZE rows of scaled features
    last_data_window = X_scaled[-WINDOW_SIZE:, :]

    next_24hrs = multi_step_forecast_24hrs(model, scaler, last_data_window, window_size=WINDOW_SIZE)

    forecast_hours = list(range(1, 25))
    forecast_df = pd.DataFrame({
        "Hour Ahead": forecast_hours,
        "Predicted_Close": next_24hrs
    })

    print("\nPredicted prices for the next 24 hours (hourly):")
    display(forecast_df.head(24))

    # The final predicted price after 24 hours
    next_day_price = next_24hrs[-1]

    print(f"\n\033[95mThe predicted closing price for {coin_ticker} in ~24 hours is: ${next_day_price:.9f}\033[0m")
    print("That's your next day's price estimate, based on the last known data.\n")


#running the main

In [None]:
# Entry point: start the pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

#Report



This code was created to gather, process, and analyze data related to meme coins—cryptocurrencies known for their viral marketing and passionate online communities. The main goal is to determine whether combining social media sentiment with traditional market information can provide reliable short-term forecasts of a coin’s price. Essentially, the script follows three key steps: it collects social data from Reddit, retrieves corresponding price and volume data from CoinGecko, and merges everything into a dataset suitable for a deep learning model. By doing this, it aims to understand how collective opinions and discussions might influence real-time market behavior.

Initially, I wanted to gather data from platforms like Twitter and TikTok. The idea was that a broader coverage would help identify trends that might not be visible when focusing on just one platform. Traders, both beginners and seasoned professionals, often use hashtags or short threads to share opinions that can sometimes move markets. Similarly, TikTok has surged in popularity among younger audiences, where videos can quickly go viral and capture the attention of young investors. Including these platforms seemed like a sure way to gather a richer source of sentiment data. However, I ran into some obstacles with the free API tiers offered by Twitter and TikTok, which made data gathering more complicated than expected. Twitter’s basic API access used to be fairly open, but recent policy changes and tighter restrictions have made it much harder to collect meaningful amounts of data without paying for higher-tier plans. With free access, you can only retrieve a limited number of tweets daily or monthly, severely limiting any potential analysis. TikTok, while extremely popular, doesn’t offer much in the way of official support for developers looking to gather large-scale data. I attempted to use unofficial scraping methods, but that came with its own challenges. Given these issues, it became clear that reliably fetching data from Twitter and TikTok at scale wasn’t practical with free plans or easily accessible APIs.

With that realization, the focus shifted to Reddit. Reddit turned out to be a much more manageable platform for gathering sentiment data. Its API is relatively open, and the platform naturally supports long-form discussions and comment threads. Instead of short tweets or videos, Reddit users typically post detailed submissions and replies. The script uses PRAW (the Python Reddit API Wrapper) to collect posts from a chosen subreddit—in this case, “memecoins.” It also looks at multiple categories like “hot,” “new,” and “top” to cover a broader range of user-generated content. Once the data is collected, the script cleans up the text by removing links, special characters, and symbols that could interfere with language processing. It also filters out non-English content to keep the text data consistent and avoid translation issues. The final step in handling the text data involves sentiment analysis, where a pre-trained DistilBERT model is used to generate a continuous score that estimates how positive or negative each Reddit post is.

On the market data side, the code relies on the CoinGecko API. Each meme coin has a unique ID on CoinGecko (like “dogecoin” or “shiba-inu”), and the script uses that ID to fetch the last several days of hourly price candles and trading volumes. This market data includes open, high, low, and close (OHLC) prices, which are standard metrics in trading analytics. After retrieving this information, the script combines it with the sentiment data. This merging is done based on the closest hourly timestamp, resulting in a single table that includes both social and market features. This gives a more complete picture of what might be influencing short-term price movements.

After these steps, the script moves on to engineer additional features, such as moving averages of the coin’s closing price and the RSI (Relative Strength Index), a popular technical indicator. It also creates lagged versions of certain columns to ensure that past data can help predict future outcomes. By shifting the closing price back by one row, so that each row carries the close of the row that follows it, the script creates a “future_close” column, which becomes the target variable for the model to predict. The data is then normalized using the MinMaxScaler utility and is finally ready to be fed into a neural network. The chosen architecture is an LSTM, or Long Short-Term Memory model, which is well-suited for sequential data. The model includes two LSTM layers, each followed by a dropout layer to help prevent overfitting. These layers handle the complexities of time dependencies in the data. The script then trains the model on the majority of the data, reserving a portion as a test set to evaluate the model’s performance on unseen data.

There are several areas where the code could be improved. One obvious area is the breadth of sentiment analysis. Currently, the script uses a single pre-trained DistilBERT model to generate sentiment scores. While this approach is practical, it might sometimes misinterpret slang, sarcasm, or crypto-specific jargon. Training a specialized model or at least fine-tuning the existing model for the crypto domain could provide more accurate and detailed sentiment scores, especially since meme coin communities often rely heavily on humor and memes. Moreover, the script could benefit from integrating additional sources of data beyond just Reddit and CoinGecko. For example, incorporating news articles, blog posts, or other forms of online content could provide a more comprehensive view of the factors influencing meme coin prices. Combining these various data streams could help the model capture a wider array of signals that drive market behavior.

In summary, this code began with a broader vision of incorporating data from Twitter, TikTok, and Reddit to capture the full diversity of online chatter around meme coins. However, practical limitations imposed by the free APIs of Twitter and TikTok forced a pivot to focus solely on Reddit. Despite this setback, Reddit proved to be a valuable and accessible source for text-based sentiment data. By pairing this social sentiment data with CoinGecko’s market information, the code successfully creates a dynamic dataset that a deep learning model can use to predict short-term price changes. While there are clear opportunities for enhancements—such as refining sentiment analysis, expanding the range of technical features, or fine-tuning hyperparameters—the existing script provides a functional and insightful example of how social media sentiment can be integrated into a traditional market prediction pipeline.
