In [9]:
!pip install ccxt praw pandas numpy textblob datetime load_dotenv

Collecting load_dotenv
  Downloading load_dotenv-0.1.0-py3-none-any.whl.metadata (1.9 kB)
Downloading load_dotenv-0.1.0-py3-none-any.whl (7.2 kB)
Installing collected packages: load_dotenv
Successfully installed load_dotenv-0.1.0


In [11]:
import ccxt
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from datetime import datetime, timedelta

In [153]:
# Load the environment variables
load_dotenv()

''

# Get Candle Stick Data from Binance

In [15]:
# Binance API credentials
BINANCE_API_KEY = os.getenv("BINANCE_API_KEY")
BINANCE_SECRET_KEY = os.getenv("BINANCE_SECRET_KEY")

# Initialize Binance Exchange
exchange = ccxt.binance({
    'apiKey': BINANCE_API_KEY,
    'secret': BINANCE_SECRET_KEY,
    'rateLimit': 1200,
    'enableRateLimit': True,
    'options': {
        'defaultType': 'future'  # Change to 'spot' for spot markets
    }
})

In [55]:
#Fetch the historical candle stick open, high, low, close and volume data
def fetch_ohlcv(symbol, timeframe, since, retries=3, progress_days=30):
    ohlcv = []  # Store all fetched data
    attempts = 0  # Track retries
    last_progress_message = since  # Track progress reporting

    while True:
        try:
            # Fetch 500 candles
            ohlcv_data = exchange.fetch_ohlcv(symbol, timeframe, since, limit=500)
            
            if ohlcv_data:
                ohlcv.extend(ohlcv_data)  # Add the new data to the list
                since = ohlcv_data[-1][0]  # Update the `since` for the next fetch
                
                # Check if progress message should be printed
                if since - last_progress_message >= progress_days * 24 * 60 * 60 * 1000:  # Progress every N days
                    print(f"Fetched up to {datetime.utcfromtimestamp(since / 1000)}")
                    last_progress_message = since

                # Stop if fewer than 500 candles are returned
                if len(ohlcv_data) < 500:
                    print(f"Fetching completed up to {datetime.utcfromtimestamp(since / 1000)}.")
                    break
            else:
                print(f"No more data to fetch at {datetime.utcfromtimestamp(since / 1000)}.")
                break
        except ccxt.NetworkError as e:
            print(f"Network error: {e}, retrying...")
            attempts += 1
            if attempts >= retries:
                print("Max retries reached. Exiting.")
                break
        except ccxt.ExchangeError as e:
            print(f"Exchange error: {e}, retrying...")
            attempts += 1
            if attempts >= retries:
                print("Max retries reached. Exiting.")
                break
        except Exception as e:
            print(f"Unexpected error: {e}, retrying...")
            attempts += 1
            if attempts >= retries:
                print("Max retries reached. Exiting.")
                break

    return ohlcv

In [61]:
# Symbol and timeframe
symbol = 'BTC/USDT'
timeframe = '1h'  # 1-hour timeframe
since = exchange.parse8601('2015-01-01T00:00:00Z')  # Start from Jan 1, 2015

In [63]:
# Fetch historical data
ohlcv = fetch_ohlcv(symbol, timeframe, since)

Network error: binance GET https://fapi.binance.com/fapi/v1/klines?interval=1h&limit=500&symbol=BTCUSDT&startTime=1420070400000, retrying...


  print(f"Fetched up to {datetime.utcfromtimestamp(since / 1000)}")


Fetched up to 2019-09-29 12:00:00
Fetched up to 2019-11-10 02:00:00
Fetched up to 2019-12-21 16:00:00
Fetched up to 2020-02-01 06:00:00
Fetched up to 2020-03-13 20:00:00
Fetched up to 2020-04-24 10:00:00
Fetched up to 2020-06-05 00:00:00
Fetched up to 2020-07-16 14:00:00
Fetched up to 2020-08-27 04:00:00
Fetched up to 2020-10-07 18:00:00
Fetched up to 2020-11-18 08:00:00
Fetched up to 2020-12-29 22:00:00
Fetched up to 2021-02-09 12:00:00
Fetched up to 2021-03-23 02:00:00
Fetched up to 2021-05-03 16:00:00
Fetched up to 2021-06-14 06:00:00
Fetched up to 2021-07-25 20:00:00
Fetched up to 2021-09-05 10:00:00
Fetched up to 2021-10-17 00:00:00
Fetched up to 2021-11-27 14:00:00
Fetched up to 2022-01-08 04:00:00
Fetched up to 2022-02-18 18:00:00
Fetched up to 2022-04-01 08:00:00
Fetched up to 2022-05-12 22:00:00
Fetched up to 2022-06-23 12:00:00
Fetched up to 2022-08-04 02:00:00
Fetched up to 2022-09-14 16:00:00
Fetched up to 2022-10-26 06:00:00
Fetched up to 2022-12-06 20:00:00
Fetched up to 

  print(f"Fetching completed up to {datetime.utcfromtimestamp(since / 1000)}.")


In [65]:
# Convert to DataFrame
df = pd.DataFrame(ohlcv, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])

# Convert timestamp to readable format
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

# Display the first few rows of the data
print(df.head())

# Save the data to a CSV file
df.to_csv('btc_usdt_ohlcv_1h_5years.csv', index=False)

            timestamp      open      high       low     close   volume
0 2019-09-08 17:00:00  10000.00  10000.00  10000.00  10000.00    0.002
1 2019-09-08 18:00:00  10000.00  10000.00  10000.00  10000.00    0.000
2 2019-09-08 19:00:00  10344.77  10357.53  10337.43  10340.12  471.659
3 2019-09-08 20:00:00  10340.12  10368.64  10334.54  10351.42  583.271
4 2019-09-08 21:00:00  10351.42  10391.90  10324.77  10391.90  689.759


In [67]:
df.shape

(46460, 6)

# Get Reddit Posts Related to BTC for Sentiment Analysis

In [77]:
import praw
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from datetime import datetime, timedelta
from praw.models import Submission
import json
import time
# Load the environment variables
load_dotenv()
# Reddit API credentials
REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
REDDIT_SECRET = os.getenv("REDDIT_SECRET")
REDDIT_USER_AGENT = os.getenv("REDDIT_USER_AGENT")
REDDIT_USERNAME = os.getenv("REDDIT_USERNAME")
REDDIT_PASSWORD = os.getenv("REDDIT_PASSWORD")

# Initialize Reddit API client
reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_SECRET,
    user_agent=REDDIT_USER_AGENT,
    username=REDDIT_USERNAME,
    password=REDDIT_PASSWORD
)

In [79]:
# Subreddits to search
subreddits = ["Bitcoin"]


In [81]:
def fetch_btc_posts(subreddits, query="Bitcoin", start_date="2019-01-01", progress_interval=100):
    posts = []
    total_fetched = 0
    start_timestamp = int(datetime.strptime(start_date, "%Y-%m-%d").timestamp())
    current_timestamp = int(datetime.utcnow().timestamp())
    
    for subreddit_name in subreddits:
        subreddit = reddit.subreddit(subreddit_name)
        print(f"Fetching posts from r/{subreddit_name}...")
        
        fetched_count = 0
        day_start = start_timestamp
        
        while day_start < current_timestamp:
            day_end = day_start + 86400  # Add 1 day (in seconds)
            
            try:
                # Search posts for the specific day
                for post in subreddit.search(query, sort="new", limit=500, params={"after": day_start, "before": day_end}):
                    if isinstance(post, Submission):  # Ensure it's a submission
                        posts.append({
                            "subreddit": subreddit_name,
                            "title": post.title,
                            "text": post.selftext,
                            "created_utc": datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                            "score": post.score,
                            "url": post.url,
                        })
                        fetched_count += 1
                        total_fetched += 1

                        # Update progress
                        if fetched_count % progress_interval == 0:
                            print(f"Fetched {fetched_count} posts from r/{subreddit_name} so far...")

                print(f"Finished fetching {fetched_count} posts from r/{subreddit_name} for {datetime.utcfromtimestamp(day_start).strftime('%Y-%m-%d')}.")
            
            except Exception as e:
                print(f"Error fetching posts from r/{subreddit_name} for {datetime.utcfromtimestamp(day_start).strftime('%Y-%m-%d')}: {e}")
                time.sleep(1)  # Brief delay to avoid rate-limiting
                
            # Move to the next day
            day_start = day_end
    
    # Save posts to JSON
    with open("btc_posts.json", "w") as json_file:
        json.dump(posts, json_file, indent=4)
    
    print(f"Total posts fetched: {total_fetched}. Saved to 'btc_posts.json'.")
    return posts

In [83]:
# Fetch posts
btc_posts = fetch_btc_posts(subreddits, query="BTC")

# Convert to DataFrame for better analysis
df = pd.DataFrame(btc_posts)

# Save to CSV
df.to_csv("btc_reddit_posts.csv", index=False)

print("Fetched and saved BTC-related posts.")

  current_timestamp = int(datetime.utcnow().timestamp())


Fetching posts from r/Bitcoin...


  "created_utc": datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S'),


Fetched 100 posts from r/Bitcoin so far...
Fetched 200 posts from r/Bitcoin so far...


  print(f"Finished fetching {fetched_count} posts from r/{subreddit_name} for {datetime.utcfromtimestamp(day_start).strftime('%Y-%m-%d')}.")


Finished fetching 249 posts from r/Bitcoin for 2018-12-31.
Fetched 300 posts from r/Bitcoin so far...
Fetched 400 posts from r/Bitcoin so far...
Finished fetching 498 posts from r/Bitcoin for 2019-01-01.
Fetched 500 posts from r/Bitcoin so far...
Fetched 600 posts from r/Bitcoin so far...
Fetched 700 posts from r/Bitcoin so far...
Finished fetching 747 posts from r/Bitcoin for 2019-01-02.
Fetched 800 posts from r/Bitcoin so far...
Fetched 900 posts from r/Bitcoin so far...
Finished fetching 996 posts from r/Bitcoin for 2019-01-03.
Fetched 1000 posts from r/Bitcoin so far...
Fetched 1100 posts from r/Bitcoin so far...
Fetched 1200 posts from r/Bitcoin so far...
Finished fetching 1245 posts from r/Bitcoin for 2019-01-04.
Fetched 1300 posts from r/Bitcoin so far...
Fetched 1400 posts from r/Bitcoin so far...
Finished fetching 1494 posts from r/Bitcoin for 2019-01-05.
Fetched 1500 posts from r/Bitcoin so far...
Fetched 1600 posts from r/Bitcoin so far...
Fetched 1700 posts from r/Bitcoin s

In [85]:
df.shape

(543417, 6)