In [9]:
import pandas as pd, numpy as np, re, time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pathlib import Path

RAW  = Path("../data/raw")
PROC = Path("../data/processed"); PROC.mkdir(exist_ok=True, parents=True)

an = SentimentIntensityAnalyzer()
#clean tweets to remove unneccesary info
def clean(txt: str) -> str:
    txt = re.sub(r"http\S+", "", str(txt))   
    txt = re.sub(r"@\w+", "", txt)           
    return txt.lower()


In [10]:
hdr = pd.read_csv(RAW/"Bitcoin_tweets.csv", nrows=0).columns.tolist()
print("Columns =", hdr)
TIME_COL  = "date" if "date" in hdr else (
            "timestamp" if "timestamp" in hdr else (
            "created_at" if "created_at" in hdr else None))
if TIME_COL is None:
    raise ValueError("Cannot locate a date/timestamp column in tweet CSV.")
print("Using TIME_COL =", TIME_COL)


Columns = ['user_name', 'user_location', 'user_description', 'user_created', 'user_followers', 'user_friends', 'user_favourites', 'user_verified', 'date', 'text', 'hashtags', 'source', 'is_retweet']
Using TIME_COL = date


In [11]:
import numpy as np

TIME_COL = "date"       
TEXT_COL = "text"       

daily = {}


def ingest_csv(fname):

    for chunk in pd.read_csv(
            RAW / fname,
            usecols=[TIME_COL, TEXT_COL],
            chunksize=100_000,
            engine="python",
            encoding="utf-8",
            on_bad_lines="skip"
            ):

        #timestemp -> datetime
        if np.issubdtype(chunk[TIME_COL].dtype, np.number):
            
            unit = "ms" if chunk[TIME_COL].iloc[0] > 1e11 else "s"
            chunk[TIME_COL] = pd.to_datetime(chunk[TIME_COL], unit=unit, utc=True)
        else:
            chunk[TIME_COL] = pd.to_datetime(chunk[TIME_COL], errors="coerce", utc=True)

        chunk = chunk.dropna(subset=[TIME_COL])    
        chunk["clean"] = chunk[TEXT_COL].map(clean)
        chunk["compound"] = chunk["clean"].map(lambda t: an.polarity_scores(t)["compound"])

        grp = chunk.groupby(chunk[TIME_COL].dt.date)["compound"].mean()
        for d, v in grp.items():
            daily.setdefault(d, []).append(v)

for f in ["Bitcoin_tweets.csv", "Bitcoin_tweets_dataset_2.csv"]:
    print("Scanning", f)
    ingest_csv(f)


Scanning Bitcoin_tweets.csv
Scanning Bitcoin_tweets_dataset_2.csv


In [13]:
sent_df = pd.DataFrame({
    "Date": list(daily.keys()),
    "sentiment": [np.mean(v) for v in daily.values()],
    "tweet_cnt": [len(v) for v in daily.values()]
}).sort_values("Date")

sent_df.to_csv(PROC / "btc_sentiment.csv", index=False)
sent_df.head()


Unnamed: 0,Date,sentiment,tweet_cnt
0,2021-02-05,0.132161,1
1,2021-02-06,0.14491,1
2,2021-02-07,0.152582,1
3,2021-02-08,0.164795,1
4,2021-02-09,0.159723,1
