1. Environment setup

In [None]:
pip install pandas tweepy snscrape==0.7.0.20230622 google-news-api pytrends google_trends pandas_gbq duckdb

3 Code snippets by data source

3.1 Social-media firehose → DataFrame

In [None]:
import snscrape.modules.twitter as sntw
import pandas as pd
from datetime import datetime

QUERY = "HDFC Bank lang:en since:2025-06-01 until:2025-07-01"
MAX_TWEETS = 10_000  # quick demo cap on how many search results are collected

# Stream search results and keep only the fields used by the later cells.
tweets = []
for i, tweet in enumerate(sntw.TwitterSearchScraper(QUERY).get_items()):
    if i == MAX_TWEETS:
        break
    tweets.append({
        "tweet_id": tweet.id,
        "ts_utc": tweet.date,
        "user": tweet.user.username,
        # `content` is a deprecated alias in the pinned snscrape release;
        # `rawContent` is the supported attribute carrying the same text.
        "text": tweet.rawContent,
        "reply_ct": tweet.replyCount,
        "rt_ct": tweet.retweetCount,
        "fav_ct": tweet.likeCount,
    })

# One row per tweet, indexed by timestamp in chronological order.
tw_df = pd.DataFrame(tweets).set_index("ts_utc").sort_index()


In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# SentimentIntensityAnalyzer needs the "vader_lexicon" resource and raises
# LookupError when it is missing, so fetch it before constructing the analyzer.
nltk.download("vader_lexicon", quiet=True)

sia = SentimentIntensityAnalyzer()

# Score every tweet with VADER's compound polarity.
tw_df["vader"] = tw_df["text"].map(lambda t: sia.polarity_scores(t)["compound"])

# Mean sentiment per hour ("h" replaces the deprecated "H" frequency alias).
hourly = tw_df["vader"].resample("h").mean()


3.2 Professional news feed → DataFrame

In [None]:
from gnews import GNews   # Google News wrapper
# GNews' constructor takes `language`, not `lang`; the wrong keyword raises TypeError.
gn = GNews(language="en", max_results=100, start_date=(2025, 6, 1), end_date=(2025, 7, 1))
news_items = gn.get_news("HDFC Bank")

# One row per article, indexed by publication time in chronological order.
news_df = (
    pd.DataFrame(news_items)
      # utc=True gives a tz-aware UTC index regardless of how the feed formats
      # the timestamp, so it can be aligned with the other time-indexed frames.
      .assign(published=lambda d: pd.to_datetime(d["published date"], utc=True))
      .rename(columns={"title": "headline", "description": "snippet"})
      .set_index("published")
      .sort_index()
)


3.3 Search Volume Index (Google Trends) → DataFrame


In [None]:
from pytrends.request import TrendReq
# Session configured for English results with a +330 minute offset (India, UTC+5:30).
pytrends = TrendReq(hl='en-US', tz=330)
kw_list = ["HDFC Bank"]
pytrends.build_payload(kw_list, timeframe='2025-06-01 2025-07-01', geo='IN')

# Fetch the interest series, discard the "isPartial" flag column and give the
# keyword column a short, stable name.
gtrend_df = pytrends.interest_over_time()
gtrend_df = gtrend_df.drop("isPartial", axis=1)
gtrend_df = gtrend_df.rename(columns={"HDFC Bank": "gtrend"})


4 Structuring & stitching the pieces together

In [None]:
# Align to hourly buckets to merge cleanly
def to_hourly(df, col, agg="mean"):
    """Resample one column of a datetime-indexed frame into hourly UTC buckets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a DatetimeIndex (tz-naive or tz-aware).
    col : str
        Column to aggregate.
    agg : str or callable, default "mean"
        Aggregation applied within each hour. Use "count" for non-numeric
        columns such as headlines, where a mean is undefined.

    Returns
    -------
    pandas.DataFrame
        Single-column frame named ``col`` with an hourly, UTC-aware index.
    """
    series = df[col]
    # pandas refuses to join tz-naive with tz-aware indexes, so every source is
    # put on UTC. Naive timestamps are treated as already being UTC.
    # NOTE(review): confirm that assumption holds for the Google Trends index.
    if series.index.tz is None:
        series = series.tz_localize("UTC")
    else:
        series = series.tz_convert("UTC")
    # "h" replaces the deprecated "H" frequency alias.
    return series.resample("h").agg(agg).to_frame()

panel = (
    to_hourly(tw_df, "vader")
      # Headlines are text: count articles per hour instead of averaging them.
      .join(to_hourly(news_df, "headline", agg="count").rename(columns={"headline": "news_ct"}))
      # Trends data is coarser than hourly; carry each reading forward so the
      # hours in between are not turned into zeros by the fillna below.
      .join(to_hourly(gtrend_df, "gtrend").ffill())
      .fillna(0)
)

panel.head()


## 6. Exploratory data analysis of sample data ##

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore


def generate_data(seed: int = 42) -> pd.DataFrame:
    """Build a reproducible synthetic daily panel for June 2025.

    The frame has one row per calendar day (2025-06-01 .. 2025-06-30) and three
    columns: ``sentiment``, ``news_count`` and ``svi``. A positive event is
    planted on 2025-06-25 with a milder echo on the following day, and
    ``news_count`` is forced to zero on Saturdays and Sundays.

    Parameters
    ----------
    seed : int, default 42
        Seed for the NumPy random generator.

    Returns
    -------
    pandas.DataFrame
        The synthetic signals indexed by date.
    """
    generator = np.random.default_rng(seed)
    days = pd.date_range("2025-06-01", "2025-06-30", freq="D")
    n_days = len(days)

    # The three draws share one generator, so their order fixes the output.
    # Sentiment: Gaussian noise centred on neutral.
    sentiment = generator.normal(loc=0.0, scale=0.1, size=n_days)
    # News volume: Poisson counts with a mean of one article per day.
    news_count = generator.poisson(lam=1.0, size=n_days)
    # Search interest: Gaussian noise around 40, floored at zero.
    svi = np.maximum(generator.normal(loc=40, scale=10, size=n_days), 0)

    # Plant the headline event.
    event_pos = days.get_loc(pd.Timestamp("2025-06-25"))
    sentiment[event_pos], news_count[event_pos], svi[event_pos] = 0.71, 20, 100

    # Give the next day an elevated after-glow when it is still inside the window.
    next_pos = event_pos + 1
    if next_pos < n_days:
        sentiment[next_pos] = np.clip(sentiment[next_pos] + 0.15, -1, 1)
        news_count[next_pos] = max(news_count[next_pos], 5)
        svi[next_pos] = max(svi[next_pos], 76)

    # No coverage on Saturday (5) or Sunday (6).
    news_count[days.weekday >= 5] = 0

    return pd.DataFrame(
        {"sentiment": sentiment, "news_count": news_count, "svi": svi},
        index=days,
    )


def eda(df: pd.DataFrame, event_date: str = "2025-06-25") -> None:
    """Run exploratory data analysis and save outputs.

    Prints summary statistics and Pearson correlations, writes
    ``behavioural_signals_timeseries.png`` and
    ``behavioural_signals_histograms.png`` to the working directory, and prints
    any rows whose absolute z-score exceeds 3 on at least one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Date-indexed frame with ``sentiment``, ``news_count`` and ``svi`` columns.
    event_date : str, default "2025-06-25"
        Date marked with a dashed vertical line on every time-series panel.
    """
    print("Summary statistics:\n", df.describe())

    print("\nPairwise correlations:\n", df.corr(method="pearson"))

    event_ts = pd.Timestamp(event_date)

    # Time-series plots. All three panels are drawn with matplotlib directly so
    # they share one date-based x-axis; a pandas bar plot places bars at integer
    # positions, which misaligns it with the line plots and the event marker
    # when sharex=True.
    fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(11, 12), sharex=True)

    axes[0].plot(df.index, df["sentiment"])
    axes[0].set_title("Daily Social Sentiment (VADER compound)")
    axes[0].axvline(event_ts, linestyle="--", label="Event")
    axes[0].legend()

    axes[1].bar(df.index, df["news_count"], width=0.8)  # width is in days
    axes[1].set_title("Daily News Article Count")
    axes[1].axvline(event_ts, linestyle="--")

    axes[2].plot(df.index, df["svi"])
    axes[2].set_title("Daily Google Search Volume Index (SVI)")
    axes[2].axvline(event_ts, linestyle="--")

    fig.autofmt_xdate()
    fig.tight_layout()
    fig.savefig("behavioural_signals_timeseries.png")
    plt.close(fig)

    # Histograms. DataFrame.hist returns an array of Axes; take the owning
    # figure explicitly instead of relying on pyplot's current-figure state.
    hist_axes = df.hist(bins=10, figsize=(11, 4), layout=(1, 3))
    hist_fig = np.ravel(hist_axes)[0].get_figure()
    hist_fig.tight_layout()
    hist_fig.savefig("behavioural_signals_histograms.png")
    plt.close(hist_fig)

    # Outlier detection: flag days where any signal is more than three
    # standard deviations from its column mean.
    z_scores = np.abs(zscore(df))
    outliers = df[(z_scores > 3).any(axis=1)]
    if not outliers.empty:
        print("\nOutlier days (z‑score > 3 on any signal):")
        print(outliers)


def main() -> None:
    """Generate the synthetic dataset, write it to CSV, then run the EDA on it."""
    signals = generate_data()
    out_file = "behavioural_signals_june2025.csv"
    # The date index is written as a column labelled "date".
    signals.to_csv(out_file, index_label="date")
    print(f"Dataset saved to {out_file}")
    eda(signals)


# Run the whole pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()


Dataset saved to behavioural_signals_june2025.csv
Summary statistics:
        sentiment  news_count         svi
count  30.000000   30.000000   30.000000
mean    0.031776    1.266667   40.889635
std     0.150741    3.703990   15.309668
min    -0.195104    0.000000   22.726796
25%    -0.028340    0.000000   32.035401
50%     0.033508    0.000000   38.480385
75%     0.077096    1.000000   44.721928
max     0.710000   20.000000  100.000000

Pairwise correlations:
             sentiment  news_count       svi
sentiment    1.000000    0.813361  0.655289
news_count   0.813361    1.000000  0.767012
svi          0.655289    0.767012  1.000000

Outlier days (z‑score > 3 on any signal):
            sentiment  news_count    svi
2025-06-25       0.71          20  100.0
