In [2]:
#general purpose packages
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

#data processing
import re, string
import emoji
import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


#Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

#transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

#keras
import tensorflow as tf
from tensorflow import keras


#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

#set seed for reproducibility
# NOTE(review): this line only defines the value; none of the visible cells
# seeds an RNG with it. Pass it explicitly where randomness is used (e.g. as a
# random_state argument) or seed numpy/TensorFlow with it — TODO confirm intent.
seed=42

# #set style for plots
# sns.set_style("whitegrid")
# sns.despine()
# sns.style.use("whitegrid")
# plt.rc("figure", autolayout=True)
# plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

1. Dependencies

In [4]:
pip install tweepy pymongo streamlit plotly pandas transformers torch scikit-learn reportlab


Collecting tweepy
  Downloading tweepy-4.16.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pymongo
  Downloading pymongo-4.15.0-cp312-cp312-win_amd64.whl.metadata (22 kB)
Collecting plotly
  Downloading plotly-6.3.0-py3-none-any.whl.metadata (8.5 kB)
Collecting reportlab
  Downloading reportlab-4.4.3-py3-none-any.whl.metadata (1.7 kB)
Collecting oauthlib<4,>=3.2.0 (from tweepy)
  Downloading oauthlib-3.3.1-py3-none-any.whl.metadata (7.9 kB)
Collecting requests-oauthlib<3,>=1.2.0 (from tweepy)
  Downloading requests_oauthlib-2.0.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.8.0-py3-none-any.whl.metadata (5.7 kB)
Downloading tweepy-4.16.0-py3-none-any.whl (98 kB)
Downloading pymongo-4.15.0-cp312-cp312-win_amd64.whl (909 kB)
   ---------------------------------------- 0.0/910.0 kB ? eta -:--:--
   ---------------------------------------- 910.0/910.0 kB 8.3 MB/s eta 0:00:00
Downloading plotly-6.3.0-py3-none-any.whl (9.

2. Real-Time Tweet Streaming

In [6]:
import tweepy

# Twitter API credentials.
# The bearer token is a secret and must never be stored in the notebook: it is
# read from the TWITTER_BEARER_TOKEN environment variable instead. Any token
# that was previously pasted into this cell should be revoked and regenerated
# in the developer portal.
import os
import warnings

BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "")
if not BEARER_TOKEN:
    # Keep the name defined so later cells still run, but make the cause of the
    # upcoming authentication failure obvious.
    warnings.warn(
        "TWITTER_BEARER_TOKEN is not set; requests to the Twitter API will be rejected."
    )

class MyStream(tweepy.StreamingClient):
    """Streaming client that forwards each received tweet to ``process_tweet``."""

    def on_tweet(self, tweet):
        """Hand ``tweet`` to the processing pipeline.

        Presumably invoked by tweepy for every tweet matching the stream rules
        (verify against the tweepy docs).

        NOTE(review): ``process_tweet`` is defined in a later cell of this
        notebook; that cell must have been executed before the stream delivers
        tweets, otherwise this call raises ``NameError``.
        """
        # Send tweet data to the next step (preprocessing, DB, etc.)
        process_tweet(tweet)

# Set up streaming for a hashtag
# Builds the client from the bearer token, registers a rule matching "#AI",
# then opens the filtered stream asking for the created_at, lang and geo fields.
# NOTE(review): add_rules runs again on every execution of this cell, and
# filter() presumably keeps the cell busy while the stream is open — confirm
# both against the tweepy StreamingClient documentation.
stream = MyStream(BEARER_TOKEN)
stream.add_rules(tweepy.StreamRule("#AI"))
stream.filter(tweet_fields=["created_at", "lang", "geo"])


Forbidden: 403 Forbidden
When authenticating requests to the Twitter API v2 endpoints, you must use keys and tokens from a Twitter developer App that is attached to a Project. You can create a project via the developer portal.

3. Preprocess Tweets

In [None]:
import re

def clean_text(text):
    """Normalise a tweet's text before sentiment classification.

    Steps:
      1. Drop URLs, @mentions and #hashtags.
      2. Drop every character that is not an ASCII letter, a digit or whitespace.
      3. Collapse any run of whitespace (spaces, tabs, newlines) to one space,
         trim the ends and lower-case the result.

    Args:
        text: Raw tweet text. ``None`` or an empty string is accepted.

    Returns:
        The cleaned, lower-cased string; ``""`` when nothing is left.
    """
    if not text:
        return ""
    text = re.sub(r"http\S+|@\S+|#\S+", "", text)
    # Keep whitespace here: deleting newlines/tabs would glue neighbouring
    # words together ("Hello\nWorld" -> "helloworld").
    text = re.sub(r"[^A-Za-z0-9\s]+", "", text)
    # split()/join collapses the gaps left by removed tokens and trims the ends.
    return " ".join(text.split()).lower()

4. BERT Sentiment Classification

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Checkpoint identifier shared by the tokenizer and the classification model.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Load both halves of the checkpoint, then wrap them in a single callable
# sentiment-analysis pipeline used by classify_sentiment.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
sentiment_pipe = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

def classify_sentiment(text):
    """Run ``text`` through the module-level ``sentiment_pipe``.

    Returns:
        A ``(label, score)`` tuple taken from the first entry of the pipeline
        result, with the score converted to a built-in ``float``.
    """
    top_prediction = sentiment_pipe(text)[0]
    label = top_prediction['label']
    confidence = float(top_prediction['score'])
    return label, confidence

5. Store in MongoDB

In [None]:
from pymongo import MongoClient
from datetime import datetime

# Connect to the MongoDB server on localhost and select the database and
# collection that hold the classified tweets.
client = MongoClient("mongodb://localhost:27017/")
db = client["twitter_sentiment"]
collection = db["tweets"]

def save_tweet_to_db(tweet, sentiment, score, location):
    """Insert one classified tweet into the module-level ``collection``.

    Args:
        tweet: Object exposing ``text`` and ``created_at`` attributes.
        sentiment: Sentiment label to store.
        score: Score to store alongside the label.
        location: Location value to store.
    """
    record = dict(
        text=tweet.text,
        sentiment=sentiment,
        score=score,
        created_at=tweet.created_at,
        location=location,
    )
    collection.insert_one(record)

6. Tie All Steps Together

In [None]:
def process_tweet(tweet):
    """Clean, classify and persist a single tweet.

    The tweet text is cleaned with ``clean_text``, scored with
    ``classify_sentiment`` and written with ``save_tweet_to_db``.

    Args:
        tweet: Object exposing ``text``, ``created_at`` and ``geo`` attributes.
    """
    clean = clean_text(tweet.text)
    sentiment, score = classify_sentiment(clean)

    # The API reports geo as a mapping (e.g. {"place_id": ...}); store a plain
    # string so the location column stays filterable with string operations.
    geo = tweet.geo
    if isinstance(geo, dict):
        geo = geo.get("place_id")
    loc = str(geo) if geo else "unknown"

    save_tweet_to_db(tweet, sentiment, score, loc)

7. Streamlit Dashboard Example

In [None]:
import streamlit as st
import pandas as pd
import plotly.express as px

st.title("Twitter Sentiment Dashboard")

# NOTE(review): Streamlit widgets are presumably only rendered when this code is
# launched with `streamlit run`; confirm before relying on it inside the notebook.
df = pd.DataFrame(list(collection.find()))
filter_date = st.date_input("Date")
filter_keyword = st.text_input("Keyword")
filter_location = st.text_input("Location")

if df.empty:
    # An empty collection yields a frame with no columns; every column lookup
    # below would raise KeyError, so stop here with a message instead.
    st.info("No tweets have been stored yet.")
else:
    # Filtering logic
    if filter_date:
        created_dates = pd.to_datetime(df['created_at'], errors="coerce").dt.date
        df = df[created_dates == pd.to_datetime(filter_date).date()]
    if filter_keyword:
        # regex=False: the widget text is a literal search term, not a pattern
        # (a stray "(" would otherwise raise); na=False drops rows without text.
        df = df[df['text'].str.contains(filter_keyword, case=False, regex=False, na=False)]
    if filter_location:
        # astype(str) tolerates rows whose location was stored as a non-string.
        df = df[df['location'].astype(str).str.contains(filter_location, case=False, regex=False)]

    # Visualizations
    st.plotly_chart(px.histogram(df, x="sentiment", color="sentiment", title="Sentiment Distribution"))
    st.dataframe(df[["text", "sentiment", "score", "created_at", "location"]])


8. Auto-Generate Daily Report (CSV/PDF)

In [None]:
import pandas as pd

def export_daily_report(date, path="daily_sentiment.csv"):
    """Export the tweets created within 24 hours of ``date`` to a CSV file.

    Args:
        date: Start of the reporting window. Anything ``pd.Timestamp`` accepts
            (``datetime``, ``date``, ISO string, ``Timestamp``).
        path: Destination CSV file; defaults to ``"daily_sentiment.csv"``.

    Raises:
        ValueError: If ``date`` cannot be interpreted as a point in time.
    """
    start = pd.Timestamp(date)
    if pd.isna(start):
        raise ValueError(f"invalid report date: {date!r}")
    end = start + pd.Timedelta("1 day")

    # Query with plain datetime objects so the bounds are encodable whatever
    # type the caller passed in.
    window = {"$gte": start.to_pydatetime(), "$lt": end.to_pydatetime()}
    df = pd.DataFrame(list(collection.find({"created_at": window})))

    if df.empty:
        # Still write a header row so the file stays a readable CSV.
        df = pd.DataFrame(columns=["text", "sentiment", "score", "created_at", "location"])
    df.to_csv(path, index=False)
    # For PDF, use reportlab/pdfkit etc.

Collecting tf-keras
  Using cached tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow<2.20,>=2.19 (from tf-keras)
  Downloading tensorflow-2.19.1-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting tensorboard~=2.19.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Using cached tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting numpy<2.2.0,>=1.26.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Downloading numpy-2.1.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Using cached tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
Downloading tensorflow-2.19.1-cp312-cp312-win_amd64.whl (376.0 MB)
   ---------------------------------------- 0.0/376.0 MB ? eta -:--:--
   ---------------------------------------- 1.3/376.0 MB 9.6 MB/s eta 0:00:39
   ---------------------------------------- 4.2/376.0 MB 11.4 MB/s eta 0:00:33
    --------------------------------------- 7.1/376.0 MB 12.1 MB/s eta 0:00:31
   - -------------------------------------- 11.3/376.0 MB 14.4 MB/s eta 0:00:2

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.


ModuleNotFoundError: No module named 'dotenv'