In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file beneath the Kaggle input mount,
# then echo each one on its own line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv


<div style="background-color:#1e1e1e; color:#d4d4d4; padding:25px; border-radius:10px; font-family:Segoe UI, sans-serif; line-height:1.6;">

<h1 style="color:#61dafb; text-align:center;">📡 Real-Time Sentiment Analysis with Online Learning</h1>

<h2 style="color:#56b6c2;">1️⃣ Motivation</h2>
<p>
In the age of social media, data arrives continuously — tweets, comments, and posts are generated every second. 
Traditional <b>batch learning</b> models require retraining from scratch when new data arrives, which:
</p>
<ul>
<li>Consumes significant resources (CPU, memory, time).</li>
<li>Cannot adapt fast enough to sudden changes in trends or topics.</li>
<li>Is impractical for endless data streams.</li>
</ul>
<p>
<b>Online Learning</b> (or Incremental Learning) solves this by updating the model immediately as new data arrives, without reprocessing all past data. 
This makes it ideal for <span style="color:#98c379;">real-time sentiment analysis</span>, fraud detection, and dynamic recommendation systems.
</p>

<h2 style="color:#56b6c2;">2️⃣ Project Goal</h2>
<p>
We will build an <b>online sentiment analysis system</b> that processes tweets in small batches, 
predicts their sentiment (<span style="color:#98c379;">Positive</span> / <span style="color:#e06c75;">Negative</span>), 
and continuously improves as new data arrives.
</p>

<h2 style="color:#56b6c2;">3️⃣ Dataset — Sentiment140</h2>
<ul>
<li><b>Source:</b> <a href="https://www.kaggle.com/datasets/kazanova/sentiment140" style="color:#61dafb;">Sentiment140 Dataset</a></li>
<li><b>Size:</b> 1.6 million labeled tweets</li>
<li><b>Labels:</b> 0 = Negative, 4 = Positive (will convert 4 → 1 for binary classification)</li>
<li><b>Content:</b> Raw tweet text with emoticons stripped out, collected via the Twitter API</li>
</ul>

<h2 style="color:#56b6c2;">4️⃣ Approach</h2>
<ol>
<li><b>Load and preprocess</b> the Sentiment140 dataset.</li>
<li>Use <code>HashingVectorizer</code> for text-to-vector transformation (efficient in streaming scenarios).</li>
<li>Train an <b>SGDClassifier</b> model incrementally using <code>partial_fit</code>.</li>
<li>Simulate a stream of tweets in <b>mini-batches</b> (batch size: 10,000 rows).</li>
<li>Record and plot <b>accuracy over time</b> to see model improvement.</li>
</ol>

<h2 style="color:#56b6c2;">5️⃣ Why Online Learning for This Task?</h2>
<ul>
<li>Handles <b>continuous inflow</b> of social media data.</li>
<li>Works within <b>limited memory</b> — no need to store all past tweets.</li>
<li><b>Fast adaptation</b> to changing topics, slang, and trends.</li>
</ul>

<h2 style="color:#56b6c2;">6️⃣ Expected Outcome</h2>
<p>
By the end of this notebook, we’ll have a working real-time sentiment classifier that can learn from new tweets without full retraining. 
This approach can be extended to live Twitter APIs and other streaming data sources.
</p>

<hr style="border: 1px solid #333; margin: 20px 0;">
<p style="text-align:center; color:#888;">
<strong>Author's Note:</strong> This notebook uses a simulated streaming setup on the Sentiment140 dataset. The same code can be adapted to handle live Twitter data.
</p>

</div>


<div style="background-color:#1e1e1e; color:#d4d4d4; padding:20px; border-radius:8px; font-family:Segoe UI;">
<h2 style="color:#56b6c2;">Step 1 — Import Libraries</h2>
<p>We start by importing the core Python libraries for online learning and sentiment analysis:</p>
<ul>
<li><b>pandas</b> — to handle tabular data</li>
<li><b>numpy</b> — for numeric operations</li>
<li><b>sklearn</b> — for vectorization and the online learning model</li>
<li><b>matplotlib</b> — for accuracy visualization</li>
</ul>
<p>In the next step we load a labeled Twitter dataset, where each sample contains the text of the tweet and a label:
<span style="color:#98c379;">1 for positive</span> and <span style="color:#e06c75;">0 for negative</span>.</p>
</div>


In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

<div style="background-color:#1e1e1e; color:#d4d4d4; padding:25px; border-radius:10px; font-family:Segoe UI, sans-serif; line-height:1.6;">

<h2 style="color:#56b6c2;">Step 2 — Load & Preprocess the Dataset</h2>

<p>
In this step, we load the <b>Sentiment140 dataset</b> and prepare it for streaming-based online learning. 
Since the dataset includes extra columns we don’t need, we will:
</p>

<ol>
<li>Read the CSV with <code>pandas</code> using the correct encoding (<code>latin-1</code>) to handle special characters.</li>
<li>Keep only the <b>label</b> and <b>tweet text</b> columns.</li>
<li>Convert labels to binary:
    <span style="color:#e06c75;">0</span> (Negative) stays 0, and
    <span style="color:#98c379;">4</span> (Positive) is mapped to <span style="color:#98c379;">1</span>.</li>
<li>Shuffle the dataset to simulate random arrival of tweets in a data stream.</li>
<li>Split into training and test datasets, where the test set remains fixed for evaluation across streaming batches.</li>
</ol>

<p>
This preprocessing ensures our model sees tweets in a mixed, unpredictable order — closer to a real online environment.
</p>

</div>


In [4]:
# Load the Sentiment140 dataset and build a shuffled train/test split.
# Kaggle path: /kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv
# The CSV has no header row; its columns are: target, ids, date, flag, user, text

DATA_PATH = "/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv"
CSV_COLUMNS = ['label', 'ids', 'date', 'flag', 'user', 'text']  # 'label' is the file's "target" column
RANDOM_SEED = 42       # fixes the shuffle order so the split is reproducible
TRAIN_FRACTION = 0.9   # 90% train for streaming simulation, 10% fixed test set

# Parse only the sentiment and text columns; the other four are never used,
# so skipping them at read time avoids materialising them for every row.
df = pd.read_csv(
    DATA_PATH,
    encoding="latin-1",
    header=None,
    names=CSV_COLUMNS,
    usecols=['label', 'text'],
)

# Map labels: 0 -> 0 (negative), 4 -> 1 (positive)
df['label'] = df['label'].replace({4: 1})

# Guard the binary-classification assumption made by the rest of the notebook.
assert set(df['label'].unique()) <= {0, 1}, "unexpected label values after remapping 4 -> 1"

# Shuffle dataset to simulate random streaming
df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Positional split: the first TRAIN_FRACTION of the shuffled rows are streamed
# for training, the remainder is held out as the fixed evaluation set.
train_size = int(len(df) * TRAIN_FRACTION)
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

# Show dataset info
print(train_df.head())
print(train_df['label'].value_counts())


   label                                               text
0      0             @chrishasboobs AHHH I HOPE YOUR OK!!! 
1      0  @misstoriblack cool , i have no tweet apps  fo...
2      0  @TiannaChaos i know  just family drama. its la...
3      0  School email won't open  and I have geography ...
4      0                             upper airways problem 
label
1    720035
0    719965
Name: count, dtype: int64
