In [2]:
import pandas as pd

# Load the CSV file (you may need to specify encoding="ISO-8859-1")
df = pd.read_csv('/Users/amritminocha/Desktop/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', header=None)

# Assign column names
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Quick look
print(df.head())

   target          id                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [3]:
print(df['target'].value_counts())

target
0    800000
4    800000
Name: count, dtype: int64


### Clean the tweet text for modeling.
#### - remove URLs
#### - remove mentions
#### - remove hashtags
#### - remove special characters and numbers

### Convert target to binary

In [8]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)  
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    text = text.split()
    text = [word for word in text if word not in stop_words]
    return " ".join(text)

df['clean_text'] = df['text'].apply(clean_text)

df['label'] = df['target'].apply(lambda x: 0 if x == 0 else 1)

print(df.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amritminocha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   target          id                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  \
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1    scotthamilton  is upset that he can't update his Facebook by ...   
2         mattycus  @Kenichan I dived many times for the ball. Man...   
3          ElleCTF    my whole body feels itchy and like its on fire    
4           Karoli  @nationwideclass no, it's not behaving at all....   

                                          clean_text  label  
0  awww thats bummer shoulda got david carr third...      0  
1  upset cant update facebook texting might cr

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])

# Labels
y = df['label']

print(X.shape)

(1600000, 5000)


In [20]:
# The output value -> (0, 1072) means 0th row 1072 column which means 1072th column words out of 5000 features has tf-idf value 0.2 something
print(X)

  (0, 1072)	0.2022059522593894
  (0, 4355)	0.43525088314850735
  (0, 1070)	0.38502835636990573
  (0, 1798)	0.21798867276958758
  (0, 3824)	0.478554381481773
  (0, 599)	0.4067014663331926
  (0, 4332)	0.25192085396416264
  (0, 276)	0.34061566245484975
  (1, 441)	0.3459118204500027
  (1, 114)	0.25579893757830585
  (1, 4413)	0.17851966325254992
  (1, 3697)	0.22942534573460108
  (1, 3555)	0.3745153726427564
  (1, 994)	0.2925753923967987
  (1, 2728)	0.2586727297713326
  (1, 4318)	0.36723428155196647
  (1, 1460)	0.29403925773138545
  (1, 4616)	0.2935998272393082
  (1, 656)	0.1797308481873239
  (1, 4627)	0.3127440037554583
  (2, 1768)	0.22506485250168215
  (2, 3551)	0.36703910640206716
  (2, 3680)	0.41458284571599624
  (2, 2621)	0.4579295443870856
  (2, 309)	0.444421378783451
  :	:
  (1599994, 4948)	0.2963413177925768
  (1599994, 4856)	0.4088594085064174
  (1599994, 4708)	0.34076110574742496
  (1599994, 1783)	0.21795704505051744
  (1599994, 4874)	0.2374719009665092
  (1599994, 4396)	0.24336775

In [18]:
dense_sample = X[:5].todense()
print(dense_sample)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [19]:
print(X, y)

  (0, 1072)	0.2022059522593894
  (0, 4355)	0.43525088314850735
  (0, 1070)	0.38502835636990573
  (0, 1798)	0.21798867276958758
  (0, 3824)	0.478554381481773
  (0, 599)	0.4067014663331926
  (0, 4332)	0.25192085396416264
  (0, 276)	0.34061566245484975
  (1, 441)	0.3459118204500027
  (1, 114)	0.25579893757830585
  (1, 4413)	0.17851966325254992
  (1, 3697)	0.22942534573460108
  (1, 3555)	0.3745153726427564
  (1, 994)	0.2925753923967987
  (1, 2728)	0.2586727297713326
  (1, 4318)	0.36723428155196647
  (1, 1460)	0.29403925773138545
  (1, 4616)	0.2935998272393082
  (1, 656)	0.1797308481873239
  (1, 4627)	0.3127440037554583
  (2, 1768)	0.22506485250168215
  (2, 3551)	0.36703910640206716
  (2, 3680)	0.41458284571599624
  (2, 2621)	0.4579295443870856
  (2, 309)	0.444421378783451
  :	:
  (1599994, 4948)	0.2963413177925768
  (1599994, 4856)	0.4088594085064174
  (1599994, 4708)	0.34076110574742496
  (1599994, 1783)	0.21795704505051744
  (1599994, 4874)	0.2374719009665092
  (1599994, 4396)	0.24336775

### Splitting data into training and testing set

Training set (X_train, y_train) → Used to fit the model.

Testing set (X_test, y_test) → Used to evaluate performance on new, unseen data.

This helps prevent overfitting (model doing well only on training data) and gives a realistic measure of model accuracy.

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Model Training (Logistic Regression)
Regression - type of supervised learning (learning is done on labelled data, so we know the ans before) - models the relationship between input features (X) and a continuous output (Y) - Eg. Predicting house price based on area, number of rooms, etc.

Linear Regression -  models a straight-line relationship between inputs and output

Logistic Regression - used for classification, not regression - called “regression” because it’s based on a linear model, but it uses a logistic (sigmoid) function to convert the result into a probability between 0 and 1 - 
If sigmoid(z) > 0.5 → class 1 (positive sentiment)

If sigmoid(z) < 0.5 → class 0 (negative sentiment)

### How LogisticRegression.fit(X_train, y_train) Works

1. Input:

X_train: A matrix of numerical features, in your case a TF-IDF matrix (sparse).
Each row = one tweet,
Each column = one word/token (up to 5000 if max_features=5000)
Each cell = importance of that word in that tweet (the TF-IDF score)

y_train: A vector of labels (0 for negative, 1 for positive)

✔️ Think of one row in X_train as:
```
Tweet → [0, 0.3, 0, 0, 0.7, 0, ..., 0.5]  → y = 1
```

2. Model Goal: Learn a weight for each feature/word.

z = w1x1 + w2x2 + w3x3 + ... + b (bias)

then apply sigmoid function:

sigmoid(z) = 1/1+e(pow(-z))

This gives a probability between 0 and 1 that the tweet is positive.


### Example

Tweet → [0, 0.3, 0, 0, 0.7, 0, ..., 0.5] → y = 1

Let’s assume this tweet has only 3 non-zero TF-IDF values for simplicity:

Feature Index	TF-IDF Value (x)
x₁	0.3
x₂	0.7
x₃	0.5

Model Will Learn Weights:
The model learns one weight (w) per feature. Example:

Feature Index	Weight (w)
w₁	1.2
w₂	-0.8
w₃	0.5

Also, there's a bias term (b). Let’s say:
b = 0.3

z=0.36−0.56+0.25+0.3=0.35

sigmoid(0.35) = 1/1+e(-35) = 0.587

### Note
- TF-IDF tells us how important a word is in a tweet

- For weights:
The model’s job is to learn how important each word (feature) is for predicting the sentiment.

So during training, Logistic Regression tries many combinations of weights w₁, w₂, ..., wₙ and bias b that best separate positive and negative tweets.

For example, maybe:

- w_love = +1.2 → because the word “love” appears in positive tweets

- w_bad = -1.0 → because “bad” shows up in negative tweets

These weights are initialized randomly and updated by the model during training using gradient descent, minimizing error on predictions.

In [24]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.774559375
Precision: 0.762693823365844
Recall: 0.79714375
F1-score: 0.779538363276869

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77    160000
           1       0.76      0.80      0.78    160000

    accuracy                           0.77    320000
   macro avg       0.78      0.77      0.77    320000
weighted avg       0.78      0.77      0.77    320000

