In [2]:
import pandas as pd


COLS = ["target", "id", "date", "query", "user", "text"]

df = pd.read_csv("abhi.csv", encoding="latin-1", names=COLS)

print(df.head())
print(df.info())
print(df.describe())

   target          id                          date     query  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------

In [3]:
missing = df.isnull().sum().sort_values(ascending=False)
print(missing[missing > 0])

Series([], dtype: int64)


In [4]:
#irrelevent columns
drop_cols = [
    'id',
    'date',
    'query',
    'user',
]

df.drop(columns=drop_cols, inplace=True)

In [5]:
df["target"] = df["target"].map({0:0, 4:1})

In [6]:
df.to_csv("abhi_minimal.csv",index = False)

In [7]:
df["target"].unique()

array([0, 1], dtype=int64)

In [8]:
df = pd.read_csv("abhi_minimal.csv")
df = df.rename(columns={"target": "label"})

df.to_csv("sentiment140_labeled.csv", index=False)

In [9]:
df["label"].unique()
df["label"].value_counts()

label
0    800000
1    800000
Name: count, dtype: int64

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

tqdm.pandas()
nltk.download("stopwords")
STOPWORDS = set(stopwords.words("english"))

URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
MENTION_PATTERN = re.compile(r"@\w+")
HASHTAG_PATTERN = re.compile(r"#(\w+)")
RT_PATTERN = re.compile(r"\brt\b", re.IGNORECASE)
NON_ALNUM_PATTERN = re.compile(r"[^a-z0-9\s\.\,\!\?\']")


def clean_tweet(text: str) -> str:
    if not isinstance(text, str):
        return ""
    t = text.strip()

    # Remove URLs and mentions
    t = URL_PATTERN.sub(" ", t)
    
    t = MENTION_PATTERN.sub(" ", t)

    # Keep hashtag words (drop '#')
    t = HASHTAG_PATTERN.sub(r"\1", t)


    # Remove RT markers
    t = RT_PATTERN.sub(" ", t)

    # Lowercase
    t = t.lower()


    # Remove special characters (keep basic punctuation)
    t = NON_ALNUM_PATTERN.sub(" ", t)

    # Normalize whitespace
    t = re.sub(r"\s+", " ", t).strip()

    # Remove stopwords (light)
    tokens = [w for w in t.split() if w not in STOPWORDS]
    t = " ".join(tokens)

    return t

if __name__ == "__main__":
    df = pd.read_csv("sentiment140_labeled.csv")
    df["clean_text"] = df["text"].progress_apply(clean_tweet)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 1600000/1600000 [00:59<00:00, 26809.79it/s]


In [11]:
# Count empty rows in the 'text' column
empty_rows = (df["clean_text"].str.strip() == "").sum()
print("Empty rows in 'text':", empty_rows)

Empty rows in 'text': 4900


In [12]:
# Drop empty rows post-cleaning
df = df[df["clean_text"].str.len() > 0].copy()


df.to_csv("sentiment140_clean.csv", index=False)
print("Saved:", "sentiment140_clean.csv", "Shape:", df.shape)

Saved: sentiment140_clean.csv Shape: (1595100, 3)


In [13]:
#train test split
from sklearn.model_selection import train_test_split
df = pd.read_csv("sentiment140_clean.csv")

# 80/20 stratified split
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)

# Save and check distribution
for name, part in [("train", train_df), ("test", test_df)]:
    part.to_csv(f"{name}.csv", index=False)
    print(name, part["label"].value_counts(normalize=True).round(3).to_dict())

train {1: 0.5, 0: 0.5}
test {1: 0.5, 0: 0.5}


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

X_train, y_train = train_df["clean_text"], train_df["label"]
X_test, y_test = test_df["clean_text"], test_df["label"]

pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,3),  # capture bigrams
        min_df=5,
        max_df=0.9,
        sublinear_tf=True
    )),
    ("clf", LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

pipe.fit(X_train, y_train)

print("Test report:")
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))
print("Confusion matrix (test):")
print(confusion_matrix(y_test, y_pred))

Test report:
              precision    recall  f1-score   support

           0     0.8018    0.7744    0.7879    159492
           1     0.7819    0.8086    0.7950    159528

    accuracy                         0.7915    319020
   macro avg     0.7918    0.7915    0.7914    319020
weighted avg     0.7918    0.7915    0.7914    319020

Confusion matrix (test):
[[123510  35982]
 [ 30531 128997]]


In [15]:
print("Training data:", X_train.shape[0])
print("test data:",X_test.shape[0])

Training data: 1276080
test data: 319020


In [16]:
import pickle as pk
with open('pipe.pkl','wb')as f:
    pk.dump(pipe, f)