In [None]:
# --- Cell 1: Libraries and Data Loading ---
import html
import re

import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load dataset
tweet_df = pd.read_csv('train.csv')

# Preview the first rows, then the column dtypes / non-null counts.
print(tweet_df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
tweet_df.info()

# Print a few raw tweets. head(5) yields at most five values, so a file with
# fewer than five rows does not raise IndexError the way iloc[i] would.
for raw_tweet in tweet_df['tweet'].head(5):
    print(raw_tweet, "\n")


In [None]:
# --- Cell 2: Preprocessing Function ---

# Stop-word vocabulary: scikit-learn's English list plus two Twitter junk tokens
stop_words = set(ENGLISH_STOP_WORDS).union({"rt", "amp"})

# A token is a maximal run of lowercase ASCII letters, digits, underscores or apostrophes
_token_pat = re.compile(r"[a-z0-9_']+")

def data_processing(text: str) -> str:
    """Clean one raw tweet and return its content tokens joined by single spaces.

    Steps, in order: decode HTML entities, lowercase, drop URLs and @mentions,
    unwrap hashtags, fold in-word apostrophes, replace remaining punctuation
    with spaces, tokenize with ``_token_pat``, then discard stop words and
    tokens shorter than three characters.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned tweet. An empty string is returned when ``text`` is not a
        ``str`` (for example a NaN coming from pandas) or when no token
        survives the filtering.
    """
    if not isinstance(text, str):
        return ""

    # Decode every HTML entity (&amp;, &quot;, &lt;, &#39;, ...) up front.
    # Handling only '&amp;' left fragments such as "quot" behind as tokens.
    # A decoded '&' is stripped as punctuation below, which gives the same
    # tokens as substituting ' and ' because "and" is an English stop word.
    t = html.unescape(text).lower()

    # Remove URLs
    t = re.sub(r'https?://\S+|www\.\S+', ' ', t)

    # Remove @mentions
    t = re.sub(r'@\w+', ' ', t)

    # Keep hashtag word (e.g., #happy -> happy)
    t = re.sub(r'#(\w+)', r'\1', t)

    # Fold apostrophes (straight or curly) that sit inside a word so that a
    # contraction stays one token ("don't" -> "dont"). Without this the
    # punctuation strip below splits it into fragments such as "don" + "t".
    t = re.sub(r"(?<=\w)['\u2019](?=\w)", '', t)

    # Replace remaining punctuation/symbols with spaces
    t = re.sub(r'[^\w\s]', ' ', t)

    # Collapse spaces
    t = re.sub(r'\s+', ' ', t).strip()

    # Tokenize with regex (ASCII-only, so non-ASCII characters act as separators)
    tokens = _token_pat.findall(t)

    # Remove stopwords + very short tokens
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]

    return " ".join(tokens)


In [None]:
# --- Cell 3: Apply Preprocessing ---
# Clean every tweet. Missing values are replaced with "" *before* the string
# cast: astype(str) alone turns NaN into the literal text "nan", which
# data_processing would then treat as ordinary tweet content.
tweet_df['tweet'] = tweet_df['tweet'].fillna("").astype(str).apply(data_processing)

# Drop rows whose cleaned text is identical (the first occurrence is kept)
tweet_df = tweet_df.drop_duplicates(subset='tweet').reset_index(drop=True)

# Print cleaned tweets
print(tweet_df.head(), "\n")

# head(5) yields at most five values, so a frame left with fewer than five
# rows after de-duplication does not raise IndexError.
for i, cleaned_tweet in enumerate(tweet_df['tweet'].head(5)):
    print(f"{i}:", cleaned_tweet, "\n")


In [None]:
# --- Cell 4: TF-IDF and Train-Test Split ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Features are the tweet text column; the target is the label column.
X, y = tweet_df['tweet'], tweet_df['label']

# Hold out 20% of the rows for testing, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Unigram + bigram TF-IDF, ignoring terms found in fewer than 2 documents and
# capping the vocabulary at 50,000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=50000)

# Fit on the training split only, then reuse the fitted vectorizer for the test split.
Xtr = tfidf.fit_transform(X_train)
Xte = tfidf.transform(X_test)


In [None]:
# --- Cell 5: Logistic Regression Training & Evaluation ---
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Build and train the classifier in one expression (fit() returns the estimator).
clf = LogisticRegression(solver='liblinear', max_iter=300).fit(Xtr, y_train)

# Predicted labels for the held-out split
pred = clf.predict(Xte)

# Overall accuracy followed by the per-class report
print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

# Confusion matrix of true vs. predicted labels
ConfusionMatrixDisplay.from_predictions(y_test, pred)
plt.show()


In [None]:
# This cell relies on objects defined in earlier cells:
# - data_processing  (the tweet-cleaning function)
# - tfidf            (fitted TfidfVectorizer)
# - clf              (fitted LogisticRegression)

def predict_tweet(text, threshold=0.5):
    """Classify a single raw tweet with the fitted vectorizer and classifier.

    The text is cleaned with ``data_processing``, vectorized with ``tfidf`` and
    scored with ``clf``. The prediction is 1 when the probability in column 1
    of ``predict_proba`` is at least ``threshold``, otherwise 0.

    Returns:
        A dict holding the original input, the cleaned text, the 0/1
        prediction and the column-1 probability.
    """
    cleaned = data_processing(text)
    features = tfidf.transform([cleaned])
    # Row 0 is the only sample in the batch; column 1 is the second class.
    positive_prob = clf.predict_proba(features)[0][1]
    label = 1 if positive_prob >= threshold else 0
    return {
        "input": text,
        "cleaned": cleaned,
        "pred": label,
        "prob_class1": positive_prob,
    }

# Try it:
print(predict_tweet("@user thanks for #lyft credit i can't use… #disappointed"))
