In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [20]:
# Load the training CSV and derive the text / label series used downstream.

df = pd.read_csv("train.En.csv")

# "Unnamed: 0" is the name pandas assigns to a header-less first column
# (presumably a saved index here - confirm against the CSV). Drop it when it
# exists; errors="ignore" makes this a no-op when it does not.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Fill missing tweets BEFORE casting to str: astype(str) turns NaN into the
# literal text "nan", after which fillna("") has nothing left to replace and
# the placeholder would leak into the vectorizer as a real token.
df["tweet"] = df["tweet"].fillna("").astype(str)

# Lower-cased tweet text is the model input; "sarcastic" is the target column.
texts = df["tweet"].str.lower()
labels = df["sarcastic"]

In [21]:
# Hold out 20% of the tweets for validation. Stratifying on the labels keeps
# the class proportions the same in both partitions, and the fixed seed makes
# the split repeatable across runs.

X_train_text, X_val_text, y_train, y_val = train_test_split(
    texts, labels, stratify=labels, test_size=0.2, random_state=42
)

In [22]:
# Configure the TF-IDF vectorizer (it is not fitted in this cell).

vectorizer = TfidfVectorizer(
    sublinear_tf=True,    # use 1 + log(tf) instead of the raw term count
    max_features=20000,   # upper bound on vocabulary size
    min_df=2,             # ignore terms found in fewer than 2 documents
    ngram_range=(1, 2),   # unigrams and bigrams
)

In [23]:
# Learn the vocabulary and IDF weights from the training tweets only, then
# project the validation tweets onto that same vocabulary.

X_train = vectorizer.fit_transform(X_train_text)
X_val = vectorizer.transform(X_val_text)

# Report the (n_samples, n_features) shape of each matrix.
for _caption, _matrix in (("Train shape:", X_train), ("Val shape:", X_val)):
    print(_caption, _matrix.shape)


Train shape: (2774, 7565)
Val shape: (694, 7565)


In [24]:
# Inspect the fitted vocabulary: how many terms it holds and what the first
# few look like.

feature_names = vectorizer.get_feature_names_out()

vocab_size = len(feature_names)
vocab_head = feature_names[:20]

print("num features:", vocab_size)
print("first 20 features:", vocab_head)

num features: 7565
first 20 features: ['00' '000' '10' '10 10' '10 mins' '10 minutes' '10 years' '100' '1000'
 '10pm' '10pm coincidentally' '10th' '11' '11 at' '12' '12 hours' '12 hr'
 '13' '13 year' '14']


In [25]:
# Show one training example next to its label and its sparse TF-IDF row.

sample_pos = 0  # positional index into the training split

sample_tweet = X_train_text.iloc[sample_pos]
sample_label = y_train.iloc[sample_pos]

print("Tweet:", sample_tweet)
print("Label:", sample_label)
print("TF-IDF row:", X_train[sample_pos])

Tweet: @igreen95 thx for the play by play
Label: 1
TF-IDF row:   (0, 6456)	0.5392309318312463
  (0, 2190)	0.18185215580127118
  (0, 6038)	0.12723830800107794
  (0, 4835)	0.6917826303949002
  (0, 1102)	0.3065411783675111
  (0, 2241)	0.2957017193268068
