In [1]:
import itertools
import random
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
warnings.filterwarnings('ignore')

In [None]:
# Read the training, validation and test CSVs into separate DataFrames.
train_csv = "si670_kaggle1_train.csv"
valid_csv = "si670_kaggle1_validation.csv"
test_csv = "test.csv"

train = pd.read_csv(train_csv)
valid = pd.read_csv(valid_csv)
test = pd.read_csv(test_csv)

In [3]:
# Relative frequency of each label value in the training frame.
train["label"].value_counts(normalize=True)


label
0    0.707532
1    0.292468
Name: proportion, dtype: float64

In [4]:
# Relative frequency of each label value in the validation frame.
valid["label"].value_counts(normalize=True)

label
1    0.507096
0    0.492904
Name: proportion, dtype: float64

Sampling the Dataset

In [5]:

# Upper bound on the number of training rows used while iterating on models.
sample_size = 90000

if len(train) > sample_size:
    print(f"Sampling {sample_size} examples from training data for faster training...")

    class_counts = train['label'].value_counts()
    print(f"Original class distribution: {train['label'].value_counts(normalize=True)}")

    # NOTE: pandas' DataFrame.sample() has no `stratify` keyword, so the former
    # `try: train.sample(..., stratify=...)` always raised and a bare `except:`
    # silently fell through to the balanced draw below. The balanced draw is
    # now done explicitly, with the same seeds, so the resulting sample is
    # unchanged and unrelated errors are no longer swallowed.
    #
    # Take the same number of rows from each class. The per-class size is
    # capped by the smaller class so sampling without replacement cannot fail.
    samples_per_class = min(
        sample_size // 2,
        int(class_counts.get(0, 0)),
        int(class_counts.get(1, 0)),
    )

    class_0_samples = train[train['label'] == 0].sample(n=samples_per_class, random_state=42, replace=False)
    class_1_samples = train[train['label'] == 1].sample(n=samples_per_class, random_state=42, replace=False)

    # Concatenate, then shuffle (frac=1) so the two classes are interleaved.
    train_sampled = pd.concat([class_0_samples, class_1_samples]).sample(frac=1, random_state=42).reset_index(drop=True)
    print(f" Created balanced sample: {samples_per_class} samples per class")

else:
    # Dataset is already small enough; use it as-is.
    train_sampled = train

print(f"Using {len(train_sampled)} training samples")
print(f"Class distribution in sampled training data:")
print(train_sampled['label'].value_counts(normalize=True))

Sampling 90000 examples from training data for faster training...
Original class distribution: label
0    0.707532
1    0.292468
Name: proportion, dtype: float64
 Created balanced sample: 45000 samples per class
Using 90000 training samples
Class distribution in sampled training data:
label
1    0.5
0    0.5
Name: proportion, dtype: float64


Creating the Pipeline

In [6]:
# Baseline model: word-level and character-level TF-IDF features are
# concatenated by FeatureUnion and fed to a class-weighted logistic regression.
pipeline = Pipeline([
    ("features", FeatureUnion([
        ('word_tfidf', TfidfVectorizer(
            stop_words='english',
            max_features=15000,
            ngram_range=(1, 2),
            min_df=3,
            max_df=0.9,
            sublinear_tf=True
        )),
        ('char_tfidf', TfidfVectorizer(
            analyzer='char',
            ngram_range=(3, 5),
            max_features=10000,
            min_df=3,
            max_df=0.95
        ))
    ])),
    # `n_jobs` is intentionally not passed: scikit-learn ignores it for the
    # 'liblinear' solver (and warns about it), so it only suggested
    # parallelism that never happened.
    ("classifier", LogisticRegression(
        C=1.0,
        max_iter=500,
        random_state=42,
        class_weight='balanced',
        solver='liblinear'
    ))
])


# Fit on the sampled training data; the fitted pipeline is the cell's output.
pipeline.fit(train_sampled["text"], train_sampled["label"])

0,1,2
,steps,"[('features', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer_list,"[('word_tfidf', ...), ('char_tfidf', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'char'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,500


Evaluating the trained model on the validation set

In [7]:
# Score the fitted baseline pipeline against the validation labels.
valid_text = valid["text"]
valid_labels = valid["label"]

valid_preds = pipeline.predict(valid_text)
f1 = f1_score(y_true=valid_labels, y_pred=valid_preds)

print(f"\nValidation F1 Score: {f1:.4f}")
print(classification_report(y_true=valid_labels, y_pred=valid_preds))


Validation F1 Score: 0.7117
              precision    recall  f1-score   support

           0       0.70      0.76      0.73     27993
           1       0.75      0.68      0.71     28799

    accuracy                           0.72     56792
   macro avg       0.72      0.72      0.72     56792
weighted avg       0.72      0.72      0.72     56792



Parameter Tuning

In [None]:
# Exhaustive search over vectorizer sizes / n-gram ranges and the classifier's
# C and solver. Every configuration is fitted on `train_sampled` and scored by
# F1 on the validation frame; the best-scoring one is remembered.
param_grid = {
    "word_max_features": [15000, 30000],
    "word_ngram_range": [(1,2), (1,3)],
    "char_max_features": [10000, 20000],
    "char_ngram_range": [(3,5), (4,6)],
    "C": [0.1, 1, 10],
    "solver": ['liblinear', 'saga']
}

best_f1 = 0
best_params = None
best_model = None

# itertools.product iterates with the last key varying fastest, i.e. in the
# same order as the equivalent nested for-loops.
for w_feat, w_ngram, c_feat, c_ngram, c, solv in itertools.product(
    param_grid["word_max_features"],
    param_grid["word_ngram_range"],
    param_grid["char_max_features"],
    param_grid["char_ngram_range"],
    param_grid["C"],
    param_grid["solver"],
):
    print(f"\nTesting config: word_feats={w_feat}, word_ngrams={w_ngram}, "
        f"char_feats={c_feat}, char_ngrams={c_ngram}, c={c}, solver={solv}")

    pipeline = Pipeline([
        ("features", FeatureUnion([
            ("word_tfidf", TfidfVectorizer(
                stop_words="english",
                max_features=w_feat,
                ngram_range=w_ngram,
                min_df=2,
                sublinear_tf=True
            )),
            ("char_tfidf", TfidfVectorizer(
                analyzer="char",
                max_features=c_feat,
                ngram_range=c_ngram,
                min_df=2
            ))
        ])),
        ("classifier", LogisticRegression(
            C=c,
            max_iter=500,
            random_state=42,
            class_weight="balanced",
            solver=solv
        ))
    ])

    pipeline.fit(train_sampled["text"], train_sampled["label"])

    preds = pipeline.predict(valid["text"])
    val_f1 = f1_score(valid["label"], preds)

    print(f"Validation F1: {val_f1:.4f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        # Record *all* searched hyper-parameters (C and solver were previously
        # dropped, making the reported best configuration ambiguous) and keep
        # the fitted pipeline so `best_model` is actually populated.
        best_params = (w_feat, w_ngram, c_feat, c_ngram, c, solv)
        best_model = pipeline

print("\nBest Config:", best_params)
print("Best Validation F1:", best_f1)


In [None]:
# Second search: with vectorizer sizes and classifier settings fixed, try the
# TF-IDF document-frequency cut-offs, sublinear tf scaling and lowercasing.
# Each configuration is fitted on `train_sampled` and scored by validation F1.
param_grid = {
    "min_df": [1, 2],
    "max_df": [0.8, 0.9, 1.0],
    "sublinear_tf": [True, False],
    "lowercase": [True, False]
}

best_f1 = 0
best_params = None


# itertools.product iterates with the last key varying fastest, i.e. in the
# same order as the equivalent nested for-loops.
for min_df, max_df, sublinear, lower in itertools.product(
    param_grid["min_df"],
    param_grid["max_df"],
    param_grid["sublinear_tf"],
    param_grid["lowercase"],
):
    print(f"\nTesting config: min_df={min_df}, max_df={max_df}, "
          f"sublinear_tf={sublinear}, lowercase={lower}")

    # The same four options are applied to both the word and char vectorizers.
    pipeline = Pipeline([
        ("features", FeatureUnion([
            ("word_tfidf", TfidfVectorizer(
                stop_words="english",
                max_features=50000,
                ngram_range=(1, 3),
                min_df=min_df,
                max_df=max_df,
                sublinear_tf=sublinear,
                lowercase=lower
            )),
            ("char_tfidf", TfidfVectorizer(
                analyzer="char",
                max_features=20000,
                ngram_range=(3, 5),
                min_df=min_df,
                max_df=max_df,
                sublinear_tf=sublinear,
                lowercase=lower
            ))
        ])),
        ("classifier", LogisticRegression(
            C=10,
            max_iter=500,
            random_state=42,
            class_weight="balanced",
            solver="liblinear"
        ))
    ])
    pipeline.fit(train_sampled["text"], train_sampled["label"])
    preds = pipeline.predict(valid["text"])
    val_f1 = f1_score(valid["label"], preds)

    print(f"Validation F1: {val_f1:.4f}")

    # Strict '>' keeps the first configuration seen on ties.
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_params = (min_df, max_df, sublinear, lower)

print("\nBest Config:", best_params)
print("Best Validation F1:", best_f1)

In [8]:
# Hard-coded configuration: (word max_features, word n-gram range,
# char max_features, char n-gram range).
best_params = (50000, (1, 3), 20000, (3, 5))
w_feat, w_ngram, c_feat, c_ngram = best_params

# Options applied identically to the word-level and char-level vectorizers.
shared_tfidf_options = dict(min_df=2, max_df=1.0, sublinear_tf=True, lowercase=False)

final_word_tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=w_feat,
    ngram_range=w_ngram,
    **shared_tfidf_options,
)
final_char_tfidf = TfidfVectorizer(
    analyzer="char",
    max_features=c_feat,
    ngram_range=c_ngram,
    **shared_tfidf_options,
)
final_classifier = LogisticRegression(
    C=10,
    max_iter=500,
    random_state=42,
    class_weight="balanced",
    solver="liblinear",
)

final_pipeline = Pipeline(steps=[
    ("features", FeatureUnion(transformer_list=[
        ("word_tfidf", final_word_tfidf),
        ("char_tfidf", final_char_tfidf),
    ])),
    ("classifier", final_classifier),
])

# Fit on the complete (unsampled) training frame, then score on validation.
final_pipeline.fit(train["text"], train["label"])
final_preds = final_pipeline.predict(valid["text"])
final_f1 = f1_score(y_true=valid["label"], y_pred=final_preds)
print(f"\n Final Validation F1: {final_f1:.4f}")


 Final Validation F1: 0.7898


Retraining the model on our complete dataset (Train + Validation)

In [12]:
# Hard-coded TF-IDF options: (min_df, max_df, sublinear_tf, lowercase).
best_params = (2, 1.0, True, False)
min_df, max_df, sublinear, lower = best_params

# Stack the training and validation frames into one frame with a fresh index.
full_train = pd.concat([train, valid], ignore_index=True)
print(f"Combined training size: {len(full_train)}")

# Options applied identically to the word-level and char-level vectorizers.
full_tfidf_options = dict(
    min_df=min_df,
    max_df=max_df,
    sublinear_tf=sublinear,
    lowercase=lower,
)

full_word_tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=50000,
    ngram_range=(1, 3),
    **full_tfidf_options,
)
full_char_tfidf = TfidfVectorizer(
    analyzer="char",
    max_features=20000,
    ngram_range=(3, 5),
    **full_tfidf_options,
)
full_classifier = LogisticRegression(
    C=10,
    max_iter=500,
    random_state=42,
    class_weight="balanced",
    solver="liblinear",
)

final_pipeline = Pipeline(steps=[
    ("features", FeatureUnion(transformer_list=[
        ("word_tfidf", full_word_tfidf),
        ("char_tfidf", full_char_tfidf),
    ])),
    ("classifier", full_classifier),
])

# Fit on train + validation; the fitted pipeline is the cell's output.
final_pipeline.fit(full_train["text"], full_train["label"])

Combined training size: 375863


0,1,2
,steps,"[('features', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer_list,"[('word_tfidf', ...), ('char_tfidf', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,
,analyzer,'char'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'liblinear'
,max_iter,500


In [13]:
# Predict labels for the test texts with the pipeline fitted above.
test_preds = final_pipeline.predict(test["text"])

# Two-column frame: the test ids plus the predicted label for each row.
submission = test[["id"]].assign(label=test_preds)

# Write without the index so the file contains only `id` and `label`.
submission.to_csv("submission.csv", index=False)