In [2]:
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Load the pickled tweet texts and their labels, then pair them column-wise.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
train_texts = pd.read_pickle('train.pkl')
labels = pd.read_pickle('labels.pkl')
df = pd.DataFrame(dict(text=train_texts, label=labels))

# Sanity check: overall size and number of rows per class.
print("Combined DataFrame shape:", df.shape)
print("Class distribution:\n", df['label'].value_counts())

def check_model_metrics(model, test_data, test_targets):
    """Print accuracy, confusion matrix and classification report for a model.

    Args:
        model: Fitted estimator exposing ``predict(test_data)``.
        test_data: Feature matrix handed to ``model.predict``.
        test_targets: True labels the predictions are compared against.

    Returns:
        None. The three report sections are written to stdout.
    """
    predicted = model.predict(test_data)

    # Each section is a heading plus a lazily computed body, so every metric is
    # evaluated only after its heading has been printed.
    report_sections = (
        ("ACCURACY:",
         lambda: metrics.accuracy_score(test_targets, predicted) * 100),
        ("\nCONFUSION MATRIX",
         lambda: confusion_matrix(test_targets, predicted)),
        ("\nCLASSIFICATION REPORT",
         lambda: classification_report(test_targets, predicted)),
    )
    for heading, compute_body in report_sections:
        print(heading)
        print(compute_body())

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in both partitions; the fixed seed makes the split
# repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

print("\nTraining Data Shape:", df_train.shape)
print("Test Data Shape:", df_test.shape)

def train_full_data(df_train, df_test, *, n_features=2**18, max_iter=1000):
    """Train logistic regression on the whole training split and print test metrics.

    The texts are hashed into a sparse matrix with ``HashingVectorizer``, a
    ``LogisticRegression`` is fitted on every training row, the fit duration
    is printed, and ``check_model_metrics`` prints the evaluation on the test
    split.

    Args:
        df_train: DataFrame with a 'text' column and a 'label' column.
        df_test: DataFrame with the same two columns; used for evaluation only.
        n_features: Number of hashed feature columns (default ``2**18``).
        max_iter: Iteration cap passed to ``LogisticRegression`` (default 1000).

    Returns:
        None. All results are printed.
    """
    # HashingVectorizer is stateless, so the same instance encodes both splits
    # into the same feature space.
    vectorizer = HashingVectorizer(n_features=n_features)
    X_train = vectorizer.fit_transform(df_train['text'])
    y_train = df_train['label']

    X_test = vectorizer.transform(df_test['text'])
    y_test = df_test['label']

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    clf = LogisticRegression(max_iter=max_iter, n_jobs=-1)
    clf.fit(X_train, y_train)

    elapsed = time.perf_counter() - start_time
    print("Direct training time (s):", elapsed)

    # Evaluate the model on the held-out split
    check_model_metrics(clf, X_test, y_test)

def train_downsampled_data(df_train, df_test, *, n_samples=100000,
                           n_features=2**8, max_iter=1000, random_state=42):
    """Train logistic regression on a stratified subsample with few hashed features.

    A stratified sample (without replacement) is drawn from ``df_train``,
    hashed with ``HashingVectorizer``, and used to fit a
    ``LogisticRegression``. The fit duration and the metrics on the full test
    split are printed.

    Args:
        df_train: DataFrame with a 'text' column and a 'label' column.
        df_test: DataFrame with the same two columns; used for evaluation only.
        n_samples: Rows to draw from ``df_train`` (default 100000). Capped at
            ``len(df_train)`` so smaller training sets do not raise.
        n_features: Number of hashed feature columns (default ``2**8``).
        max_iter: Iteration cap passed to ``LogisticRegression`` (default 1000).
        random_state: Seed for the subsampling (default 42).

    Returns:
        None. All results are printed.
    """
    # resample(replace=False) raises ValueError when asked for more rows than
    # exist, so never request more than the training split holds.
    sample_size = min(n_samples, len(df_train))
    df_downsampled = resample(
        df_train,
        replace=False,
        n_samples=sample_size,
        stratify=df_train['label'],
        random_state=random_state
    )

    vectorizer = HashingVectorizer(n_features=n_features)

    X_train = vectorizer.fit_transform(df_downsampled['text'])
    y_train = df_downsampled['label']

    X_test = vectorizer.transform(df_test['text'])
    y_test = df_test['label']

    # perf_counter is monotonic, unlike the wall clock returned by time.time().
    start_time = time.perf_counter()
    clf = LogisticRegression(max_iter=max_iter, n_jobs=-1)
    clf.fit(X_train, y_train)

    elapsed = time.perf_counter() - start_time
    print("Downsampled training time (s):", elapsed)

    check_model_metrics(clf, X_test, y_test)

def train_with_coreset(df_train, df_test, *, n_samples=100000, n_features=2**10,
                       n_clusters=1000, batch_size=10000, max_iter=1000,
                       random_state=42):
    """Train logistic regression on k-means representatives of a subsample.

    Steps:
      1. Draw a stratified sample (without replacement) from ``df_train``.
      2. Hash the texts with ``HashingVectorizer``.
      3. Cluster the sample with ``MiniBatchKMeans`` and, for every centroid,
         pick the sampled row closest to it (duplicates removed).
      4. Fit ``LogisticRegression`` on those rows only and print the metrics
         on the full test split.

    The selected rows are unweighted: each representative counts once no
    matter how many rows its cluster contains.

    Args:
        df_train: DataFrame with a 'text' column and a 'label' column.
        df_test: DataFrame with the same two columns; used for evaluation only.
        n_samples: Rows to draw before clustering (default 100000). Capped at
            ``len(df_train)`` so smaller training sets do not raise.
        n_features: Number of hashed feature columns (default ``2**10``).
        n_clusters: Number of k-means clusters (default 1000).
        batch_size: Mini-batch size for ``MiniBatchKMeans`` (default 10000).
        max_iter: Iteration cap passed to ``LogisticRegression`` (default 1000).
        random_state: Seed for subsampling and clustering (default 42).

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If the selected rows contain fewer than two classes.
    """
    # resample(replace=False) raises ValueError when asked for more rows than
    # exist, so never request more than the training split holds.
    sample_size = min(n_samples, len(df_train))
    df_sampled = resample(
        df_train,
        replace=False,
        n_samples=sample_size,
        stratify=df_train['label'],
        random_state=random_state
    )

    vectorizer = HashingVectorizer(n_features=n_features)
    X_sampled = vectorizer.fit_transform(df_sampled['text'])
    y_sampled = df_sampled['label']

    X_test = vectorizer.transform(df_test['text'])
    y_test = df_test['label']

    print(f"Constructing coreset with {n_clusters} clusters...")
    # perf_counter is monotonic, unlike the wall clock returned by time.time().
    start_time = time.perf_counter()

    kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size,
                             random_state=random_state)
    kmeans.fit(X_sampled)

    # For each centroid, the position of the nearest sampled row; two centroids
    # may share a nearest row, hence the de-duplication.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X_sampled)
    coreset_indices = np.unique(closest)

    # The indices are row positions in X_sampled, so labels are picked with
    # .iloc (positional) rather than by index label.
    X_coreset = X_sampled[coreset_indices]
    y_coreset = y_sampled.iloc[coreset_indices].reset_index(drop=True)

    coreset_construction_time = time.perf_counter() - start_time
    n_classes = y_coreset.nunique()
    print(f"Coreset construction time (s): {coreset_construction_time}")
    print(f"Coreset size: {X_coreset.shape[0]}")
    print(f"Unique classes in coreset: {n_classes}")

    if n_classes < 2:
        raise ValueError("Coreset contains only one class. Adjust the number of clusters or check data diversity.")

    start_time = time.perf_counter()
    clf = LogisticRegression(max_iter=max_iter, n_jobs=-1)
    clf.fit(X_coreset, y_coreset)
    training_time = time.perf_counter() - start_time
    print(f"Coreset training time (s): {training_time}")

    check_model_metrics(clf, X_test, y_test)

# Run the three experiments back to back on the same train/test split,
# announcing each one before it starts.
for banner, run_experiment in (
    ("\nExperiment 1: Direct HashingVectorizer", train_full_data),
    ("\nExperiment 2: Stratified Downsampling and Reduced Features", train_downsampled_data),
    ("\nExperiment 3: Proper Coreset Construction", train_with_coreset),
):
    print(banner)
    run_experiment(df_train, df_test)


Combined DataFrame shape: (1517041, 2)
Class distribution:
 label
0    763629
4    753412
Name: count, dtype: int64

Training Data Shape: (1213632, 2)
Test Data Shape: (303409, 2)

Experiment 1: Direct HashingVectorizer
Direct training time (s): 23.444284915924072
ACCURACY:
77.68292964282536

CONFUSION MATRIX
[[116102  36624]
 [ 31088 119595]]

CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.79      0.76      0.77    152726
           4       0.77      0.79      0.78    150683

    accuracy                           0.78    303409
   macro avg       0.78      0.78      0.78    303409
weighted avg       0.78      0.78      0.78    303409


Experiment 2: Stratified Downsampling and Reduced Features
Downsampled training time (s): 0.9366509914398193
ACCURACY:
64.94303069454104

CONFUSION MATRIX
[[98398 54328]
 [52038 98645]]

CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.65      0.64      

In [6]:
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.cluster import MiniBatchKMeans
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Read the pickled texts and labels and combine them into one frame.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
train_texts = pd.read_pickle('train.pkl')
labels = pd.read_pickle('labels.pkl')
df = pd.DataFrame(
    {
        'text': train_texts,
        'label': labels,
    }
)

# Report the overall size and how many rows each class has.
print("Combined DataFrame shape:", df.shape)
print("Class distribution:\n", df['label'].value_counts())

def check_model_metrics(model, test_data, test_targets):
    """Report how well ``model`` predicts ``test_targets`` from ``test_data``.

    Prints, in order: the accuracy as a percentage, the confusion matrix and
    the per-class classification report.

    Args:
        model: Fitted estimator exposing ``predict(test_data)``.
        test_data: Feature matrix handed to ``model.predict``.
        test_targets: True labels the predictions are compared against.

    Returns:
        None. Everything is written to stdout.
    """
    predictions = model.predict(test_data)

    print("ACCURACY:")
    accuracy_percent = metrics.accuracy_score(test_targets, predictions) * 100
    print(accuracy_percent)

    print("\nCONFUSION MATRIX")
    confusion = confusion_matrix(test_targets, predictions)
    print(confusion)

    print("\nCLASSIFICATION REPORT")
    per_class_report = classification_report(test_targets, predictions)
    print(per_class_report)

# Stratified 80/20 split: both partitions keep the label ratio of the full
# frame, and the fixed seed makes the partition repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

print("\nTraining Data Shape:", df_train.shape)
print("Test Data Shape:", df_test.shape)

def train_full_data(df_train, df_test, *, n_features=2**18, max_iter=1000):
    """Train logistic regression on the whole training split and print test metrics.

    The texts are hashed into a sparse matrix with ``HashingVectorizer``, the
    matrix shape is printed, a ``LogisticRegression`` is fitted on every
    training row, the fit duration is printed, and ``check_model_metrics``
    prints the evaluation on the test split.

    Args:
        df_train: DataFrame with a 'text' column and a 'label' column.
        df_test: DataFrame with the same two columns; used for evaluation only.
        n_features: Number of hashed feature columns (default ``2**18``).
        max_iter: Iteration cap passed to ``LogisticRegression`` (default 1000).

    Returns:
        None. All results are printed.
    """
    # HashingVectorizer is stateless, so the same instance encodes both splits
    # into the same feature space.
    vectorizer = HashingVectorizer(n_features=n_features)
    X_train = vectorizer.fit_transform(df_train['text'])
    y_train = df_train['label']
    X_test = vectorizer.transform(df_test['text'])
    y_test = df_test['label']

    print("\nExperiment 1: Direct HashingVectorizer")
    print("Training data shape:", X_train.shape)
    print("Number of features:", X_train.shape[1])

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    clf = LogisticRegression(max_iter=max_iter, n_jobs=-1)
    clf.fit(X_train, y_train)
    elapsed = time.perf_counter() - start_time
    print("Direct training time (s):", elapsed)

    check_model_metrics(clf, X_test, y_test)

def train_downsampled_data(df_train, df_test, *, n_samples=100000,
                           n_features=2**10, max_iter=1000, random_state=42):
    """Train logistic regression on a stratified subsample with few hashed features.

    A stratified sample (without replacement) is drawn from ``df_train``,
    hashed with ``HashingVectorizer``, and used to fit a
    ``LogisticRegression``. The matrix shape, the fit duration and the metrics
    on the full test split are printed.

    Args:
        df_train: DataFrame with a 'text' column and a 'label' column.
        df_test: DataFrame with the same two columns; used for evaluation only.
        n_samples: Rows to draw from ``df_train`` (default 100000). Capped at
            ``len(df_train)`` so smaller training sets do not raise.
        n_features: Number of hashed feature columns (default ``2**10``).
        max_iter: Iteration cap passed to ``LogisticRegression`` (default 1000).
        random_state: Seed for the subsampling (default 42).

    Returns:
        None. All results are printed.
    """
    # resample(replace=False) raises ValueError when asked for more rows than
    # exist, so never request more than the training split holds.
    sample_size = min(n_samples, len(df_train))
    df_downsampled = resample(
        df_train,
        replace=False,
        n_samples=sample_size,
        stratify=df_train['label'],
        random_state=random_state,
    )

    vectorizer = HashingVectorizer(n_features=n_features)
    X_train = vectorizer.fit_transform(df_downsampled['text'])
    y_train = df_downsampled['label']
    X_test = vectorizer.transform(df_test['text'])
    y_test = df_test['label']

    print("\nExperiment 2: Stratified Downsampling and Reduced Features")
    print("Downsampled training data shape:", X_train.shape)
    print("Number of features:", X_train.shape[1])

    # perf_counter is monotonic, unlike the wall clock returned by time.time().
    start_time = time.perf_counter()
    clf = LogisticRegression(max_iter=max_iter, n_jobs=-1)
    clf.fit(X_train, y_train)
    elapsed = time.perf_counter() - start_time
    print("Downsampled training time (s):", elapsed)

    check_model_metrics(clf, X_test, y_test)

def train_with_coreset(df_train, df_test, *, n_samples=100000, n_features=2**10,
                       n_clusters=1000, batch_size=10000, max_iter=1000,
                       random_state=42):
    """Train logistic regression on k-means representatives of a subsample.

    Steps:
      1. Draw a stratified sample (without replacement) from ``df_train``.
      2. Hash the texts with ``HashingVectorizer``.
      3. Cluster the sample with ``MiniBatchKMeans`` and, for every centroid,
         pick the sampled row closest to it (duplicates removed).
      4. Fit ``LogisticRegression`` on those rows only and print the metrics
         on the full test split.

    The selected rows are unweighted: each representative counts once no
    matter how many rows its cluster contains.

    Args:
        df_train: DataFrame with a 'text' column and a 'label' column.
        df_test: DataFrame with the same two columns; used for evaluation only.
        n_samples: Rows to draw before clustering (default 100000). Capped at
            ``len(df_train)`` so smaller training sets do not raise.
        n_features: Number of hashed feature columns (default ``2**10``).
        n_clusters: Number of k-means clusters (default 1000).
        batch_size: Mini-batch size for ``MiniBatchKMeans`` (default 10000).
        max_iter: Iteration cap passed to ``LogisticRegression`` (default 1000).
        random_state: Seed for subsampling and clustering (default 42).

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If the selected rows contain fewer than two classes.
    """
    # resample(replace=False) raises ValueError when asked for more rows than
    # exist, so never request more than the training split holds.
    sample_size = min(n_samples, len(df_train))
    df_sampled = resample(
        df_train,
        replace=False,
        n_samples=sample_size,
        stratify=df_train['label'],
        random_state=random_state,
    )

    vectorizer = HashingVectorizer(n_features=n_features)
    X_sampled = vectorizer.fit_transform(df_sampled['text'])
    y_sampled = df_sampled['label']
    X_test = vectorizer.transform(df_test['text'])
    y_test = df_test['label']

    print("\nExperiment 3: Proper Coreset Construction")
    print(f"Constructing coreset with {n_clusters} clusters...")
    print("Sampled data shape before coreset construction:", X_sampled.shape)
    print("Number of features:", X_sampled.shape[1])

    # perf_counter is monotonic, unlike the wall clock returned by time.time().
    start_time = time.perf_counter()
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size,
                             random_state=random_state)
    kmeans.fit(X_sampled)

    # For each centroid, the position of the nearest sampled row; two centroids
    # may share a nearest row, hence the de-duplication.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X_sampled)
    coreset_indices = np.unique(closest)

    # The indices are row positions in X_sampled, so labels are picked with
    # .iloc (positional) rather than by index label.
    X_coreset = X_sampled[coreset_indices]
    y_coreset = y_sampled.iloc[coreset_indices].reset_index(drop=True)

    # Stop the clock before any printing so console I/O is not counted as
    # construction time.
    coreset_construction_time = time.perf_counter() - start_time
    n_classes = y_coreset.nunique()

    print("Coreset data shape:", X_coreset.shape)
    print("Number of features:", X_coreset.shape[1])
    print(f"Coreset construction time (s): {coreset_construction_time}")
    print(f"Coreset size: {X_coreset.shape[0]}")
    print(f"Unique classes in coreset: {n_classes}")

    if n_classes < 2:
        raise ValueError("Coreset contains only one class. Adjust the number of clusters or check data diversity.")

    start_time = time.perf_counter()
    clf = LogisticRegression(max_iter=max_iter, n_jobs=-1)
    clf.fit(X_coreset, y_coreset)
    training_time = time.perf_counter() - start_time
    print(f"Coreset training time (s): {training_time}")

    check_model_metrics(clf, X_test, y_test)

# Run every experiment, in order, on the same train/test split.
for run_experiment in (train_full_data, train_downsampled_data, train_with_coreset):
    run_experiment(df_train, df_test)


Combined DataFrame shape: (1517041, 2)
Class distribution:
 label
0    763629
4    753412
Name: count, dtype: int64

Training Data Shape: (1213632, 2)
Test Data Shape: (303409, 2)

Experiment 1: Direct HashingVectorizer
Training data shape: (1213632, 262144)
Number of features: 262144
Direct training time (s): 27.637704610824585
ACCURACY:
77.68292964282536

CONFUSION MATRIX
[[116102  36624]
 [ 31088 119595]]

CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.79      0.76      0.77    152726
           4       0.77      0.79      0.78    150683

    accuracy                           0.78    303409
   macro avg       0.78      0.78      0.78    303409
weighted avg       0.78      0.78      0.78    303409


Experiment 2: Stratified Downsampling and Reduced Features
Downsampled training data shape: (100000, 1024)
Number of features: 1024
Downsampled training time (s): 1.272170066833496
ACCURACY:
69.97748913183194

CONFUSION MATRIX
[[105902  