In [1]:
import numpy as np

# Read the training feature matrix and its label vector from the .npy files
X_train, y_train = np.load('X_train.npy'), np.load('y_train.npy')

# Show both array shapes, one per line
print(X_train.shape, y_train.shape, sep='\n')

(814183, 4142)
(814183,)


In [2]:
# Binarize the training labels: values below 1 become 0, everything else becomes 1.
# np.where applies the rule in a single vectorized pass instead of a Python-level loop.
y_train = np.where(y_train < 1, 0, 1)

In [3]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# Read the test feature matrix and its label vector from the .npy files
X_test = np.load('X_test.npy')
y_test = np.load('y_test.npy')

# Binarize the test labels with the same rule as the training labels:
# values below 1 become 0, everything else becomes 1 (vectorized, no Python loop).
y_test = np.where(y_test < 1, 0, 1)

print(X_test.shape)
print(y_test.shape)

(207165, 4142)
(207165,)


In [4]:
def score_model(mdl, X_test: np.ndarray, y_test: np.ndarray) -> tuple[float, float, float, float]:
    '''Score a fitted model on a labelled data set and print the results.

    Parameters
    ----------
    mdl
        Any object exposing ``predict(X)``; it is only used to get predictions.
    X_test : np.ndarray
        Features handed to ``mdl.predict``.
    y_test : np.ndarray
        True labels the predictions are compared against.

    Returns
    -------
    tuple[float, float, float, float]
        ``(accuracy, precision, recall, f1)``, in that order.
    '''
    
    # Get predictions
    preds = mdl.predict(X_test)
    
    # Get the scores (each metric is called with its scikit-learn defaults)
    acc = accuracy_score(y_test, preds)
    pre = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    
    # Print the scores as percentages with two decimals
    print(f'Accuracy:  {acc:.2%}')
    print(f'Precision: {pre:.2%}')
    print(f'Recall:    {rec:.2%}')
    print(f'f1:        {f1:.2%}')
    
    return (acc, pre, rec, f1)

class MostFrequenctClassifer:
    '''Baseline classifier that always predicts the most common training label.

    The class name (including its spelling) is kept unchanged because other
    cells of the notebook instantiate it under this name.
    '''

    def __init__(self):
        pass
    
    def fit(self, X_train: np.ndarray, y_train: np.ndarray):
        '''Remember the label that occurs most often in ``y_train``.

        ``X_train`` is accepted so the call looks like any other estimator's
        ``fit`` but it is not used. When several labels are tied for the
        highest count, the smallest of them is kept.

        Raises
        ------
        ValueError
            If ``y_train`` contains no labels.
        '''
        
        # One pass: sorted distinct labels plus how often each one occurs
        labels, counts = np.unique(y_train, return_counts=True)
        if labels.size == 0:
            raise ValueError('y_train must contain at least one label')
        
        # Store the label with the highest count (argmax returns the first
        # maximum), not simply the last label that was looked at.
        self.most_frequent = labels[np.argmax(counts)]
        

    def predict(self, X_test: np.ndarray) -> np.ndarray:
        '''Return the stored most frequent label once per row of ``X_test``.'''
        
        return np.full(len(X_test), self.most_frequent)
    
class RandomClassifier:
    '''Baseline classifier that guesses uniformly among the training labels.

    Parameters
    ----------
    random_state : int or None, default None
        Seed for a private random generator. With ``None`` the predictions are
        drawn from NumPy's global random state (``np.random.choice``), which
        is the behaviour this class had before the parameter existed.
    '''

    def __init__(self, random_state=None):
        self.random_state = random_state
        # A private generator is only created when a seed is requested, so the
        # default path keeps using (and responding to) np.random.seed.
        self._rng = None if random_state is None else np.random.default_rng(random_state)
    
    def fit(self, X_train: np.ndarray, y_train: np.ndarray):
        '''Store the distinct labels found in ``y_train``; ``X_train`` is not used.'''
        
        self.values = np.unique(y_train)
    
    def predict(self, X_test: np.ndarray) -> np.ndarray:
        '''Draw one label per row of ``X_test``, each distinct label equally likely.'''
        
        draw = np.random.choice if self._rng is None else self._rng.choice
        return draw(self.values, len(X_test))


In [5]:
# Results table for the notebook: model name -> (accuracy, precision, recall, f1)
scores = {}

# Most-frequent-label baseline
clf = MostFrequenctClassifer()
clf.fit(X_train, y_train)
scores['Most Frequency'] = score_model(clf, X_test, y_test)

Accuracy:  29.04%
Precision: 29.04%
Recall:    100.00%
f1:        45.01%


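This is interesting: the baseline predicts 1 for every test sample (recall is 100%), yet only about 29% of the test labels are 1. If 1 really is the majority class in the training data, then the class balance differs sharply between the two sets: the training data has far more 1's than 0's, while the test data has far more 0's than 1's. It is worth confirming the training class counts directly (e.g. with `np.bincount(y_train)`).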

In [6]:
# Random-guess baseline.
# Seed NumPy's global random state first so the random predictions, and hence
# the printed scores, are the same on every run (42 matches the seed passed to
# SGDClassifier in this notebook).
np.random.seed(42)
clf = RandomClassifier()
clf.fit(X_train, y_train)
scores['Random'] = score_model(clf, X_test, y_test)

Accuracy:  50.10%
Precision: 29.12%
Recall:    50.10%
f1:        36.83%


An accuracy of about 50% is exactly what is expected from guessing uniformly at random between two classes.

In [7]:
from sklearn.linear_model import SGDClassifier

# SGD-trained classifier on the full feature matrix; the fixed seed keeps runs repeatable
clf = SGDClassifier(random_state=42).fit(X_train, y_train)
scores['SGDClassifier'] = score_model(clf, X_test, y_test)

Accuracy:  76.94%
Precision: 66.50%
Recall:    41.51%
f1:        51.11%


In [None]:
from sklearn.svm import SVC

# Support vector classifier with scikit-learn's default settings.
# NOTE(review): this cell has no execution count or recorded output, so it
# apparently never finished on the ~814k-row training set - confirm whether it
# is feasible here or should be run on a subsample.
clf = SVC().fit(X_train, y_train)
scores['SVC'] = score_model(clf, X_test, y_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier.
# The result is stored under a k-specific key so that KNN runs with different
# n_neighbors values do not overwrite each other in `scores`.
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
scores['KNN (k=3)'] = score_model(clf, X_test, y_test)

In [8]:
from sklearn.decomposition import PCA

# Build PCA model: keep the first two principal components, fitted on the
# training data only. random_state is pinned (same seed as the classifier
# below) so the projection is repeatable if PCA uses a randomized solver.
pca = PCA(n_components=2, random_state=42).fit(X_train)

# Transform X data: project both splits with the components learned above
X_train_reduced = pca.transform(X_train)
X_test_reduced = pca.transform(X_test)

# Create classifier and score it on the reduced test features.
# NOTE(review): the recorded output shows 0% precision and recall, i.e. no
# positive predictions at all - two components may be too few; worth checking.
clf = SGDClassifier(random_state=42)
clf.fit(X_train_reduced, y_train)
scores['SGDClassifier PCA'] = score_model(clf, X_test_reduced, y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy:  70.96%
Precision: 0.00%
Recall:    0.00%
f1:        0.00%


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier.
# The result is stored under a k-specific key so that KNN runs with different
# n_neighbors values do not overwrite each other in `scores`.
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)
scores['KNN (k=5)'] = score_model(clf, X_test, y_test)

Python(8101) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
