This notebook takes a raw sweep 6 cm interview dataset as an input, and outputs a baseline logistic regression model.

In [29]:
import functions
import features
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from datetime import datetime

In [2]:
# import raw data
raw_data = pd.read_csv('raw_data.tab', sep='\t')

In [3]:
# run smfq processing (engineers a y, and removes smfq features)
X, y = functions.add_smfq_label(raw_data)

In [4]:
# drop columns not found in the feature lists

# build list of all features in all categories
full_feature_list = []
for cat in features.features:
    full_feature_list.extend(features.features[cat])

# convert feature names to codes
for feature in full_feature_list:
    test = functions.get_variable_code(feature)
    if test == 'VARIABLE NOT FOUND':
        print(f"missing feature: {feature}")
full_code_list = [functions.get_variable_code(feature) for feature in full_feature_list]

# build list of features that appear in the dataset but not in the categories
features_to_drop = []
for feature in X:
    if feature not in full_code_list:
        features_to_drop.append(feature)

# and remove them
X_dropped = X.drop(columns=features_to_drop)

In [5]:
# process features
X_processed = functions.feature_processor(X_dropped, features.features)



In [6]:
X_processed.shape

(11859, 187)

In [7]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X_processed, y)

In [24]:
# define a baseline model
model_baseline = LogisticRegression(max_iter=1000)

In [25]:
# fit model to data
model_baseline.fit(X_train, y_train)

In [26]:
precision_score(y_test, model_baseline.predict(X_test))

0.7125748502994012

In [27]:
# score baseline model
model_baseline.score(X_test, y_test)

0.8988195615514334

In [31]:
# optimize hyperparameters
grid = GridSearchCV(model_baseline, param_grid={
    "penalty": [None, 'l2', 'l1', 'elasticnet'],
    "C": [10, 1, 0.1, 0.01, 0.001],
    "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]
})

grid.fit(X_train, y_train)

275 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/home/alex/.pyenv/versions/lewagon/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/alex/.pyenv/versions/lewagon/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/alex/.pyenv/versions/lewagon/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1228, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  File "/home/alex/.pyenv/versions/lewago

In [38]:
model_optimised = grid.best_estimator_

In [40]:
f1_score(y_test, model_baseline.predict(X_test))

0.6134020618556701

In [None]:
# save the model
datetime_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
file_name = f"models/model_{datetime_string}.pickle"

with open(file_name, 'wb') as file:
    pickle.dump(model_baseline, file)