This notebook takes a raw sweep 6 cm interview dataset as input and outputs a baseline logistic regression model.

In [19]:
import pickle
from datetime import datetime
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import features
import functions

In [2]:
# Load the raw dataset from the tab-separated file 'raw_data.tab'.
raw_data = pd.read_csv(
    filepath_or_buffer='raw_data.tab',
    sep='\t',
)

In [3]:
# SMFQ processing: the helper returns a (features, label) pair -- it
# engineers the target y and removes the smfq features from X.
features_and_label = functions.add_smfq_label(raw_data)
X, y = features_and_label

In [4]:
# Process the features in X, guided by the feature definitions declared
# in the features module.
feature_categories = features.features
X_processed = functions.feature_processor(X, feature_categories)

In [5]:
# drop columns not found in the feature lists

# flatten the per-category feature lists into one list of feature names
full_feature_list = [
    feature_name
    for category in features.features
    for feature_name in features.features[category]
]

# convert feature names to the variable codes used as column labels
full_code_list = [
    functions.get_variable_code(feature_name)
    for feature_name in full_feature_list
]

# columns that appear in the dataset but in none of the categories;
# a set gives O(1) membership tests instead of scanning the list per column
known_codes = set(full_code_list)
features_to_drop = [
    column for column in X_processed if column not in known_codes
]

# and remove them
X_processed_dropped = X_processed.drop(columns=features_to_drop)

In [11]:
# split data into train and test sets
# A fixed random_state makes the shuffle deterministic, so the split -- and
# therefore the fitted model and its test score -- is reproducible across
# notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed_dropped,
    y,
    random_state=42,
)

In [13]:
# define a baseline model: a logistic regression constructed with no
# arguments, i.e. using the library's default hyperparameters
model_baseline = LogisticRegression()

In [None]:
# Train the baseline model on the training split.
model_baseline.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the fitted baseline model on the held-out test split.
model_baseline.score(X=X_test, y=y_test)

In [None]:
# optimize hyperparameters
# TODO: hyperparameter optimization is still to be completed (TBC)

In [21]:
# save the model
# Make sure the output directory exists first: open() in 'wb' mode does not
# create missing parent directories and would raise FileNotFoundError.
Path("models").mkdir(parents=True, exist_ok=True)

# timestamp the file name so each run is written to its own file
datetime_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
file_name = f"models/model_{datetime_string}.pickle"

# serialize the fitted baseline model to disk
with open(file_name, 'wb') as file:
    pickle.dump(model_baseline, file)