This notebook takes a raw sweep 6 cm interview dataset as input and outputs a baseline logistic regression model, saved as a timestamped pickle file under `models/`.

In [8]:
import pickle
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV

import features
import functions

In [2]:
# import raw data
# Load the raw dataset from the tab-separated file 'raw_data.tab'. The path is
# relative, so it is resolved against the kernel's current working directory.
raw_data = pd.read_csv('raw_data.tab', sep='\t')

In [3]:
# run smfq processing (engineers a y, and removes smfq features)
# add_smfq_label lives in the local `functions` module (not shown in this notebook);
# it returns a pair that is unpacked into the feature table X and the target y.
X, y = functions.add_smfq_label(raw_data)

In [4]:
# drop columns not found in the feature lists

# build list of all features in all categories
# features.features maps each category to a list of feature names; flatten it
# into one list, preserving category order and within-category order.
full_feature_list = [
    feature_name
    for category in features.features
    for feature_name in features.features[category]
]

# convert feature names to codes
# Each feature is looked up exactly once. Names the lookup cannot resolve are
# reported, and the 'VARIABLE NOT FOUND' sentinel is still appended so that
# full_code_list stays index-aligned with full_feature_list.
full_code_list = []
for feature_name in full_feature_list:
    code = functions.get_variable_code(feature_name)
    if code == 'VARIABLE NOT FOUND':
        print(f"missing feature: {feature_name}")
    full_code_list.append(code)

# build list of features that appear in the dataset but not in the categories
# A set gives constant-time membership checks instead of scanning the list of
# codes once per dataset column.
known_codes = set(full_code_list)
features_to_drop = [column for column in X if column not in known_codes]

# and remove them
# drop() returns a new frame, so X itself is left unmodified.
X_dropped = X.drop(columns=features_to_drop)

In [22]:
# create datasets with variable names instead of codes
# feature_mapper = pd.read_csv('dict_csv.csv')

# variable_names = []
# for col in X:
#     try:
#         variable_name = feature_mapper[feature_mapper['Variable name'] == col].iloc[0]['Variable label']
#     except:
#         variable_name = col

#     variable_names.append(variable_name)

# X_with_names = X.copy()
# X_with_names.columns = variable_names

In [24]:
# process features
# Hand the reduced feature table, together with the category -> feature-name
# mapping, to functions.feature_processor (local module; the transformations it
# applies are not visible in this notebook). The result is the matrix that is
# split and used to train the model below.
X_processed = functions.feature_processor(X_dropped, features.features)



In [41]:
# X_processed_with_names = X_processed.copy()
# variable_names = []
# variables_encountered = []
# for col in X_processed:
#     try:
#         variable_name = feature_mapper[feature_mapper['Variable name'] == col.split('_')[0]].iloc[0]['Variable label'].strip()
#         # increment = sum([1 for var in variables_encountered if var == col.split('_')[0]])
#         try:
#             increment = col.split('_')[1]
#             variable_name += f"_{increment}"
#         except:
#             pass
#         variables_encountered.append(col.split('_')[0])
#     except:
#         variable_name = col

#     variable_names.append(variable_name)

# X_processed_with_names.columns = variable_names

In [43]:
# exporting datasets
# with open('datasets/y', 'wb') as file:
#     pickle.dump(np.array(y), file)

# with open('datasets/X_processed_codes', 'wb') as file:
#     pickle.dump(X_dropped, file)

# with open('datasets/X_unprocessed_codes', 'wb') as file:
#     pickle.dump(X, file)

# with open('datasets/X_unprocessed_names', 'wb') as file:
#     pickle.dump(X_with_names, file)

# with open('datasets/X_processed_names', 'wb') as file:
#     pickle.dump(X_processed_with_names, file)

In [23]:
# split data
# Fix the shuffle seed so the train/test partition, and therefore the fitted
# model and any metric computed on the test set, is identical on every
# Restart & Run All. The default test fraction is kept.
# NOTE(review): if the label classes are imbalanced, consider adding stratify=y — TODO confirm.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, random_state=RANDOM_STATE
)

In [24]:
# define a baseline model
# Every LogisticRegression setting is left at the library default except max_iter,
# which is set to 1000 here (presumably so the solver converges on this data —
# TODO confirm). The estimator is only constructed in this cell, not yet fitted.
model_baseline = LogisticRegression(max_iter=1000)

In [25]:
# fit model to data
# Train on the training partition only; X_test / y_test are not touched here.
# fit() is the cell's last expression, so the notebook displays its return value.
model_baseline.fit(X_train, y_train)

In [26]:
# precision_score(y_test, model_baseline.predict(X_test))

In [27]:
# score baseline model
# model_baseline.score(X_test, y_test)

In [28]:
# optimize hyperparameters
# grid = GridSearchCV(model_baseline, param_grid={
#     "penalty": [None, 'l2', 'l1', 'elasticnet'],
#     "C": [10, 1, 0.1, 0.01, 0.001],
#     "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"]
# })

# grid.fit(X_train, y_train)

In [29]:
# model_optimised = grid.best_estimator_

In [30]:
# f1_score(y_test, model_baseline.predict(X_test))

In [31]:
# from sklearn.dummy import DummyClassifier

In [32]:
# dummy = DummyClassifier()
# dummy.fit(X_train, y_train)
# dummy.score(X_test, y_test)

In [33]:
# save the model
# Timestamp the file name (to the second) so repeated runs do not overwrite
# a previously saved model.
datetime_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
file_name = f"models/model_{datetime_string}.pickle"

# Create the output directory if it is missing; without this, open() raises
# FileNotFoundError on a fresh checkout that has no models/ folder.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)

# Serialise the fitted estimator. Pickle files should only ever be loaded
# back from trusted sources.
with open(file_name, 'wb') as file:
    pickle.dump(model_baseline, file)