In [4]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [29]:
# csv containing filename, subject_id_and_knee, is_right, TKR, visit, and all extracted features. Generated by create_feature_extractions script.

path = "feature_extract.csv"
df = pd.read_csv(path)

subject_id_and_knee = lambda row: str(row["ID"]) + ("-R" if row["is_right"] else "-L")
df["subject_id_and_knee"] = df.apply(subject_id_and_knee, axis=1)

print(f"{df.shape[0]} total samples.")

298 total samples.


In [39]:
# Identifying patients with at least n visits
n_visits = 1
visits = df["subject_id_and_knee"].value_counts()
visits_df = df[df["subject_id_and_knee"].isin(visits.index[visits.gt(n_visits - 1)])]
indices = list(list(zip(*visits_df.groupby("subject_id_and_knee")['visit'].nlargest(n_visits).index.values))[1])

In [40]:
# Rows for patients with at least n visits
subjects_df = df.iloc[indices].fillna(0.)

n_uniq = subjects_df["subject_id_and_knee"].unique().shape[0] 
print(f"{n_uniq} samples with at least {n_visits} visits.")

232 samples with at least 1 visits.


In [41]:
# Creating dataset
exclude_columns = {"ID", "is_right", "visit", "filename", "subject_id_and_knee", "TKR"}
include_columns = list(set(df.columns.values) - exclude_columns)

X = []
y = []
for subject_id in subjects_df["subject_id_and_knee"].unique():
    subject_rows = subjects_df[subjects_df["subject_id_and_knee"] == subject_id][include_columns]
    features = np.array([row.values for _, row in subject_rows.iterrows()]).reshape(-1)
    X.append(features)
    
    tkr = subjects_df[subjects_df["subject_id_and_knee"] == subject_id]["TKR"].values[0]
    y.append(int(tkr))
    
X = np.array(X)
y = np.array(y)
print(X.shape)

(232, 38)


In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
print(f"{y_train.shape[0]} training samples and {y_test.shape[0]} test samples.")

174 training samples and 58 test samples.


In [43]:
# To normalize or not?
normalizer = Normalizer().fit(X_train)
X_train_normalized = normalizer.transform(X_train)
X_test_normalized = normalizer.transform(X_test)

In [44]:
reg = LinearRegression().fit(X_train_normalized, y_train)
y_pred = reg.predict(X_test_normalized)
y_pred = (y_pred >= 0.5).astype(int)

In [45]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

eps = 1e-6
accuracy = (tp + tn) / (tn + fp + fn + tp + eps)
precision = tp / (tp + fp + eps)
recall = tp / (tp + fn + eps)
specificity = tn / (tn + fp + eps)

print(f"""
Test metrics for linear regression classifier ({y_test.shape[0]} test samples):
Accuracy:    {accuracy:.4f}
Precision:   {precision:.4f}
Recall:      {recall:.4f}
Specificity: {specificity:.4f}
"""
)


Test metrics for linear regression classifier (58 test samples):
Accuracy:    0.7414
Precision:   0.5455
Recall:      0.3750
Specificity: 0.8810

