In [8]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

In [9]:
class UserModel:
    def __init__(self):
        model = Pipeline([
            ("poly", PolynomialFeatures(2)),
            ("std", StandardScaler()),
            ("lr", LogisticRegression()),
        ])
        self.model = model
    def fit(self, train_xcols, train_predictor):
        return self.model.fit(train_xcols, train_predictor)
    
    def predict(self, test_xcols):
        return self.model.predict(test_xcols)

# The Cleveland Heart Disease Dataset
There are 303 items (patients), six have a missing value. There are 13 predictor variables (age, sex, cholesterol, etc.) The variable to predict is encoded as 0 to 4 where 0 means no heart disease and 1-4 means presence of heart disease. See https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data.

In [28]:
df1 = pd.read_csv("data/processed.cleveland.data.csv", header=None)
headers = ["age", "sex", "cholesterol", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "predictor"]
df1.columns = headers
df1 = df1[df1["v13"] != "?"]
df1 = df1[df1["v12"] != "?"]
df1['predictor'] = df1['predictor'].apply(lambda x: 0 if x>=1 else 1)
train, test = train_test_split(df1)
model1 = UserModel()
model1.fit(train[headers[:-1]], train["predictor"])
y_pred = model1.predict(test[headers[:-1]])
accuracy_score(test["predictor"], y_pred)

0.84

Using Binary Classification, the model is able to predict with 84% accuracy of whether a patient has heart disease given the 13 predictor variables.

# The Wisconsin Cancer Dataset
There are 569 items (patients). There is an ID followed by 10 predictors variables (thickness, cell size uniformity, etc.) The variable to predict is encoded as 2 (benign) or 4 (malignant). See https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/

In [47]:
df2 = pd.read_csv("data/breast-cancer-wisconsin.data.csv", header=None)
headers2 = ["id", "thickness", "cell size uniformity", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "predictor"]
df2.columns = headers2
df2 = df2[df2["v7"] != "?"]
train2, test2 = train_test_split(df2)
model2 = UserModel()
model2.fit(train2[headers2[:-1]], train2["predictor"])
y_pred2 = model2.predict(test2[headers2[:-1]])
accuracy_score(y_pred2, test2["predictor"])

0.9532163742690059

Using Binary Classification, the model is able to predict with 95% accuracy whether a patient has a benign or malignant tumor given the 10 predictor variables.

# Haberman's Survival Dataset
There are 306 items (patients). There are three predictor variables (age, year of operation, number nodes). The variable to predict is encoded as 1 (survived) or 2 (died). See https://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival 

In [63]:
df3 = pd.read_csv("data/haberman.data.csv", header=None)
headers3 = ["age", "year of operation", "number of nodes", "predictor"]
df3.columns = headers3
train3, test3 = train_test_split(df3)
model3 = UserModel()
model3.fit(train3[headers3[:-1]], train3["predictor"])
y_pred3 = model3.predict(test3[headers3[:-1]])
accuracy_score(y_pred3, test3["predictor"])

0.7532467532467533

Using Binary Classification, the model is able to predict with 75% accuracy whether a patient survived or died given the patients age, year of operation, and the number of nodes