In [182]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pylab as pl
import os

# Seed NumPy's legacy global RNG so anything drawing from np.random is repeatable.
np.random.seed(1)

In [184]:
def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into train/test feature frames and matching label frames.

    The input frame is never modified; all clean-up happens on copies.

    Parameters
    ----------
    df : pandas.DataFrame
        Full table containing the features, the target column and
        (optionally) an identifier column.
    target : str
        Name of the label column.
    id_name : str, optional
        Name of an existing identifier column. When left empty, the frame's
        index is turned into a new ``"id"`` column and used instead.
    null_name : scalar, optional
        Placeholder that marks missing cells (for example ``"?"``). Every cell
        equal to it becomes NaN. An empty string disables the replacement.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Feature frames with the identifier and target columns removed.
    y_train, y_test : pandas.DataFrame
        Two-column frames holding ``[id_name, target]`` for the same rows.
    """
    if id_name == "":
        # Name the index "id" before resetting it so the generated column is
        # called "id" even when the index already has a name or the frame
        # already owns a column called "index".
        df = df.rename_axis("id").reset_index()
        id_name = "id"

    if null_name != "":
        # mask() returns a new frame, so the caller's DataFrame keeps its
        # original placeholder values.
        df = df.mask(df == null_name)

    # Fixed 80/20 shuffled split; random_state pins the row assignment.
    X_train, X_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=2021)

    label_columns = [id_name, target]
    y_train = X_train[label_columns]
    X_train = X_train.drop(columns=label_columns)
    y_test = X_test[label_columns]
    X_test = X_test.drop(columns=label_columns)
    return X_train, X_test, y_train, y_test

In [185]:
# Read the cervical cancer risk-factor table into a DataFrame.
# NOTE(review): dataset_dir is a machine-specific absolute path; point it at
# your own copy of the CSV before running on another machine.
dataset_dir = r'F:\Google Drive\umich\eecs545_machine_learning\final_project'
csv_path = os.path.join(dataset_dir, 'risk_factors_cervical_cancer.csv')
df = pd.read_csv(csv_path)
print('loaded dataset')

loaded dataset


In [186]:
# Show the freshly loaded frame as this cell's output.
df

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,34,3.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
854,32,2.0,19.0,1.0,0.0,0.0,0.0,1.0,8.0,0.0,...,?,?,0,0,0,0,0,0,0,0
855,25,2.0,17.0,0.0,0.0,0.0,0.0,1.0,0.08,0.0,...,?,?,0,0,0,0,0,0,1,0
856,33,2.0,24.0,2.0,0.0,0.0,0.0,1.0,0.08,0.0,...,?,?,0,0,0,0,0,0,0,0


In [138]:
# Separate the target from the features and coerce every feature to a number;
# cells that cannot be parsed as numbers become NaN.
print(df.columns)
y = df['Dx:Cancer'].values

# The target, the other Dx*/screening-result columns and the two
# "time since ... diagnosis" columns are left out of the feature matrix.
dropped_columns = [
    'Dx:Cancer',
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
    'Dx:CIN',
    'Dx:HPV',
    'Dx',
    'Hinselmann',
    'Schiller',
    'Citology',
    'Biopsy',
]
x_df = df.drop(columns=dropped_columns)
x_df = x_df.apply(pd.to_numeric, errors='coerce')

# astype() hands back a new array, so x can be edited later without touching x_df.
x = x_df.values.astype(np.float64)
tmp = df.columns.values
x

Index(['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 'IUD',
       'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis',
       'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
       'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
       'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
       'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV',
       'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis',
       'Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller',
       'Citology', 'Biopsy'],
      dtype='object')


array([[18.,  4., 15., ...,  0.,  0.,  0.],
       [15.,  1., 14., ...,  0.,  0.,  0.],
       [34.,  1., nan, ...,  0.,  0.,  0.],
       ...,
       [25.,  2., 17., ...,  0.,  0.,  0.],
       [33.,  2., 24., ...,  0.,  0.,  0.],
       [29.,  2., 20., ...,  0.,  0.,  0.]])

In [130]:
# Look up the positional index of the 'STDs:HPV' feature among x_df's columns.
col_names = x_df.columns.values
np.where(col_names == 'STDs:HPV')

(array([24], dtype=int64),)

In [139]:
# Fill missing (NaN) entries: numeric columns get the column mean (rounded to a
# whole number for integer-valued columns), boolean columns get the column mode.

def replace_column_mean(data, column_idx, is_int=True):
    """Overwrite the NaN entries of one column with that column's mean.

    Parameters
    ----------
    data : numpy.ndarray
        2-D float array. It is modified in place.
    column_idx : int
        Position of the column to fill.
    is_int : bool, optional
        When True the mean is rounded to the nearest whole number first, so
        integer-valued columns stay integer-valued.

    Returns
    -------
    numpy.ndarray
        The same ``data`` object, returned for call-site convenience.

    Notes
    -----
    A column with no NaN, or with nothing but NaN, is left untouched. The
    all-NaN case has no mean to fill with, and rounding NaN to an int would
    raise.
    """
    column = data[:, column_idx]
    missing = np.isnan(column)
    if not missing.any() or missing.all():
        return data

    col_mean = np.nanmean(column)
    if is_int:
        col_mean = int(np.round(col_mean))

    # A boolean row mask selects exactly the missing cells of this column.
    data[missing, column_idx] = col_mean
    return data

def replace_column_mode(data, column_idx):
    """Overwrite the NaN entries of one column with that column's mode.

    The mode is computed with NumPy alone, so the function needs no import
    beyond ``np`` and gives the same result on every SciPy version.

    Parameters
    ----------
    data : numpy.ndarray
        2-D float array. It is modified in place.
    column_idx : int
        Position of the column to fill.

    Returns
    -------
    numpy.ndarray
        The same ``data`` object, returned for call-site convenience.

    Notes
    -----
    When several values tie for most frequent, the smallest one is used.
    A column with no NaN, or with nothing but NaN, is left untouched.
    """
    column = data[:, column_idx]
    missing = np.isnan(column)
    if not missing.any() or missing.all():
        return data

    # np.unique returns sorted values, so argmax picks the smallest value
    # among equally frequent candidates.
    values, counts = np.unique(column[~missing], return_counts=True)
    data[missing, column_idx] = values[np.argmax(counts)]
    return data
    
# Column positions in x_df order, grouped by how their gaps are filled.
# Whole-number columns: age, partner/intercourse/pregnancy counts, STD counts.
replace_int = [0, 1, 2, 3, 12, 25]
# Fractional duration / dosage columns ("... (years)", "(packs/year)").
replace_float = [5, 6, 8, 10]
# 0/1 indicator columns.
replace_bool = [4, 7, 9, 11] + list(range(13, 25))

# Rounded mean for the whole-number columns.
for each in replace_int:
    x = replace_column_mean(x, each, is_int=True)

# Unrounded mean for the fractional columns.
for each in replace_float:
    x = replace_column_mean(x, each, is_int=False)

# Most frequent value for the indicator columns.
for each in replace_bool:
    x = replace_column_mode(x, each)

In [171]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
from sklearn.model_selection import train_test_split

data_train, data_test, labels_train, labels_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [175]:
# Train the xgboost2 classifier on the training split and report how long the fit takes.
# `time` is imported here because no other cell imports it; without this the
# timing calls raise NameError on a fresh kernel.
import time

import xgboost2
import sklearn

model = xgboost2.XGBoostClassifier()
start = time.time()
model.fit(data_train, labels_train, min_num_leaf=5, boosting_rounds=50, max_depth=10, lr=0.8, reg=1.2)
end = time.time()
print('our model time: {}'.format(end-start))

boosting round 0
boosting round 1
boosting round 2
boosting round 3
boosting round 4
boosting round 5
boosting round 6
boosting round 7
boosting round 8
boosting round 9
boosting round 10
boosting round 11
boosting round 12
boosting round 13
boosting round 14
boosting round 15
boosting round 16
boosting round 17
boosting round 18
boosting round 19
boosting round 20
boosting round 21
boosting round 22
boosting round 23
boosting round 24
boosting round 25
boosting round 26
boosting round 27
boosting round 28
boosting round 29
boosting round 30
boosting round 31
boosting round 32
boosting round 33
boosting round 34
boosting round 35
boosting round 36
boosting round 37
boosting round 38
boosting round 39
boosting round 40
boosting round 41
boosting round 42
boosting round 43
boosting round 44
boosting round 45
boosting round 46
boosting round 47
boosting round 48
boosting round 49
our model time: 63.51827526092529


In [179]:
# Score the held-out split. Accuracy is the share of matching labels; F1 is
# reported as well because the test labels are overwhelmingly class 0.
pred = model.predict(data_test)
acc = np.mean(pred == labels_test)
print('accuracy = {}'.format(acc))

f1_score = sklearn.metrics.f1_score(labels_test, pred)
print(f1_score)

accuracy = 0.8023255813953488
0.05555555555555555


In [180]:
# Show the true labels of the held-out rows for comparison with the predictions.
labels_test

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [181]:
# Show the model's predicted labels for the held-out rows.
pred

array([0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])