In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings

warnings.filterwarnings('ignore')

from ucimlrepo import fetch_ucirepo

# Download the "Bone marrow transplant: children" dataset (UCI repository id 565).
bone_marrow_transplant_children = fetch_ucirepo(id=565)

# Bind the feature table and the target table; both are delivered as pandas DataFrames.
x, y = (
    bone_marrow_transplant_children.data.features,
    bone_marrow_transplant_children.data.targets,
)

# Dataset-level metadata first, then the per-variable description.
print(bone_marrow_transplant_children.metadata)
print(bone_marrow_transplant_children.variables)

# Bare expression: it is not the last statement of the cell, so nothing is displayed for it.
bone_marrow_transplant_children.data.features
# Build one DataFrame that holds every feature column followed by the target column(s).
#
# The dataset object fetched earlier in this cell is reused instead of downloading the
# same dataset a second time. `data.features` and `data.targets` are already DataFrames
# with their own column labels, so no explicit `columns=` argument is passed.
x_df = pd.DataFrame(bone_marrow_transplant_children.data.features)
y_df = pd.DataFrame(bone_marrow_transplant_children.data.targets)

# Concatenate horizontally (axis=1); rows are aligned on the shared index.
bone_marrow_transplant_df = pd.concat([x_df, y_df], axis=1)

# Now, bone_marrow_transplant_df contains the entire dataset as a DataFrame
print(bone_marrow_transplant_df)

# Print the list of column names
column_names = bone_marrow_transplant_df.columns.tolist()
print(column_names)

# Bare expression: not the last statement of the cell, so nothing is displayed for it.
bone_marrow_transplant_df.head()
# Short alias for the combined frame; both names refer to the same object.
df = bone_marrow_transplant_df
df.isnull().sum()

# Columns imputed with their most frequent value.
categorical_columns = ['RecipientABO', 'RecipientRh', 'ABOmatch','CMVstatus','DonorCMV','RecipientCMV','Antigen','Allele','extcGvHD']
# Columns imputed with their median.
numerical_columns = ['CD3dCD34','CD3dkgx10d8','Rbodymass']

# NOTE(review): the mode/median below are computed over all rows, i.e. before the
# train/test split performed later in this cell, so held-out rows influence the
# imputed values. Consider fitting the imputation on the training rows only.

# Replace missing values with the mode for the specified categorical columns.
# The filled column is assigned back to the frame: calling
# `frame[column].fillna(..., inplace=True)` acts on a temporary selection and is not
# guaranteed to update the frame itself (it never does under pandas Copy-on-Write).
for column in categorical_columns:
    mode_value = bone_marrow_transplant_df[column].mode()[0]
    bone_marrow_transplant_df[column] = bone_marrow_transplant_df[column].fillna(mode_value)

# Replace missing values with the median for the specified numerical columns.
for column in numerical_columns:
    median_value = bone_marrow_transplant_df[column].median()
    bone_marrow_transplant_df[column] = bone_marrow_transplant_df[column].fillna(median_value)

# Re-bind the alias and recount the missing values after imputation.
df = bone_marrow_transplant_df
df.isnull().sum()
from sksurv.preprocessing import OneHotEncoder

# Outcome columns; all of them are removed from the feature matrix.
target_columns = ['IIIV', 'aGvHDIIIIV', 'extcGvHD', 'time_to_aGvHD_III_IV', 'survival_time', 'survival_status']

# Feature columns that are cast to pandas' `category` dtype before encoding.
categorical_feature_columns = [
    'ABOmatch', 'Gendermatch', 'HLAmatch', 'HLAmismatch',
    'Donorage35', 'Recipientage10', 'Recipientgender', 'Stemcellsource',
    'Riskgroup', 'DonorABO', 'RecipientABO', 'RecipientRh',
    'CMVstatus', 'Disease', 'DonorCMV', 'RecipientCMV',
    'Diseasegroup', 'Relapse', 'Antigen', 'Allele',
    'HLAgrI', 'Recipientageint',
]

# Feature matrix: every non-target column, with the listed columns cast in one
# `astype` call (a column -> dtype mapping) instead of one statement per column.
X = df.drop(target_columns, axis=1).astype(
    {name: 'category' for name in categorical_feature_columns}
)

# Encoder instance; it is fitted later, on the training split only.
encoder = OneHotEncoder()

# Label frame: the single 'IIIV' column as a categorical. `astype` returns a new
# frame, so nothing is assigned into a selection of `df`.
Y = df[['IIIV']].astype('category')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 20% of the rows for testing; the fixed random_state makes the partition repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
X_train.shape, X_test.shape

# Column labels of the training features before any encoding is applied.
cols = X_train.columns

# Min-max scaler; fitted below on the encoded training rows.
scaler = MinMaxScaler()

# Fit the encoder and then the scaler on the training split ...
X_train = scaler.fit_transform(encoder.fit_transform(X_train))
# ... and push the test split through the already-fitted transforms.
X_test = scaler.transform(encoder.transform(X_test))

# import metrics to compute accuracy
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC


def _fit_and_report(classifier, message_template, train_features, train_labels,
                    test_features, test_labels):
    """Fit a classifier, print its accuracy on the test data and return the predictions.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    message_template : str with one format field that receives the accuracy.
    train_features, train_labels : data passed to ``classifier.fit``.
    test_features : data passed to ``classifier.predict``.
    test_labels : reference labels compared against the predictions.

    Returns
    -------
    The predictions returned by ``classifier.predict(test_features)``.
    """
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features)
    print(message_template.format(accuracy_score(test_labels, predictions)))
    return predictions


# Y_train is a single-column DataFrame. Flatten it once so `fit` receives a 1-D label
# array instead of a column vector (which scikit-learn converts with a warning).
y_train_1d = Y_train.to_numpy().ravel()

# classifier with default hyperparameters
svc = SVC()
Y_pred = _fit_and_report(
    svc,
    'Model accuracy score with default hyperparameters: {0:0.4f}',
    X_train, y_train_1d, X_test, Y_test,
)

# larger C, kernel left at the SVC default (reported as rbf in the message)
svc = SVC(C=1000.0)
Y_pred = _fit_and_report(
    svc,
    'Model accuracy score with rbf kernel and C=1000.0 : {0:0.4f}',
    X_train, y_train_1d, X_test, Y_test,
)

# linear kernel; this is the model scored and pickled in the following cells
linear_svc = SVC(kernel='linear', C=1.0)
Y_pred_test = _fit_and_report(
    linear_svc,
    'Model accuracy score with linear kernel and C=1.0 : {0:0.4f}',
    X_train, y_train_1d, X_test, Y_test,
)


{'uci_id': 565, 'name': 'Bone marrow transplant: children', 'repository_url': 'https://archive.ics.uci.edu/dataset/565/bone+marrow+transplant+children', 'data_url': 'https://archive.ics.uci.edu/static/public/565/data.csv', 'abstract': 'The data set describes pediatric patients with several hematologic diseases, who were subject to the unmanipulated allogeneic unrelated donor hematopoietic stem cell transplantation.', 'area': 'Life Science', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 187, 'num_features': 36, 'feature_types': ['Integer', 'Real'], 'demographics': ['Gender', 'Age'], 'target_col': ['survival_status'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2020, 'last_updated': 'Sat Sep 09 2023', 'dataset_doi': '10.24432/C5NP6Z', 'creators': ['Marek Sikora', 'Łukasz Wróbel', 'Adam Gudyś'], 'intro_paper': {'title': 'GuideR: a guided separate-and-conquer rule learning in 

In [11]:
# Mean accuracy of the linear-kernel model on the rows it was fitted on.
train_set_score = linear_svc.score(X_train, Y_train)
print('Training set score: {:.4f}'.format(train_set_score))

# Mean accuracy of the same model on the held-out rows.
test_set_score = linear_svc.score(X_test, Y_test)
print('Test set score: {:.4f}'.format(test_set_score))

Training set score: 0.7651
Test set score: 0.5263


In [16]:
import pickle

# Persist the fitted preprocessing objects (encoder first, then scaler), one pickle
# file each, so the same transforms can be re-applied outside this notebook.
for artifact_path, artifact in (('encoder.pkl', encoder), ('scaler.pkl', scaler)):
    # The context manager closes each file as soon as its object has been written.
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)


In [17]:
import pickle 
# Save the fitted linear-kernel SVC. The `with` block guarantees the file is flushed
# and closed even if pickling fails; a bare `open(...)` passed to `pickle.dump`
# leaves the handle open until it is garbage-collected.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(linear_svc, model_file)