In [None]:
!pip install category_encoders
!pip install xgboost

In [11]:
# Path of the CSV file to evaluate; a bare file name is resolved relative to
# the current folder, otherwise give the full or relative path.
file_name = "data.csv"

In [None]:
import numpy as np
import pandas as pd
import sklearn
import pickle
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.impute import SimpleImputer

# Helper for reading back pickled artifacts
def get(fileName):
    """Return the object stored in the pickle file at *fileName*.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only pass files from a
    trusted source.
    """
    with open(fileName, mode='rb') as stream:
        loaded = pickle.load(stream)
    return loaded
  
# Read the evaluation set from the CSV file named by `file_name`
data = pd.read_csv(file_name, encoding="utf-8")

# Features are every column except "cancer_type" and "patient_id";
# the "cancer_type" column is the label to predict.
x_test = data.drop(["cancer_type", "patient_id"], axis=1)
y_test = data["cancer_type"]

# Helpers that list a frame's column names by dtype family (categorical / numerical)
def get_categorical_columns(df):
    """Return the names of the 'object'- and 'category'-typed columns of *df* as a list."""
    categorical_part = df.select_dtypes(include=['object', 'category'])
    return [*categorical_part.columns]
def get_numerical_columns(df):
    """Return the names of the 'number'-typed columns of *df* as a list."""
    numeric_part = df.select_dtypes(include=['number'])
    return [*numeric_part.columns]

# Record which feature columns are categorical and which are numerical.
# The dtypes are inspected here, before the encoding step below rewrites
# the categorical columns.
categorical_columns = get_categorical_columns(x_test)
numerical_columns = get_numerical_columns(x_test)

# Encode the data
# Labels: replace each cancer_type value with an integer code.
# NOTE(review): this LabelEncoder is fitted on the test labels themselves;
# presumably the training labels were encoded the same way, but the codes
# only line up with the model's classes if both sets of labels yield the
# same mapping — confirm, or load the encoder used in training.
y_test = LabelEncoder().fit_transform(y_test)

# Features: target-encode the categorical columns.
# NOTE(review): the encoder is fitted on the test frame using the test
# labels, so label information flows into the features being scored.
# Presumably the encoder fitted during training should be applied instead
# — confirm whether it was saved alongside the model.
encoder = ce.TargetEncoder(cols=categorical_columns)
x_test = encoder.fit_transform(x_test, y_test)

# Handle null values by using imputation:
# mean for numerical columns, most frequent value for categorical columns.
# NOTE(review): both imputers are fitted on the test data, and they run
# after the target encoding above, so the 'most_frequent' imputer sees the
# encoded values rather than the raw categories — confirm this matches the
# training pipeline.
numerical_imputer = SimpleImputer(strategy='mean') 
categorical_imputer = SimpleImputer(strategy='most_frequent')  

# Skip a group entirely when the frame has no columns of that kind
if numerical_columns:
    x_test[numerical_columns] = numerical_imputer.fit_transform(x_test[numerical_columns])

if categorical_columns:
    x_test[categorical_columns] = categorical_imputer.fit_transform(x_test[categorical_columns])

# Retrieve the feature-selection mask that was generated by the main training model
selected_features_mask = get('selected_features_mask.pkl')

# Keep only the columns picked out by the mask (positional column selection);
# every other column is dropped from the test data.
# NOTE(review): presumably a boolean or integer-position mask aligned with
# the column order used in training — confirm against the training code.
x_test = x_test.iloc[:, selected_features_mask]

# Load the best model saved by training (XGBoost Model) and predict on the test features
best_model = get('best_model.pkl')
test_pred = best_model.predict(x_test)

# Accuracy is computed directly; precision, recall and F1 are macro-averaged
accuracy = accuracy_score(y_test, test_pred)
macro_average = {"average": "macro"}
precision = precision_score(y_test, test_pred, **macro_average)
recall = recall_score(y_test, test_pred, **macro_average)
f1 = f1_score(y_test, test_pred, **macro_average)

# Report each metric on its own line
for caption, score in (
    ("accuracy =", accuracy),
    ("precision =", precision),
    ("recall =", recall),
    ("f1-score =", f1),
):
    print(caption, score)