# Preprocessing

## Import Libraries

In [43]:
# !pip install -q -U pandas
# !pip install -q -U matplotlib
# !pip install -q -U numpy
# !pip install -q -U seaborn
# !pip install -q -U scikit-learn
# !pip install -q -U imbalanced-learn
# !pip install -q -U Pillow
# !pip install -q -U xgboost
# !pip install -q -U lightgbm
# !pip install -q -U keras
# !pip install -q -U tensorflow
# !pip install -q -U joblib

In [44]:
import os


import numpy as np
import pandas as pd
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# text models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier


# # for handling imbalanced data
# from collections import Counter
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import NearMiss
# from imblearn.pipeline import make_pipeline
# from sklearn.utils import class_weight

import joblib
import pickle

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

## Set Up

In [45]:
# %matplotlib inline
# plt.rcParams['figure.figsize'] = [6,4]
# cmap = mpl.colormaps['viridis']
# sns.set_theme(style='whitegrid', palette='muted', font_scale=1.5)

In [46]:
data_path = 'metadata.csv'

# Function

## Data Processing

**three skin cancers (BCC, MEL, and SCC) and three skin disease (ACK, NEV, and SEK)**

And we remove the "biopsed" feature, because:
- Avoiding bias towards biopsied cases
- Preventing data leakage from biopsy results
- Improving model generalization to cases without biopsy data
- Aligning the model with the intended use case of pre-biopsy diagnosis

In [47]:
class DataProcessing:
    def __init__(self, data: pd.DataFrame, path=None):
        self.path = path
        self.data = data.copy()
        self.data = self.data.drop(columns=['biopsed'])
        self.data['patient_id'] = self.data['patient_id'].str.replace('PAT_','',regex=False).astype('int64')
        self.data = self.data.replace(['UNK','NaN'], np.nan)
        self.missing_percentage = round(self.data.isna().sum()*100/self.data.shape[0], 1)
    
    def keepNanText_6(self):
        """
        Keep the nan text in the data, 
        if the percentage of missing value is less than 10, then impute the feature using IterativeImputer,
        else if the percentage of missing value is higher than 10, then replace the nan text with 'Unknown',
        use it as a new feature.
        """
        imputer = IterativeImputer()
        enc = LabelEncoder()
        for col, percentage in self.missing_percentage.items():
            if self.data[col].dtype == 'object':
                if percentage < 10:
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data[col] = self.data[col].fillna('Unknown')
                    self.data[col] = self.data[col].astype('str')
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = self.data[col].replace(len(enc.classes_), -1).astype('int64')

                    # output the encode method
                    encoding_mapping = dict(zip(enc.classes_, range(len(enc.classes_))))
                    print(f"Feature '{col}':")
                    for category, encoding in encoding_mapping.items():
                        if category == 'Unknown':
                            print(f"{category}: -1")
                        else:
                            print(f"{category}: {encoding}")
                    print('-'*30)
            else:
                if percentage < 10:
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data[col] = self.data[col].fillna(-1)
                    self.data[col] = self.data[col].astype('int64')
        
        self.data.drop(columns='img_id', inplace=True)

        return self.data
    
    def dropNanText_6(self):
        """
        Drop the rows with missing values in the data,
        if the percentage of missing value is less than 10, then impute the feature using IterativeImputer,
        else if the percentage of missing value is higher than 10, then drop the columns.
        """
        imputer = IterativeImputer()
        enc = LabelEncoder()
        for col, percentage in self.missing_percentage.items():
            if self.data[col].dtype == 'object':
                if percentage < 10:
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data.drop(columns=[col], inplace=True)
            else:
                if percentage < 10:
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data.drop(columns=[col], inplace=True)

        self.data.drop(columns='img_id', inplace=True)

        return self.data
    
    def keepNanText_2(self):
        """
        Keep the nan text in the data, and change the y label to binary.
        """
        imputer = IterativeImputer()
        enc = LabelEncoder()
        for col, percentage in self.missing_percentage.items():
            if col == 'diagnostic':
                self.data['is_cancer'] = np.where(self.data['diagnostic'].isin(['BCC','MEL','SCC']),1,0)
                self.data.drop(columns=['diagnostic'], inplace=True)
            elif self.data[col].dtype == 'object':
                if percentage < 10:
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data[col] = self.data[col].fillna('Unknown')
                    self.data[col] = self.data[col].astype('str')
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = self.data[col].replace(len(enc.classes_), -1).astype('int64')

                    # output the encode method
                    encoding_mapping = dict(zip(enc.classes_, range(len(enc.classes_))))
                    print(f"Feature '{col}':")
                    for category, encoding in encoding_mapping.items():
                        if category == 'Unknown':
                            print(f"{category}: -1")
                        else:
                            print(f"{category}: {encoding}")
                    print('-'*30)
            else:
                if percentage < 10:
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data[col] = self.data[col].fillna(-1)
                    self.data[col] = self.data[col].astype('int64')

        self.data.drop(columns='img_id', inplace=True)

        return self.data
    
    def dropNanText_2(self):
        """
        Drop the nan columns in the data, and change the y label to binary.
        """
        imputer = IterativeImputer()
        enc = LabelEncoder()
        for col, percentage in self.missing_percentage.items():
            if col == 'diagnostic':
                self.data['is_cancer'] = np.where(self.data['diagnostic'].isin(['BCC','MEL','SCC']),1,0)
                self.data.drop(columns=['diagnostic'], inplace=True)
            elif self.data[col].dtype == 'object':
                if percentage < 10:
                    self.data[col] = enc.fit_transform(self.data[col])
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data.drop(columns=[col], inplace=True)
            else:
                if percentage < 10:
                    self.data[col] = imputer.fit_transform(self.data[[col]])
                else:
                    self.data.drop(columns=[col], inplace=True)

        self.data.drop(columns='img_id', inplace=True)

        return self.data
    
    def imageIndex2(self):
        """
        return a index of two labels of images.
        """
        for i in self.data.columns:
            if i == 'img_id':
                pass
            elif i == 'diagnostic':
                self.data['is_cancer'] = np.where(self.data['diagnostic'].isin(['BCC','MEL','SCC']),1,0)
                self.data.drop(columns='diagnostic', inplace=True)
            else:
                self.data.drop(columns=[i], inplace=True)
        
        self.data = self.data.reset_index()
        
        return self.data
    
    def imageIndex6(self):
        """
        return a index of six labels of images.
        """
        enc = LabelEncoder()
        for i in self.data.columns:
            if i == 'img_id':
                pass
            elif i == 'diagnostic':
                self.data['diagnostic'] = enc.fit_transform(self.data['diagnostic'])
                # output the encode method
                encoding_mapping = dict(zip(enc.classes_, range(len(enc.classes_))))
                print(f"Feature 'diagnostic':")
                for category, encoding in encoding_mapping.items():
                    if category == 'Unknown':
                        print(f"{category}: -1")
                    else:
                        print(f"{category}: {encoding}")
            else:
                self.data.drop(columns=[i], inplace=True)
        
        self.data = self.data.reset_index()
        
        return self.data


## Imbalanced Data

In [48]:
def balance_data(X, y):
    # calculate the number of samples in each class
    class_counts = Counter(y)
    min_class_count = min(class_counts.values())

    # set the sampling strategy for both over and under sampling
    over_sample_strategy = {label: 2 * min_class_count for label in class_counts.keys() if class_counts[label] <= (2 * min_class_count)}
    under_sample_strategy = {label: 2 * min_class_count for label in class_counts.keys() if class_counts[label] > (2 * min_class_count)}

    # create a pipeline for resampling
    pipe = make_pipeline(
        SMOTE(sampling_strategy=over_sample_strategy),
        NearMiss(sampling_strategy=under_sample_strategy)
    )

    # resample the data
    X_resampled, y_resampled = pipe.fit_resample(X, y)

    # calculate the class weights
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=np.unique(y),
                                                      y=y)
    class_weights = dict(enumerate(class_weights))

    return X_resampled, y_resampled, class_weights

## Text Model

In [49]:
def textModel(x_train, y_train, x_test, y_test, class_weights=None):
    svm = SVC(kernel='linear', random_state=42)
    svm.fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    print(f'Model 1: {classification_report(y_test, y_pred)}')
    
    rf = RandomForestClassifier(n_estimators=100, class_weight=class_weights, random_state=42)
    rf.fit(x_train, y_train)
    y_pred = rf.predict(x_test)
    print(f'Model 2: {classification_report(y_test, y_pred)}')

    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    xgb.fit(x_train, y_train)
    y_pred = xgb.predict(x_test)
    print(f'Model 3: {classification_report(y_test, y_pred)}')

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    print(f'Model 4: {classification_report(y_test, y_pred)}')

    num_class = len(np.unique(y_train))
    if num_class == 2:
        num_class = 1

    lgbm = LGBMClassifier(num_class=num_class, class_weight=class_weights, random_state=42, verbose=-1)
    lgbm.fit(x_train, y_train)
    y_pred = lgbm.predict(x_test)
    print(f'Model 5: {classification_report(y_test, y_pred)}')

    return svm, rf, xgb, knn, lgbm

## Load Dataset

Explain of the features:
- background_father: The history of any diseases or health conditions related to the patient's father, including any history of skin cancer or other diseases that may be related to skin cancer
- background_mother: The history of any diseases or health conditions related to the patient's mother, including any history of skin cancer or other diseases that may be related to skin cancer
- has_piped_water: Indicates whether the location or area of the patient's residence has access to piped water or not
- has_sewage_system: Indicates whether the location or area of the patient's residence has a proper sewage system or not
- fitspatrick: Skin tolerance to sunlight
- itch: Whether the lesion or wound has itched or not
- elevation: Description of the of the lesion or wound relative to the skin surface of the patient
- biopsed: Whether the lesion or wound has been biopsied or not

In [50]:
metadata = pd.read_csv('metadata.csv')

## Data Information

In [51]:
metadata.shape

(2298, 26)

In [52]:
metadata.head()

Unnamed: 0,patient_id,lesion_id,smoke,drink,background_father,background_mother,age,pesticide,gender,skin_cancer_history,...,diameter_2,diagnostic,itch,grew,hurt,changed,bleed,elevation,img_id,biopsed
0,PAT_1516,1765,,,,,8,,,,...,,NEV,False,False,False,False,False,False,PAT_1516_1765_530.png,False
1,PAT_46,881,False,False,POMERANIA,POMERANIA,55,False,FEMALE,True,...,5.0,BCC,True,True,False,True,True,True,PAT_46_881_939.png,True
2,PAT_1545,1867,,,,,77,,,,...,,ACK,True,False,False,False,False,False,PAT_1545_1867_547.png,False
3,PAT_1989,4061,,,,,75,,,,...,,ACK,True,False,False,False,False,False,PAT_1989_4061_934.png,False
4,PAT_684,1302,False,True,POMERANIA,POMERANIA,79,False,MALE,True,...,5.0,BCC,True,True,False,False,True,True,PAT_684_1302_588.png,True


In [53]:
def count_is_null(data:pd.DataFrame):
    countNaN = data.isna().sum()
    return f'{countNaN}({countNaN*100/data.shape[0]:.1f}%)'
def count_is_null_unique(data:pd.DataFrame):
    return data.count()-data.nunique()
def data_info(data:pd.DataFrame):
    return data.agg(['count', 'nunique', count_is_null_unique, count_is_null, 'dtype']).T

In [54]:
data_info(metadata)

Unnamed: 0,count,nunique,count_is_null_unique,count_is_null,dtype
patient_id,2298,1373,925,0(0.0%),object
lesion_id,2298,1641,657,0(0.0%),int64
smoke,1494,2,1492,804(35.0%),object
drink,1494,2,1492,804(35.0%),object
background_father,1480,13,1467,818(35.6%),object
background_mother,1476,11,1465,822(35.8%),object
age,2298,84,2214,0(0.0%),int64
pesticide,1494,2,1492,804(35.0%),object
gender,1494,2,1492,804(35.0%),object
skin_cancer_history,1494,2,1492,804(35.0%),object


## Data Processing

In [55]:
keep_df6 = DataProcessing(metadata).keepNanText_6()

Feature 'smoke':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'drink':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'background_father':
AUSTRIA: 0
BRASIL: 1
BRAZIL: 2
CZECH: 3
GERMANY: 4
ISRAEL: 5
ITALY: 6
NETHERLANDS: 7
POLAND: 8
POMERANIA: 9
PORTUGAL: 10
SPAIN: 11
Unknown: -1
------------------------------
Feature 'background_mother':
BRAZIL: 0
FRANCE: 1
GERMANY: 2
ITALY: 3
NETHERLANDS: 4
NORWAY: 5
POLAND: 6
POMERANIA: 7
PORTUGAL: 8
SPAIN: 9
Unknown: -1
------------------------------
Feature 'pesticide':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'gender':
FEMALE: 0
MALE: 1
Unknown: -1
------------------------------
Feature 'skin_cancer_history':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'cancer_history':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'has_piped_water':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'has_sewage_system':
False: 0


In [56]:
data_info(keep_df6)

Unnamed: 0,count,nunique,count_is_null_unique,count_is_null,dtype
patient_id,2298,1373,925,0(0.0%),float64
lesion_id,2298,1641,657,0(0.0%),float64
smoke,2298,3,2295,0(0.0%),int64
drink,2298,3,2295,0(0.0%),int64
background_father,2298,13,2285,0(0.0%),int64
background_mother,2298,11,2287,0(0.0%),int64
age,2298,84,2214,0(0.0%),float64
pesticide,2298,3,2295,0(0.0%),int64
gender,2298,3,2295,0(0.0%),int64
skin_cancer_history,2298,3,2295,0(0.0%),int64


In [57]:
keep_df6.drop(columns=['patient_id', 'lesion_id'], inplace=True)
keep_df6 = keep_df6.astype('int64')

In [58]:
keep_df6.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
smoke,2298.0,0.787641,0.931332,0.0,0.0,0.0,2.0,2.0
drink,2298.0,0.859878,0.905861,0.0,0.0,1.0,2.0,2.0
background_father,2298.0,8.544386,3.396583,0.0,6.0,9.0,12.0,12.0
background_mother,2298.0,6.445605,3.463831,0.0,3.0,7.0,10.0,10.0
age,2298.0,60.464752,15.894866,6.0,52.0,62.0,72.0,94.0
pesticide,2298.0,0.95953,0.859589,0.0,0.0,1.0,2.0,2.0
gender,2298.0,1.022193,0.823011,0.0,0.0,1.0,2.0,2.0
skin_cancer_history,2298.0,0.996084,0.839015,0.0,0.0,1.0,2.0,2.0
cancer_history,2298.0,1.038729,0.812279,0.0,0.0,1.0,2.0,2.0
has_piped_water,2298.0,1.101393,0.767021,0.0,1.0,1.0,2.0,2.0


## Standardize and Data Split

In [59]:
x = keep_df6.drop(columns=['diagnostic'])
columns = x.columns
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
x = pd.DataFrame(x, columns=columns)
y = keep_df6['diagnostic']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Method Three

## Data Processing

In [60]:
keep_df2 = DataProcessing(metadata).keepNanText_2()

Feature 'smoke':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'drink':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'background_father':
AUSTRIA: 0
BRASIL: 1
BRAZIL: 2
CZECH: 3
GERMANY: 4
ISRAEL: 5
ITALY: 6
NETHERLANDS: 7
POLAND: 8
POMERANIA: 9
PORTUGAL: 10
SPAIN: 11
Unknown: -1
------------------------------
Feature 'background_mother':
BRAZIL: 0
FRANCE: 1
GERMANY: 2
ITALY: 3
NETHERLANDS: 4
NORWAY: 5
POLAND: 6
POMERANIA: 7
PORTUGAL: 8
SPAIN: 9
Unknown: -1
------------------------------
Feature 'pesticide':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'gender':
FEMALE: 0
MALE: 1
Unknown: -1
------------------------------
Feature 'skin_cancer_history':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'cancer_history':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'has_piped_water':
False: 0
True: 1
Unknown: -1
------------------------------
Feature 'has_sewage_system':
False: 0


In [61]:
keep_df2.drop(columns=['patient_id', 'lesion_id'], inplace=True)
keep_df2 = keep_df2.astype('int64')

In [62]:
keep_df2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
smoke,2298.0,0.787641,0.931332,0.0,0.0,0.0,2.0,2.0
drink,2298.0,0.859878,0.905861,0.0,0.0,1.0,2.0,2.0
background_father,2298.0,8.544386,3.396583,0.0,6.0,9.0,12.0,12.0
background_mother,2298.0,6.445605,3.463831,0.0,3.0,7.0,10.0,10.0
age,2298.0,60.464752,15.894866,6.0,52.0,62.0,72.0,94.0
pesticide,2298.0,0.95953,0.859589,0.0,0.0,1.0,2.0,2.0
gender,2298.0,1.022193,0.823011,0.0,0.0,1.0,2.0,2.0
skin_cancer_history,2298.0,0.996084,0.839015,0.0,0.0,1.0,2.0,2.0
cancer_history,2298.0,1.038729,0.812279,0.0,0.0,1.0,2.0,2.0
has_piped_water,2298.0,1.101393,0.767021,0.0,1.0,1.0,2.0,2.0


## Standardize and Data Split

In [63]:
x = keep_df2.drop(columns=['is_cancer'])


# keep only columns as the following:
'''
            'smoke', 'drink', 'background_father', 'background_mother',
            'pesticide', 'gender', 'skin_cancer_history', 'cancer_history',
            'has_piped_water', 'has_sewage_system', 'grew', 'changed'
'''

x = keep_df2[['smoke', 'drink', 'background_father', 'background_mother','pesticide', 'gender', 'skin_cancer_history', 'cancer_history','has_piped_water', 'has_sewage_system', 'grew', 'changed']]

columns = x.columns
# scaler = MinMaxScaler()
# x = scaler.fit_transform(x)
x = pd.DataFrame(x, columns=columns)
y = keep_df2['is_cancer']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

## Model

In [64]:
svm, rf, xgb, knn, lgbm = textModel(x_train, y_train, x_test, y_test)

Model 1:               precision    recall  f1-score   support

           0       0.90      0.76      0.82       226
           1       0.80      0.91      0.85       234

    accuracy                           0.84       460
   macro avg       0.85      0.84      0.84       460
weighted avg       0.84      0.84      0.84       460

Model 2:               precision    recall  f1-score   support

           0       0.88      0.80      0.84       226
           1       0.82      0.89      0.85       234

    accuracy                           0.85       460
   macro avg       0.85      0.84      0.85       460
weighted avg       0.85      0.85      0.85       460

Model 3:               precision    recall  f1-score   support

           0       0.89      0.79      0.84       226
           1       0.82      0.91      0.86       234

    accuracy                           0.85       460
   macro avg       0.85      0.85      0.85       460
weighted avg       0.85      0.85      0.85    

## Stacking

In [65]:
# stack model
estimators3 = [('svc', svm), ('rf', rf), ('xgb', xgb), ('knn', knn), ('lgbm', lgbm)]
stack_model3 = StackingClassifier(estimators=estimators3, final_estimator=LGBMClassifier(verbose=-1))

In [66]:
# fit the model on the training data
stack_model3.fit(x_train, y_train)
# make predictions
y_pred = stack_model3.predict(x_test)
# calculate the classification report
stack_model3_result = classification_report(y_test, y_pred)
print(f'Stack Model: {stack_model3_result}')

Stack Model:               precision    recall  f1-score   support

           0       0.83      0.78      0.80       226
           1       0.80      0.84      0.82       234

    accuracy                           0.81       460
   macro avg       0.81      0.81      0.81       460
weighted avg       0.81      0.81      0.81       460



In [67]:
# save the model to pickle
import pickle

with open('text_model.pkl', 'wb') as f:
    pickle.dump(stack_model3, f)

stack_model3_path = 'text_model.pkl'

In [68]:
# extract training data where it is a cancer
x_train_cancer = x_train[y_train == 1]
x_train_cancer

Unnamed: 0,smoke,drink,background_father,background_mother,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,has_sewage_system,grew,changed
944,0,0,9,7,1,0,0,0,1,1,2,2
1958,0,1,4,2,0,0,0,1,1,1,0,0
210,0,0,4,2,0,0,0,1,1,1,1,0
869,1,0,6,8,1,0,0,0,1,1,1,0
299,0,0,7,2,1,1,0,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2135,0,0,6,3,1,1,0,1,1,1,0,0
330,0,0,4,2,0,0,1,1,0,0,1,0
2169,0,0,9,7,0,1,0,1,0,0,1,1
1294,0,0,9,7,0,1,0,1,1,1,2,2


In [69]:
import os, pickle
import numpy as np
from keras.models import load_model
import tensorflow as tf


# get the image model and text model path
image_model_path = 'VGG16Model2.h5'
text_model_path = 'text_model.pkl'

# # Initialize models
# img_model = load_model(image_model_path)
text_model = pickle.load(open(text_model_path, 'rb'))

# image processing for model
def img_preprocessing(img):
    """ Image preprocessing function """
    img = tf.io.read_file(img)  # Read the image file
    img = tf.image.decode_png(img, channels=3)  # Decode the PNG image
    img = tf.image.resize(img, (256, 256))  # Resize the image, image size is (256, 256)
    img = tf.cast(img, tf.float32) / 255.0  # Normalize pixel values to [0, 1] range
    return img

# Prediction function for the image
def predict_image(model, img):
    """ Predict the image """
    img = tf.expand_dims(img, axis=0)  # Expand the image dimensions
    prediction = model.predict(img)  # Predict the image
    return prediction

# create a new data frame having the following columns, data range from 0 to 1
'''
            'smoke', 'drink', 'background_father', 'background_mother',
            'pesticide', 'gender', 'skin_cancer_history', 'cancer_history',
            'has_piped_water', 'has_sewage_system', 'grew', 'changed'
'''
trial_list = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

# # define an image path
# image_file = 'test1.png'
# image = img_preprocessing(image_file)

# predict the image
# # image_prediction = predict_image(img_model, image)

# predict the text
text_prediction = text_model.predict(trial_list)

# combine the prediction
combined_prediction = np.concatenate((text_prediction), axis=None)
str(combined_prediction)




'[1]'

In [70]:
# check the training data has how many columns
x_test.columns

Index(['smoke', 'drink', 'background_father', 'background_mother', 'pesticide',
       'gender', 'skin_cancer_history', 'cancer_history', 'has_piped_water',
       'has_sewage_system', 'grew', 'changed'],
      dtype='object')

In [71]:
# check the range of the data
x_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
smoke,460.0,0.730435,0.905772,0.0,0.0,0.0,2.0,2.0
drink,460.0,0.797826,0.885858,0.0,0.0,0.0,2.0,2.0
background_father,460.0,8.334783,3.331517,1.0,4.0,9.0,12.0,12.0
background_mother,460.0,6.13913,3.40226,0.0,2.0,7.0,10.0,10.0
pesticide,460.0,0.9,0.844565,0.0,0.0,1.0,2.0,2.0
gender,460.0,0.956522,0.815336,0.0,0.0,1.0,2.0,2.0
skin_cancer_history,460.0,0.93913,0.824853,0.0,0.0,1.0,2.0,2.0
cancer_history,460.0,0.969565,0.807878,0.0,0.0,1.0,2.0,2.0
has_piped_water,460.0,1.052174,0.75371,0.0,0.0,1.0,2.0,2.0
has_sewage_system,460.0,1.015217,0.779494,0.0,0.0,1.0,2.0,2.0


In [72]:
# export the x_test data range to a csv file, binning the data to 3 bins
x_test.describe().T.to_csv('data_range.csv')