In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
df=pd.read_csv('/content/drive/MyDrive/COM6003/program/metadata.csv')

In [None]:
info = df.info()
head = df.head()
description = df.describe(include='all')

(info, head, description)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2298 entries, 0 to 2297
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   patient_id           2298 non-null   object 
 1   lesion_id            2298 non-null   int64  
 2   smoke                1494 non-null   object 
 3   drink                1494 non-null   object 
 4   background_father    1480 non-null   object 
 5   background_mother    1476 non-null   object 
 6   age                  2298 non-null   int64  
 7   pesticide            1494 non-null   object 
 8   gender               1494 non-null   object 
 9   skin_cancer_history  1494 non-null   object 
 10  cancer_history       1494 non-null   object 
 11  has_piped_water      1494 non-null   object 
 12  has_sewage_system    1494 non-null   object 
 13  fitspatrick          1494 non-null   float64
 14  region               2298 non-null   object 
 15  diameter_1           1494 non-null   f

(None,
   patient_id  lesion_id  smoke  drink background_father background_mother  \
 0   PAT_1516       1765    NaN    NaN               NaN               NaN   
 1     PAT_46        881  False  False         POMERANIA         POMERANIA   
 2   PAT_1545       1867    NaN    NaN               NaN               NaN   
 3   PAT_1989       4061    NaN    NaN               NaN               NaN   
 4    PAT_684       1302  False   True         POMERANIA         POMERANIA   
 
    age pesticide  gender skin_cancer_history  ... diameter_2 diagnostic  \
 0    8       NaN     NaN                 NaN  ...        NaN        NEV   
 1   55     False  FEMALE                True  ...        5.0        BCC   
 2   77       NaN     NaN                 NaN  ...        NaN        ACK   
 3   75       NaN     NaN                 NaN  ...        NaN        ACK   
 4   79     False    MALE                True  ...        5.0        BCC   
 
     itch   grew   hurt  changed  bleed elevation                

In [None]:
df['diagnostic'].value_counts()

BCC    845
ACK    730
NEV    244
SEK    235
SCC    192
MEL     52
Name: diagnostic, dtype: int64

 基底细胞癌 (BCC): 845个样本

 动脉粥样硬化斑块 (ACK): 730个样本

 痣 (NEV): 244个样本

 老年斑 (SEK): 235个样本

 鳞状细胞癌 (SCC): 192个样本

 黑色素瘤 (MEL): 52个样本



In [None]:
pip install pandas scikit-learn imbalanced-learn



In [None]:
# Handling Missing Values
# We will fill missing values for categorical data with the mode and for numerical data with the median.

# Categorical columns
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns

# Fill missing values with mode for categorical columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Fill missing values with median for numerical columns
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

# Check if there are any missing values left
missing_values_after = df.isnull().sum()

missing_values_after


patient_id             0
lesion_id              0
smoke                  0
drink                  0
background_father      0
background_mother      0
age                    0
pesticide              0
gender                 0
skin_cancer_history    0
cancer_history         0
has_piped_water        0
has_sewage_system      0
fitspatrick            0
region                 0
diameter_1             0
diameter_2             0
diagnostic             0
itch                   0
grew                   0
hurt                   0
changed                0
bleed                  0
elevation              0
img_id                 0
biopsed                0
dtype: int64

In [None]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder for categorical data
encoder = OneHotEncoder(sparse=False)
encoded_categorical_data = encoder.fit_transform(df[categorical_cols])

# Converting the array back to a dataframe
encoded_categorical_df = pd.DataFrame(encoded_categorical_data, columns=encoder.get_feature_names_out(categorical_cols))

# Merging the encoded categorical data with the numerical data
preprocessed_data = pd.concat([df[numerical_cols], encoded_categorical_df], axis=1)

# Checking the first few rows of the processed data
preprocessed_data.head()




Unnamed: 0,lesion_id,age,fitspatrick,diameter_1,diameter_2,patient_id_PAT_10,patient_id_PAT_100,patient_id_PAT_1000,patient_id_PAT_1006,patient_id_PAT_1008,...,img_id_PAT_997_14_40.png,img_id_PAT_998_17_641.png,img_id_PAT_999_20_223.png,img_id_PAT_999_20_401.png,img_id_PAT_999_20_540.png,img_id_PAT_999_20_583.png,img_id_PAT_99_153_636.png,img_id_PAT_9_17_80.png,biopsed_False,biopsed_True
0,1765,8,2.0,10.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,881,55,3.0,6.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1867,77,2.0,10.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4061,75,2.0,10.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1302,79,1.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardizing numerical data
scaler = StandardScaler()

# Extracting numerical data for scaling
numerical_data = preprocessed_data[numerical_cols]

# Applying standardization
scaled_numerical_data = scaler.fit_transform(numerical_data)

# Converting the array back to a dataframe
scaled_numerical_df = pd.DataFrame(scaled_numerical_data, columns=numerical_cols)

# Merging the scaled numerical data with the encoded categorical data
final_preprocessed_data = pd.concat([scaled_numerical_df, encoded_categorical_df], axis=1)

# Checking the first few rows of the final processed data
final_preprocessed_data.head()


Unnamed: 0,lesion_id,age,fitspatrick,diameter_1,diameter_2,patient_id_PAT_10,patient_id_PAT_100,patient_id_PAT_1000,patient_id_PAT_1006,patient_id_PAT_1008,...,img_id_PAT_997_14_40.png,img_id_PAT_998_17_641.png,img_id_PAT_999_20_223.png,img_id_PAT_999_20_401.png,img_id_PAT_999_20_540.png,img_id_PAT_999_20_583.png,img_id_PAT_99_153_636.png,img_id_PAT_9_17_80.png,biopsed_False,biopsed_True
0,0.196539,-3.301454,-0.287388,-0.175732,-0.118127,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.542575,-0.343881,1.37613,-0.745672,-0.757749,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.281822,1.040515,-0.287388,-0.175732,-0.118127,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2.116231,0.914661,-0.287388,-0.175732,-0.118127,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.190576,1.166369,-1.950906,-0.888157,-0.757749,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
final_preprocessed_data.shape

(2298, 3756)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Identifying the one-hot encoded columns for the 'diagnostic' variable
diagnostic_cols = [col for col in final_preprocessed_data.columns if col.startswith('diagnostic_')]
diagnostic_cols

# Separating features and the target variable for the corrected dataset
X = final_preprocessed_data.drop(diagnostic_cols, axis=1)
y = final_preprocessed_data[diagnostic_cols]

# Splitting the data into training and testing sets with stratification
# Stratification here needs to be done on the original 'diagnostic' column, which we need to reconstruct
# Since we used one-hot encoding, the original value can be reconstructed by getting the column with the highest value
y_original = y.idxmax(axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y_original, test_size=0.2, random_state=42, stratify=y_original)

# Training a RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

(accuracy, classification_rep)



(0.8195652173913044,
 '                precision    recall  f1-score   support\n\ndiagnostic_ACK       0.87      0.86      0.87       146\ndiagnostic_BCC       0.79      0.96      0.87       169\ndiagnostic_MEL       0.67      0.40      0.50        10\ndiagnostic_NEV       0.76      0.80      0.78        49\ndiagnostic_SCC       0.90      0.49      0.63        39\ndiagnostic_SEK       0.82      0.57      0.68        47\n\n      accuracy                           0.82       460\n     macro avg       0.80      0.68      0.72       460\n  weighted avg       0.82      0.82      0.81       460\n')

准确度：模型在测试集上的准确度约为 81.96%。

分类报告：不同类别的精确度、召回率和F1分数如下：

diagnostic_ACK：精确度 87%，召回率 86%，F1分数 87%。

diagnostic_BCC：精确度 79%，召回率 96%，F1分数 87%。

diagnostic_MEL：精确度 67%，召回率 40%，F1分数 50%。

diagnostic_NEV：精确度 76%，召回率 80%，F1分数 78%。

diagnostic_SCC：精确度 90%，召回率 49%，F1分数 63%。

diagnostic_SEK：精确度 82%，召回率 57%，F1分数 68%。

In [None]:
# Calculating the distribution of classes in the training and testing sets

# Distribution in the training set
train_distribution = y_train.value_counts()

# Distribution in the testing set
test_distribution = y_test.value_counts()

print(train_distribution)
print(test_distribution)


diagnostic_BCC    676
diagnostic_ACK    584
diagnostic_NEV    195
diagnostic_SEK    188
diagnostic_SCC    153
diagnostic_MEL     42
dtype: int64
diagnostic_BCC    169
diagnostic_ACK    146
diagnostic_NEV     49
diagnostic_SEK     47
diagnostic_SCC     39
diagnostic_MEL     10
dtype: int64


In [None]:
from sklearn.utils import resample
# And the target column (class labels) is named 'diagnostic'

# Creating X_original and y_original from the metadata
X_original = df.drop('diagnostic', axis=1)
y_original = df['diagnostic']

# Finding the minimum sample size among all categories in the original dataset
min_samples_in_categories = y_original.value_counts().min()

# Performing balanced sampling again using the smallest category size
X_balanced = pd.DataFrame()
y_balanced = pd.Series(dtype='object')

for category in y_original.unique():
    X_category = X_original[y_original == category]
    y_category = y_original[y_original == category]

    # Ensuring that we do not exceed the actual number of samples in the category
    n_samples_cat = min(min_samples_in_categories, len(y_category))

    X_sampled_cat, y_sampled_cat = resample(X_category, y_category,
                                            replace=False,
                                            n_samples=n_samples_cat,
                                            random_state=42)

    X_balanced = pd.concat([X_balanced, X_sampled_cat])
    y_balanced = pd.concat([y_balanced, y_sampled_cat])

# Splitting the new balanced data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced,
                                                                                    test_size=0.2,
                                                                                    random_state=42,
                                                                                    stratify=y_balanced)

# Checking the distribution of classes in the new balanced training and testing sets
train_bal_distribution = y_train_bal.value_counts()
test_bal_distribution = y_test_bal.value_counts()

(min_samples_in_categories, train_bal_distribution, test_bal_distribution)




(52,
 ACK    42
 SEK    42
 BCC    42
 MEL    41
 NEV    41
 SCC    41
 dtype: int64,
 NEV    11
 SCC    11
 MEL    11
 ACK    10
 BCC    10
 SEK    10
 dtype: int64)