In [5]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import scipy as sp
import statsmodels as sm
import pyspark as ps
import os

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

#Spark session
spark = SparkSession.builder \
    .appName("RandomForest") \
    .getOrCreate()

In [7]:
#data path
dataset_path = "C:\Users\Windl.DESKTOP-BF4IBNR\Downloads\oralcancer"

# Read the dataset
df = spark.read.csv(dataset_path, header=True, inferSchema=True)
print(df.head())

Row(ID=1, Country='Italy', Age=36, Gender='Female', Tobacco Use=1, Alcohol Consumption=1, HPV Infection=1, Betel Quid Use=0, Chronic Sun Exposure=0, Poor Oral Hygiene=1, Diet (Fruits & Vegetables Intake)='Low', Family History of Cancer=0, Compromised Immune System=0, Oral Lesions=0, Unexplained Bleeding=0, Difficulty Swallowing=0, White or Red Patches in Mouth=0, Tumor Size (cm)=0.0, Cancer Stage=0, Treatment Type='No Treatment', Survival Rate (5-Year, %)=100.0, Cost of Treatment (USD)=0.0, Economic Burden (Lost Workdays per Year)=0, Early Diag0sis=0, Oral Cancer (Diag0sis)=0)


In [19]:
#sklearn train/test split, randomforest with accuracy score, AUC, recall and precision
from sklearn.model_selection import train_test_split
from pyspark.ml.classification import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score
from pyspark.ml.feature import VectorAssembler

In [13]:
#preprocess
unique_values = df.select('Oral Cancer (Diag0sis)').distinct().collect()
print(unique_values)

[Row(Oral Cancer (Diag0sis)=1), Row(Oral Cancer (Diag0sis)=0)]


In [9]:
#split train and test data
train_data, test_data = df.randomSplit([0.8, 0.2], seed=101)


In [10]:
# cancer present /absent
cancer_absent = df.filter(df['Oral Cancer (Diag0sis)'] == 0).count()
cancer_present = df.filter(df['Oral Cancer (Diag0sis)'] == 1).count()

In [32]:
# Identify feature columns (excluding ID, Country, Gender, Diet, and the label column itself)
feature_columns = [
    col for col in train_data.columns
    if col not in [
        'ID', 'Country', 'Gender', 'Diet (Fruits & Vegetables Intake)', 'Treatment Type',
        'Oral Cancer (Diag0sis)', 'Cancer Stage', 'Tumor Size (cm)'
    ]
]

# Assemble feature columns into a single vector column named 'features'
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Transform the training and testing data
train_data_transformed = assembler.transform(train_data)
test_data_transformed = assembler.transform(test_data)

#run randomforest model
model = RandomForestClassifier(labelCol='Oral Cancer (Diag0sis)', featuresCol='features', numTrees=250)
model = model.fit(train_data_transformed)

In [51]:
#calculate accuracy, AUC, recall and precision on test data
predictions = model.transform(test_data_transformed)
y_true = [row['Oral Cancer (Diag0sis)'] for row in test_data_transformed.select('Oral Cancer (Diag0sis)').collect()]
y_pred = [row['prediction'] for row in predictions.select('prediction').collect()]
y_prob = [row['probability'][1] for row in predictions.select('probability').collect()]

accuracy = accuracy_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_prob)
recall = recall_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)

print("Accuracy:", accuracy)
print("AUC:", auc)
print("Recall:", recall)
print("Precision:", precision)

Accuracy: 1.0
AUC: 1.0
Recall: 1.0
Precision: 1.0


In [50]:
print("Correlation of 'Early Diag0sis' with 'Oral Cancer (Diag0sis)':")
display(df.stat.corr('Early Diag0sis', 'Oral Cancer (Diag0sis)'))

print("Correlation of 'Cancer Stage' with 'Oral Cancer (Diag0sis)':")
display(df.stat.corr('Cancer Stage', 'Oral Cancer (Diag0sis)'))

print("Correlation of 'Tumor Size (cm)' with 'Oral Cancer (Diag0sis)':")
display(df.stat.corr('Tumor Size (cm)', 'Oral Cancer (Diag0sis)'))

print("Correlation of 'Age' with 'Oral Cancer (Diag0sis)':")
display(df.stat.corr('Age', 'Oral Cancer (Diag0sis)'))

print("Correlation of 'TreatmentType_indexed' with 'Oral Cancer (Diag0sis)':")
display(df.stat.corr('TreatmentType_indexed', 'Oral Cancer (Diag0sis)'))

print("Correlation of 'Diet_indexed' with 'Oral Cancer (Diag0sis)':")
display(df.stat.corr('Diet_indexed', 'Oral Cancer (Diag0sis)'))

Correlation of 'Early Diag0sis' with 'Oral Cancer (Diag0sis)':


0.0007257300262715145

Correlation of 'Cancer Stage' with 'Oral Cancer (Diag0sis)':


0.8368414278511452

Correlation of 'Tumor Size (cm)' with 'Oral Cancer (Diag0sis)':


0.8637807170145679

Correlation of 'Age' with 'Oral Cancer (Diag0sis)':


0.0023082653836845883

Correlation of 'TreatmentType_indexed' with 'Oral Cancer (Diag0sis)':


0.7076404299785115

Correlation of 'Diet_indexed' with 'Oral Cancer (Diag0sis)':


-0.0018898896271861327

In [36]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Tobacco Use: integer (nullable = true)
 |-- Alcohol Consumption: integer (nullable = true)
 |-- HPV Infection: integer (nullable = true)
 |-- Betel Quid Use: integer (nullable = true)
 |-- Chronic Sun Exposure: integer (nullable = true)
 |-- Poor Oral Hygiene: integer (nullable = true)
 |-- Diet (Fruits & Vegetables Intake): string (nullable = true)
 |-- Family History of Cancer: integer (nullable = true)
 |-- Compromised Immune System: integer (nullable = true)
 |-- Oral Lesions: integer (nullable = true)
 |-- Unexplained Bleeding: integer (nullable = true)
 |-- Difficulty Swallowing: integer (nullable = true)
 |-- White or Red Patches in Mouth: integer (nullable = true)
 |-- Tumor Size (cm): double (nullable = true)
 |-- Cancer Stage: integer (nullable = true)
 |-- Treatment Type: string (nullable = true)
 |-- Survival Rate (5-Y

In [40]:
from pyspark.ml.feature import StringIndexer

# Re-load the original DataFrame to ensure a clean state
# This prevents the 'Output column already exists' error if the cell is run multiple times
# and ensures the StringIndexer is applied to the untransformed data.
df = spark.read.csv(dataset_path, header=True, inferSchema=True)

# Create a StringIndexer for 'Treatment Type'
indexer_treatment = StringIndexer(inputCol="Treatment Type", outputCol="TreatmentType_indexed")

# Fit the indexer to the DataFrame and transform it
df = indexer_treatment.fit(df).transform(df)

# Create a StringIndexer for 'Diet (Fruits & Vegetables Intake)'
indexer_diet = StringIndexer(inputCol="Diet (Fruits & Vegetables Intake)", outputCol="Diet_indexed")

# Fit the indexer to the DataFrame and transform it
df = indexer_diet.fit(df).transform(df)

# Display the updated schema to confirm the new columns
df.printSchema()

# Show a sample of the updated DataFrame with the new indexed columns
df.select("Treatment Type", "TreatmentType_indexed", "Diet (Fruits & Vegetables Intake)", "Diet_indexed").show(5)

root
 |-- ID: integer (nullable = true)
 |-- Country: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Tobacco Use: integer (nullable = true)
 |-- Alcohol Consumption: integer (nullable = true)
 |-- HPV Infection: integer (nullable = true)
 |-- Betel Quid Use: integer (nullable = true)
 |-- Chronic Sun Exposure: integer (nullable = true)
 |-- Poor Oral Hygiene: integer (nullable = true)
 |-- Diet (Fruits & Vegetables Intake): string (nullable = true)
 |-- Family History of Cancer: integer (nullable = true)
 |-- Compromised Immune System: integer (nullable = true)
 |-- Oral Lesions: integer (nullable = true)
 |-- Unexplained Bleeding: integer (nullable = true)
 |-- Difficulty Swallowing: integer (nullable = true)
 |-- White or Red Patches in Mouth: integer (nullable = true)
 |-- Tumor Size (cm): double (nullable = true)
 |-- Cancer Stage: integer (nullable = true)
 |-- Treatment Type: string (nullable = true)
 |-- Survival Rate (5-Y

In [43]:
from pyspark.sql.functions import when, col

# Update the 'Diet_indexed' column to map 'High' to 3.0
# Keeping the other mappings consistent with the StringIndexer output for 'Moderate' and 'Low'
df = df.withColumn(
    "Diet_indexed",
    when(col("Diet (Fruits & Vegetables Intake)") == "High", 3.0)
    .when(col("Diet (Fruits & Vegetables Intake)") == "Moderate", 2.0)
    .when(col("Diet (Fruits & Vegetables Intake)") == "Low", 1.0)
    .otherwise(col("Diet_indexed")) # This covers any other cases, if they exist, keeping their original indexed value
)

# Display the updated mappings for verification
df.select("Diet (Fruits & Vegetables Intake)", "Diet_indexed").distinct().sort("Diet_indexed").show()

+---------------------------------+------------+
|Diet (Fruits & Vegetables Intake)|Diet_indexed|
+---------------------------------+------------+
|                              Low|         1.0|
|                         Moderate|         2.0|
|                             High|         3.0|
+---------------------------------+------------+



In [45]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

# 1. Re-split the df DataFrame into new training and testing datasets
train_data_new, test_data_new = df.randomSplit([0.8, 0.2], seed=101)

# Start with the original numerical columns that are not to be excluded
clean_numerical_cols = [
    col for col in df.columns
    if df.schema[col].dataType.simpleString() in ['int', 'double']
    and col not in ['ID', 'Oral Cancer (Diag0sis)', 'Cancer Stage', 'Tumor Size (cm)']
]

# 2. Define updated_feature_columns - clean_numerical_cols already includes the indexed columns.
updated_feature_columns = clean_numerical_cols

# 3. Create a new VectorAssembler instance
assembler_new = VectorAssembler(inputCols=updated_feature_columns, outputCol='features')

# 4. Transform train_data_new and test_data_new
train_data_transformed_new = assembler_new.transform(train_data_new)
test_data_transformed_new = assembler_new.transform(test_data_new)

# 5. Instantiate a new RandomForestClassifier model
new_model = RandomForestClassifier(labelCol='Oral Cancer (Diag0sis)', featuresCol='features', numTrees=250)

# 6. Fit the new RandomForestClassifier model to train_data_transformed_new
new_model = new_model.fit(train_data_transformed_new)

print("Model re-trained with updated feature columns.")

Model re-trained with updated feature columns.
