In [69]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('../data/lung_cancer_prediction_dataset.csv')
df.head()

Unnamed: 0,ID,Country,Population_Size,Age,Gender,Smoker,Years_of_Smoking,Cigarettes_per_Day,Passive_Smoker,Family_History,...,Air_Pollution_Exposure,Occupational_Exposure,Indoor_Pollution,Healthcare_Access,Early_Detection,Treatment_Type,Developed_or_Developing,Annual_Lung_Cancer_Deaths,Lung_Cancer_Prevalence_Rate,Mortality_Rate
0,0,China,1400,80,Male,Yes,30,29,No,No,...,Low,Yes,No,Poor,No,,Developing,690000,2.44,0.0
1,1,Iran,84,53,Male,No,0,0,Yes,No,...,Low,Yes,No,Poor,No,,Developing,27000,2.1,0.0
2,2,Mexico,128,47,Male,Yes,12,6,Yes,No,...,Medium,No,No,Poor,Yes,,Developing,28000,1.11,0.0
3,3,Indonesia,273,39,Female,No,0,0,No,Yes,...,Low,No,No,Poor,No,,Developing,40000,0.75,0.0
4,4,South Africa,59,44,Female,No,0,0,Yes,No,...,Medium,Yes,No,Poor,No,,Developing,15000,2.44,0.0


In [70]:
# Removendo colunas
df.drop(columns=[
    'ID', 
    'Population_Size', 
    'Country', 
    'Mortality_Rate', 
    'Survival_Years', 
    'Cancer_Stage', 
    'Treatment_Type', 
    'Early_Detection',
    'Developed_or_Developing', 
    'Healthcare_Access', 
    'Adenocarcinoma_Type', 
    'Occupational_Exposure', 
    'Annual_Lung_Cancer_Deaths', 
    'Lung_Cancer_Prevalence_Rate'], inplace=True)

In [71]:
null_columns = df.columns[df.isnull().any()].tolist()
display(set(null_columns))

set()

In [72]:
# Removendo colunas nulas
for col in null_columns:
    df.fillna({col: "Unknown"}, inplace=True)

df.head()

display(set(df['Lung_Cancer_Diagnosis']))

{'No', 'Yes'}

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le = LabelEncoder()
# Codificando colunas categóricas
labels_to_encode = [
    "Lung_Cancer_Diagnosis",
    "Gender",
    "Smoker",
    "Passive_Smoker",
    "Indoor_Pollution",
    "Family_History",
]

for label in labels_to_encode:
    df[label] = le.fit_transform(df[label])


ohe = OneHotEncoder(sparse_output=False)
labels_hot_encode = [
    "Air_Pollution_Exposure"
]
for label in labels_hot_encode:
    encoded = ohe.fit_transform(df[[label]])
    df = df.join(pd.DataFrame(encoded, columns=ohe.get_feature_names_out([label])))
    df.drop(columns=[label], inplace=True)


In [90]:
dummies = pd.get_dummies(data=df, drop_first=True)

y = dummies['Lung_Cancer_Diagnosis']

dummies.drop(columns=['Lung_Cancer_Diagnosis'], inplace=True)
X = dummies[features]


dummies.head()

Unnamed: 0,Age,Gender,Smoker,Years_of_Smoking,Cigarettes_per_Day,Passive_Smoker,Family_History,Indoor_Pollution,Air_Pollution_Exposure_High,Air_Pollution_Exposure_Low,Air_Pollution_Exposure_Medium
0,80,1,1,30,29,0,0,0,0.0,1.0,0.0
1,53,1,0,0,0,1,0,0,0.0,1.0,0.0
2,47,1,1,12,6,1,0,0,0.0,0.0,1.0
3,39,0,0,0,0,0,1,0,0.0,1.0,0.0
4,44,0,0,0,0,1,0,0,0.0,0.0,1.0


In [91]:
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [92]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(labels, predictions):
    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    r2 = r2_score(labels, predictions)
    rmse = np.sqrt(mse)

    erors = np.abs(labels - predictions)
    relative_errors = erors / np.abs(labels)
    mape = np.mean(relative_errors) * 100
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R^2:", r2)
    print(f"MAPE: {mape:.2f}%")

In [93]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)

model.fit(X_train,y_train)
y_pred = model.predict(X_test)

evaluate_model(y_test, y_pred)

RMSE: 0.203811662303553
MAE: 0.041539193690937525
R^2: -0.06617697225784158
MAPE: inf%


In [94]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(
    n_estimators=10, 
    max_depth=6, 
    min_samples_split=8, 
    min_samples_leaf=4, 
    max_features=8, 
    random_state=42, 
    n_jobs=-1
)

forest_clf.fit(X_train, y_train.values.ravel())
y_pred = forest_clf.predict(X_test)

evaluate_model(y_test, y_pred)

RMSE: 0.20151937210727844
MAE: 0.04061005733451175
R^2: -0.0423290421636946
MAPE: 100.00%


In [95]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)

RMSE: 0.1959023656000588
MAE: 0.07663833192161841
R^2: 0.014967416584331095
MAPE: inf%


In [None]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(ccp_alpha=0.01,max_depth=6, min_samples_split=8, min_samples_leaf=4, random_state=42)
model.fit(X_train, y_train) 
y_pred = model.predict(X_test)
evaluate_model(y_test, y_pred)

RMSE: 0.19606592470674727
MAE: 0.07665677029116513
R^2: 0.013321920320848002
MAPE: inf%
