In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score  
from joblib import dump

In [17]:
# Read the survey CSV into the raw frame `df`; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="survey_lung_cancer.csv")

In [18]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched
# and can still be read from by later cells.
df_copy = df.copy(deep=True)

In [19]:
# Create the LabelEncoder instance used by the following cells.
# Nothing is encoded here: later cells call `le.fit_transform(...)` once per
# column, and every such call re-fits this same encoder on that column.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [20]:
# Remove the label column (LUNG_CANCER) and eleven feature columns from the
# working frame. The feature columns are not discarded for good: the encoding
# cell assigns every one of them back from `df`, so they end up after whatever
# columns survive this drop. The drop therefore determines the final column
# order seen by the model.
columns_to_remove = [
    "SMOKING",
    "YELLOW_FINGERS",
    "ANXIETY",
    "PEER_PRESSURE",
    "CHRONIC DISEASE",
    "WHEEZING",
    "ALCOHOL CONSUMING",
    "COUGHING",
    "SHORTNESS OF BREATH",
    "SWALLOWING DIFFICULTY",
    "CHEST PAIN",
    "LUNG_CANCER",
]
df_copy = df_copy.drop(columns=columns_to_remove)

In [21]:
# Label-encode the feature columns, always reading the raw values from `df`.
# The order of this tuple matters: columns that are no longer present in
# `df_copy` are appended in exactly this order, while columns that are still
# present are overwritten in place.
# NOTE: "FATIGUE " and "ALLERGY " carry a trailing space in the column label.
feature_columns = (
    "FATIGUE ",
    "ALLERGY ",
    "SMOKING",
    "YELLOW_FINGERS",
    "CHRONIC DISEASE",
    "ALCOHOL CONSUMING",
    "COUGHING",
    "SHORTNESS OF BREATH",
    "SWALLOWING DIFFICULTY",
    "CHEST PAIN",
    "WHEEZING",
    "PEER_PRESSURE",
    "ANXIETY",
    "GENDER",
)
for feature in feature_columns:
    # fit_transform re-fits the shared encoder on this single column, so the
    # mapping learned for the previous column is not kept.
    df_copy[feature] = le.fit_transform(df[feature])

In [22]:
# Encode the label column into integer class ids for the classifier.
# LabelEncoder numbers the classes in sorted order of the raw values
# (presumably "NO" -> 0, "YES" -> 1 for this dataset; confirm against the CSV).
lung_cancer_labels = df["LUNG_CANCER"]
target = le.fit_transform(lung_cancer_labels)

In [23]:
# Fill missing values in the AGE column with the column mean.
# The filled Series is assigned back to the frame instead of calling
# fillna(..., inplace=True) on `df_copy["AGE"]`: that chained in-place call
# operates on a temporary selection, emits a warning on pandas 2.x and leaves
# `df_copy` unchanged when Copy-on-Write is active.
avg = df_copy["AGE"].mean()
df_copy["AGE"] = df_copy["AGE"].fillna(avg)

In [24]:
# Replace AGE (in years) by an ordinal bucket code:
#   0 -> 0 <= age <= 30
#   1 -> 30 < age <= 60
#   2 -> age > 60
# np.select takes the first matching condition, so the shared boundaries
# 30 and 60 fall into the lower bucket. The top bucket is open-ended so that
# ages above 90 get a bucket code too instead of staying as raw years next to
# the codes 0/1/2. Values matching no condition (negative or missing ages)
# are passed through unchanged.
# Run this cell once per freshly built `df_copy`: a second run would see the
# codes 0/1/2 as ages and map all of them to bucket 0.
age_years = df_copy["AGE"]
df_copy["AGE"] = np.select(
    [
        age_years.between(0, 30),
        age_years.between(30, 60),
        age_years > 60,
    ],
    [0, 1, 2],
    default=age_years,
)


In [25]:
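# Template (kept commented out) of a single input row for prediction: one value per feature,
# in this order. The names are placeholders, not the exact column labels of the training frame.
#input_features = [GENDER, AGE, FATIGUE, ALLERGY, SMOKING, YELLOW_FINGERS, CHRONIC_DISEASE, ALCOHOL_CONSUMING, COUGHING, SHORTNESS_OF_BREATH, SWALLOWING_DIFFICULTY, CHEST_PAIN, WHEEZING, PEER_PRESSURE, ANXIETY]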

In [26]:
# Feature matrix and label vector, then a seeded 75% / 25% train/test split
# (random_state=42 keeps the split identical across runs).
x, y = df_copy, target
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)


In [27]:
# Train a 50-tree random forest on the training split and predict the
# held-out rows. random_state is fixed (same seed as the train/test split) so
# that the fitted forest, the reported metrics and the saved model file are
# reproducible on a Restart & Run All; without it every run builds a
# different forest.
random_forest_model = RandomForestClassifier(n_estimators=50, random_state=42)
random_forest_model.fit(x_train, y_train)
y_pred = random_forest_model.predict(x_test)



In [28]:
# Score the held-out predictions. precision_score and recall_score are called
# with their defaults, i.e. they report the metric for the class encoded as 1.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# One "<label> <value>" line per metric.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(metric_label, metric_value)

Accuracy: 0.9743589743589743
Precision: 0.9863013698630136
Recall: 0.9863013698630136


In [29]:
# Persist the fitted forest with joblib, relative to the working directory.
# dump() returns the list of written file names, which the notebook shows as
# the cell output.
dump(value=random_forest_model, filename='random_forest_modelnew.joblib')

['random_forest_modelnew.joblib']

In [30]:
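# Example usage (kept commented out): predict the class for one sample whose values follow
# the `input_features` order listed earlier in the notebook.
#output = random_forest_model.predict([input_features])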