In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv(r'lung_cancer.csv')


In [3]:
# Selecting a subset of features (independent variables)
X = data[['AGE', 'SMOKING', 'COUGHING', 'CHEST_PAIN']]  # Features

# Setting the target variable
y = data['LUNG_CANCER']  # Target


In [4]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
data['GENDER'] = LabelEncoder().fit_transform(data['GENDER'])
data['SMOKING'] = LabelEncoder().fit_transform(data['SMOKING'])
data['ANXIETY'] = LabelEncoder().fit_transform(data['ANXIETY'])


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [6]:
print(data.isnull().sum())


GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC_DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL_CONSUMING        0
COUGHING                 0
SHORTNESS_OF_BREATH      0
SWALLOWING_DIFFICULTY    0
CHEST_PAIN               0
LUNG_CANCER              0
dtype: int64


In [7]:
# Drop rows where the 'LUNG_CANCER' column has missing values
data = data.dropna(subset=['LUNG_CANCER'])



In [8]:
# Selecting the features (independent variables)
X = data[['AGE', 'SMOKING', 'COUGHING', 'CHEST_PAIN']]  # Features

# Setting the target variable
y = data['LUNG_CANCER']  # Target


In [9]:
# Fill missing values in the 'AGE' column with the mean value of that column
data['AGE'].fillna(data['AGE'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['AGE'].fillna(data['AGE'].mean(), inplace=True)


In [10]:
data = data.dropna()


In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Encode categorical target variable 'LUNG_CANCER'
data['LUNG_CANCER'] = LabelEncoder().fit_transform(data['LUNG_CANCER'])

# Select features and target variable
X = data[['AGE', 'SMOKING', 'COUGHING', 'CHEST_PAIN']]  # Features
y = data['LUNG_CANCER']  # Target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create and train the model
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)



In [14]:
import joblib
joblib.dump(model, 'lung_cancer_model.pkl')


['lung_cancer_model.pkl']

In [12]:
pip install streamlit

Collecting streamlitNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.0.0 (from streamlit)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting click<9,>=7.0 (from streamlit)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-18.1.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting rich<14,>=10.14.0 (from streamlit)
  Downloading rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.3 k