Dataset: https://www.kaggle.com/datasets/mysarahmadbhat/lung-cancer

# Attribute Information
**Gender:** M(male), F(female)

**Age:** Age of the patient [years]

**Smoking:** YES = 1, NO = 0.

**YellowFingers:** YES = 1, NO = 0.

**Anxiety:** YES = 1, NO = 0.

**PeerPressure:** YES = 1, NO = 0.

**ChronicDisease:** YES = 1, NO = 0.

**Fatigue:** YES = 1, NO = 0.

**Allergy:** YES = 1, NO = 0.

**Wheezing:** YES = 1, NO = 0.

**AlcoholConsuming:** YES = 1, NO = 0.

**Coughing:** YES = 1, NO = 0.

**BreathShortage:** YES = 1, NO = 0.

**SwallowingDifficulty:** YES = 1, NO = 0.

**ChestPain:** YES = 1, NO = 0.

**LungCancer:** YES = 1, NO = 0.

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [29]:
# Load data: read the CSV (path is relative to the notebook's working directory)
# into `df` and preview the first five rows.
df = pd.read_csv("lung_cancer.csv")
df.head()

Unnamed: 0,Gender,Age,Smoking,YellowFingers,Anxiety,PeerPressure,ChronicDisease,Fatigue,Allergy,Wheezing,AlcoholConsuming,Coughing,BreathShortage,SwallowingDifficulty,ChestPain,LungCancer
0,M,69.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,M,74.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
2,F,59.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
3,M,63.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,F,63.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


# Exploratory data analysis

In [30]:
# (number of rows, number of columns) of the loaded frame
df.shape

(310, 16)

**Inference**
- There are 310 data entries and 16 columns: 15 features and 1 target column (`LungCancer`).

In [None]:
# Column names, non-null counts and dtypes
df.info()

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Basic summary statistics of the numeric columns
df.describe()

In [None]:
# Missing values per column, in descending order (worst columns first)
df.isnull().sum().sort_values(ascending=False)

**Inference**
- There are no null values in the dataset, so there is no need for imputation

## Getting the numerical and categorical features

In [None]:
# Split the columns by dtype: text columns are treated as categorical,
# columns of any numeric dtype (not only float64) as numerical.
categorical = df.select_dtypes(include=['object'])
numerical = df.select_dtypes(include=['number'])

print('Categorical features:\n', categorical.columns.to_list())
print('Numerical features:\n', numerical.columns.to_list())

# Visualization

In [None]:
# Share of Normal vs Lung Cancer cases in the target column.
# value_counts() orders classes by frequency, so the slice labels are derived
# from the class values instead of being hard-coded in an assumed order.
class_counts = df['LungCancer'].value_counts().sort_index()
class_labels = {0: 'Normal [0]', 1: 'Lung Cancer [1]'}
pie_labels = [class_labels.get(int(value), str(value)) for value in class_counts.index]

plt.figure(figsize=(14,7))
plt.pie(class_counts, labels=pie_labels, autopct='%1.1f%%')
plt.title('Target class balance')
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
pd.options.display.float_format = "{:,.3f}".format
plt.figure(figsize=(12,7))
sns.set_context('notebook',font_scale = 0.72)
# Restrict to numeric columns: the string-valued Gender column makes
# DataFrame.corr() raise on recent pandas versions.
corr_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(corr_matrix, annot=True, linewidths=1)
plt.title('Feature correlation matrix')
plt.show()

In [None]:
# Count plots of selected numerical features, split by diagnosis
cols_to_plot = ["Age", "Smoking", "ChronicDisease", "Wheezing", "BreathShortage", "YellowFingers", "ChestPain"]

# Readable hue labels let seaborn build the legend itself, so legend colours
# always match the bars (overriding the labels with plt.legend([...]) binds
# them to whichever bar patches come first, not to the hue levels).
diagnosis = df['LungCancer'].eq(1).map({True: 'Lung Cancer', False: 'Normal'})

for col in cols_to_plot:
    plt.figure(figsize=(12,5))
    sns.countplot(data=df, x=col, hue=diagnosis, hue_order=['Normal', 'Lung Cancer'])
    plt.title(f'{col} by diagnosis')
    plt.tight_layout()
    plt.show()

**Inference**
-

In [None]:
# Count plots of the categorical (text) features, split by diagnosis

# Readable hue labels let seaborn build the legend itself, so legend colours
# always match the bars (overriding the labels with plt.legend([...]) binds
# them to whichever bar patches come first, not to the hue levels).
diagnosis = df['LungCancer'].eq(1).map({True: 'Lung Cancer', False: 'Normal'})

# Iterating over a DataFrame yields its column names
for col in categorical:
    plt.figure(figsize=(10,5))
    sns.countplot(data=df, x=col, hue=diagnosis, hue_order=['Normal', 'Lung Cancer'],
                  edgecolor='black')
    plt.title(f'{col} by diagnosis')
    plt.show()

**Inference**
-

In [40]:
# Pairplot using target LungCancer Column
# sns.pairplot(df, hue='LungCancer')
# plt.show()

In [None]:
# Age distribution for Lung Cancer vs Normal patients.
# sns.distplot is deprecated; histplot with stat='density' and
# common_norm=False reproduces one independently normalised histogram + KDE
# per group.
diagnosis = df['LungCancer'].eq(1).map({True: 'Lung Cancer', False: 'Normal'})
sns.histplot(
    data=df,
    x='Age',
    hue=diagnosis,
    hue_order=['Lung Cancer', 'Normal'],
    palette={'Lung Cancer': 'red', 'Normal': 'green'},
    kde=True,
    stat='density',
    common_norm=False,
)
plt.title('Age distribution by diagnosis')
plt.show()

In [None]:
# Bucket patients into three age bands and compare the band sizes.
# Bands are half-open [low, high) so every boundary age belongs to exactly
# one band; the elderly band includes age 55 itself.
# NOTE(review): patients younger than 29 fall into no band - confirm the
# dataset's minimum age or drop the lower bound.
Young = df[(df.Age >= 29) & (df.Age < 40)]
Middle = df[(df.Age >= 40) & (df.Age < 55)]
Elder = df[df.Age >= 55]

plt.figure(figsize=(10,8))
sns.set_context('notebook',font_scale = 1.5)
sns.barplot(x=['Young ages','Middle ages','Elderly ages'],y=[len(Young),len(Middle),len(Elder)])
plt.ylabel('Number of patients')
plt.show()

In [None]:
# Pie chart of the age-band sizes; the elderly slice is pulled out slightly.
group_names = ['Young ages', 'Middle ages', 'Elderly ages']
group_sizes = [len(group) for group in (Young, Middle, Elder)]
colors = ['blue', 'green', 'yellow']
explode = [0, 0, 0.1]

plt.figure(figsize=(10, 10))
sns.set_context('notebook', font_scale=1.0)
plt.pie(
    group_sizes,
    labels=group_names,
    explode=explode,
    colors=colors,
    autopct='%1.1f%%',
)
plt.tight_layout()

# Data Preprocessing

In [None]:
# Label-encode the categorical (string) columns for tree-based models
from sklearn.preprocessing import LabelEncoder

string_col = df.select_dtypes(include="object").columns

# As we will be using both types of approaches for demonstration, we first do
# label encoding, which will be used with tree-based algorithms.
# Only the string columns are encoded: numeric columns such as Age keep their
# real values instead of being replaced by rank codes.
df_tree = df.copy()
for col in string_col:
    df_tree[col] = LabelEncoder().fit_transform(df_tree[col])
df_tree.head()

This can be used right away in a lot of tree-based models:
    
- Decision trees
- Random forest
- Extra Trees
- Or any kind of boosted trees model
    - XGBoost
    - GBM
    - LightGBM
    
This type of encoding can't be used in linear models, support vector machines, or neural networks because they expect data to be normalized (or standardized) and would treat the integer codes as ordered magnitudes. For these kinds of models we keep a second copy of the data in which the categorical columns are one-hot encoded, as shown below:

In [None]:
## Creating one hot encoded features for working with non tree based algorithms
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer


target="LungCancer"

# Categories unseen at fit time are encoded as all-zero rows instead of raising
ohe = OneHotEncoder(handle_unknown='ignore')

# One-hot encode the string columns and pass every other column through as-is.
transformer = make_column_transformer(
    (ohe, string_col),
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array so it can be wrapped in a DataFrame
    verbose_feature_names_out=False
)

# Fit on the features only; the target column is appended in a later cell.
transformed = transformer.fit_transform(df.drop(columns=target))
df_nontree = pd.DataFrame(
    transformed,
    columns=transformer.get_feature_names_out(),
    index=df.index,  # keep the row labels of df so later index-aligned joins line up
)

df_nontree.head()

In [None]:
# Getting the target column at the end.
# Any existing copy of the target is dropped first so that re-running this
# cell does not append a duplicate column.
df_nontree = pd.concat(
    [df_nontree.drop(columns=target, errors='ignore'), df[target]],
    axis=1,
)
df_nontree.head()

# Training

## Logistic Regression

In [47]:
# Feature columns of the one-hot frame: every column except the target
feature_col_nontree = list(df_nontree.columns)
feature_col_nontree.remove(target)

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_auc_score
from sklearn.preprocessing import RobustScaler,MinMaxScaler,StandardScaler

# Stratified 5-fold cross-validation of logistic regression on the one-hot data.
# acc_log keeps its name but holds one ROC AUC value per fold.
acc_log=[]

kf=model_selection.StratifiedKFold(n_splits=5)
# Stratify on the target column of the frame being split
for fold, (trn_, val_) in enumerate(kf.split(X=df_nontree, y=df_nontree[target])):

    # split() yields positional row indices, so rows are selected with .iloc
    X_train = df_nontree.iloc[trn_][feature_col_nontree]
    y_train = df_nontree.iloc[trn_][target]

    X_valid = df_nontree.iloc[val_][feature_col_nontree]
    y_valid = df_nontree.iloc[val_][target]

    # Fit the scaler on the training fold only so no validation statistics leak in
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    # ROC AUC needs a continuous score: column 1 of predict_proba is the
    # probability of the larger class label (LungCancer == 1)
    y_score = clf.predict_proba(X_valid)[:, 1]

    print(f"Fold {fold + 1}:")
    print(classification_report(y_valid, y_pred))
    auc = roc_auc_score(y_valid, y_score)
    acc_log.append(auc)
    print(f"ROC AUC for fold {fold + 1}: {auc}")


## Naive bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Stratified 5-fold cross-validation of Gaussian naive Bayes on the one-hot data.
# acc_Gauss keeps its name but holds one ROC AUC value per fold.
acc_Gauss=[]
kf=model_selection.StratifiedKFold(n_splits=5)
# Stratify on the target column of the frame being split
for fold, (trn_, val_) in enumerate(kf.split(X=df_nontree, y=df_nontree[target])):

    # split() yields positional row indices, so rows are selected with .iloc
    X_train = df_nontree.iloc[trn_][feature_col_nontree]
    y_train = df_nontree.iloc[trn_][target]

    X_valid = df_nontree.iloc[val_][feature_col_nontree]
    y_valid = df_nontree.iloc[val_][target]

    # Fit the scaler on the training fold only so no validation statistics leak in
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    # ROC AUC needs a continuous score: column 1 of predict_proba is the
    # probability of the larger class label (LungCancer == 1)
    y_score = clf.predict_proba(X_valid)[:, 1]

    print(f"Fold {fold + 1}:")
    print(classification_report(y_valid, y_pred))
    auc = roc_auc_score(y_valid, y_score)
    acc_Gauss.append(auc)
    print(f"ROC AUC for fold {fold + 1}: {auc}")

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Stratified 5-fold cross-validation of a 15-nearest-neighbours classifier on
# the one-hot data. acc_KNN keeps its name but holds one ROC AUC value per fold.
acc_KNN=[]
kf=model_selection.StratifiedKFold(n_splits=5)
# Stratify on the target column of the frame being split
for fold, (trn_, val_) in enumerate(kf.split(X=df_nontree, y=df_nontree[target])):

    # split() yields positional row indices, so rows are selected with .iloc
    X_train = df_nontree.iloc[trn_][feature_col_nontree]
    y_train = df_nontree.iloc[trn_][target]

    X_valid = df_nontree.iloc[val_][feature_col_nontree]
    y_valid = df_nontree.iloc[val_][target]

    # Fit the scaler on the training fold only so no validation statistics leak in
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_valid = scaler.transform(X_valid)

    clf = KNeighborsClassifier(n_neighbors=15)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    # ROC AUC needs a continuous score: column 1 of predict_proba is the
    # fraction of neighbours voting for the larger class label (LungCancer == 1)
    y_score = clf.predict_proba(X_valid)[:, 1]

    print(f"Fold {fold + 1}:")
    print(classification_report(y_valid, y_pred))
    auc = roc_auc_score(y_valid, y_score)
    acc_KNN.append(auc)
    print(f"ROC AUC for fold {fold + 1}: {auc}")


In [None]:
# Per-fold ROC AUC scores collected by the KNN cross-validation loop
acc_KNN

## Decision Tree

In [None]:
# Feature columns of the label-encoded frame: every column except the target
feature_col_tree = list(df_tree.columns)
feature_col_tree.remove(target)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Stratified 5-fold cross-validation of an entropy-based decision tree on the
# label-encoded data. acc_Dtree keeps its name but holds one ROC AUC per fold.
# After the loop, `clf` is the tree fitted on the last fold.
acc_Dtree=[]
kf=model_selection.StratifiedKFold(n_splits=5)
# Stratify on the target column of the frame being split
for fold, (trn_, val_) in enumerate(kf.split(X=df_tree, y=df_tree[target])):

    # split() yields positional row indices, so rows are selected with .iloc
    X_train = df_tree.iloc[trn_][feature_col_tree]
    y_train = df_tree.iloc[trn_][target]

    X_valid = df_tree.iloc[val_][feature_col_tree]
    y_valid = df_tree.iloc[val_][target]

    # Fixed seed so tie-breaking between equally good splits is reproducible
    clf = DecisionTreeClassifier(criterion="entropy", random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_valid)
    # ROC AUC needs a continuous score: column 1 of predict_proba is the
    # probability of the larger class label (LungCancer == 1)
    y_score = clf.predict_proba(X_valid)[:, 1]

    print(f"Fold {fold + 1}:")
    print(classification_report(y_valid, y_pred))
    auc = roc_auc_score(y_valid, y_score)
    acc_Dtree.append(auc)
    print(f"ROC AUC for fold {fold + 1}: {auc}")


In [None]:
import graphviz
from sklearn import tree

# Export the fitted decision tree to DOT.
# NOTE(review): `clf` is whichever classifier was fitted most recently -
# presumably the last-fold decision tree from the cell above; confirm the
# cells are run in order.
# class_names must be a sequence of names in ascending order of the class
# labels (0 -> Normal, 1 -> Lung Cancer); a plain string would be indexed
# character by character.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=feature_col_tree,
                                class_names=['Normal', 'Lung Cancer'],
                                filled=True)

# Draw graph
graph = graphviz.Source(dot_data, format="png")
graph