In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

Data Collection and Preprocessing

In [2]:
#Ignore warning messages
warnings.filterwarnings('ignore')

In [3]:
# Load the lung-cancer CSV into a pandas DataFrame.
# The path is a raw string: '\m' and '\l' are not valid escape sequences, so a
# plain literal triggers an invalid-escape warning on recent Python versions.
# The r-prefix keeps the path value identical.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
lung_data = pd.read_csv(r'D:\machine learning\lung_cancer.csv')

In [4]:
# preview the first five rows of the dataset
lung_data.head(n=5)

Unnamed: 0,Patient Id,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swal0ing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
0,P1,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,0
1,P103,52,2,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,0
2,P104,28,2,3,1,4,3,2,3,4,...,3,2,2,4,2,2,3,4,3,0
3,P110,27,2,3,1,4,2,3,2,3,...,2,2,3,4,1,5,2,6,2,0
4,P128,33,1,2,4,5,4,3,2,2,...,3,4,2,2,3,1,2,3,4,0


In [5]:
# dimensions of the dataset as (row count, column count)
lung_data.shape

(1000, 25)

In [6]:
# summary of the frame: column names, non-null counts, and dtypes
lung_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Patient Id                1000 non-null   object
 1   Age                       1000 non-null   int64 
 2   Gender                    1000 non-null   int64 
 3   Air Pollution             1000 non-null   int64 
 4   Alcohol use               1000 non-null   int64 
 5   Dust Allergy              1000 non-null   int64 
 6   OccuPational Hazards      1000 non-null   int64 
 7   Genetic Risk              1000 non-null   int64 
 8   chronic Lung Disease      1000 non-null   int64 
 9   Balanced Diet             1000 non-null   int64 
 10  Obesity                   1000 non-null   int64 
 11  Smoking                   1000 non-null   int64 
 12  Passive Smoker            1000 non-null   int64 
 13  Chest Pain                1000 non-null   int64 
 14  Coughing of Blood        

In [8]:
# distinct values present in the 'Level' column (used later as the target Y)
lung_data['Level'].unique()

array([0, 1], dtype=int64)

In [9]:
# count the missing values in each column
lung_data.isna().sum()

Patient Id                  0
Age                         0
Gender                      0
Air Pollution               0
Alcohol use                 0
Dust Allergy                0
OccuPational Hazards        0
Genetic Risk                0
chronic Lung Disease        0
Balanced Diet               0
Obesity                     0
Smoking                     0
Passive Smoker              0
Chest Pain                  0
Coughing of Blood           0
Fatigue                     0
Weight Loss                 0
Shortness of Breath         0
Wheezing                    0
Swal0ing Difficulty         0
Clubbing of Finger Nails    0
Frequent Cold               0
Dry Cough                   0
Snoring                     0
Level                       0
dtype: int64

In [10]:
# descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
lung_data.describe()

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swal0ing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring,Level
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,37.174,1.402,3.84,4.563,5.165,4.84,4.58,4.38,4.491,4.465,...,3.856,3.855,4.24,3.777,3.746,3.923,3.536,3.853,2.926,0.561
std,12.005493,0.490547,2.0304,2.620477,1.980833,2.107805,2.126999,1.848518,2.135528,2.124921,...,2.244616,2.206546,2.285087,2.041921,2.270383,2.388048,1.832502,2.039007,1.474686,0.496513
min,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,27.75,1.0,2.0,2.0,4.0,3.0,2.0,3.0,2.0,3.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0
50%,36.0,1.0,3.0,5.0,6.0,5.0,5.0,4.0,4.0,4.0,...,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,1.0
75%,45.0,2.0,6.0,7.0,7.0,7.0,7.0,6.0,7.0,7.0,...,5.0,6.0,6.0,5.0,5.0,5.0,5.0,6.0,4.0,1.0
max,73.0,2.0,8.0,8.0,8.0,8.0,7.0,7.0,7.0,7.0,...,9.0,8.0,9.0,8.0,8.0,9.0,7.0,7.0,7.0,1.0


In [11]:
# Find duplicate rows.
# 'Patient Id' is left out of the comparison: it is presumably unique per
# record, so including it would make every row look distinct even when all
# measured attributes match. errors='ignore' keeps the cell re-runnable once
# the ID column has been dropped from lung_data.
comparison_columns = lung_data.drop(columns=['Patient Id'], errors='ignore')
duplicate_row = lung_data[comparison_columns.duplicated()]
print("Duplicates: ", duplicate_row.shape)

Duplicates:  (0, 25)


In [12]:
# Drop the identifier column, which is not helpful for the analysis.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: running it a second time does not raise KeyError.
lung_data = lung_data.drop(columns=['Patient Id'], errors='ignore')

In [13]:
# Bind the cleaned data to finalDataset with a fresh default integer index
# (drop=True discards the old index instead of inserting it as a column)
finalDataset = lung_data.reset_index(drop=True)

In [14]:
# class balance: number of rows for each value of the 'Level' target column
finalDataset['Level'].value_counts()

1    561
0    439
Name: Level, dtype: int64

Splitting the Features and Target

In [15]:
# Feature frame: every column of finalDataset except the 'Level' target
x_features = finalDataset.drop(columns='Level')

In [16]:
# X holds the predictor columns, Y the 'Level' labels to be predicted
X, Y = x_features, finalDataset['Level']

In [17]:
# first row of the feature frame
X.iloc[:1]

Unnamed: 0,Age,Gender,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,...,Coughing of Blood,Fatigue,Weight Loss,Shortness of Breath,Wheezing,Swal0ing Difficulty,Clubbing of Finger Nails,Frequent Cold,Dry Cough,Snoring
0,33,1,2,4,5,4,3,2,2,4,...,4,3,4,2,2,3,1,2,3,4


Splitting the Data into Training Data & Test Data

In [28]:
# 80/20 split with a fixed seed, stratified on Y
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
    stratify=Y,
)

In [29]:
# shapes of the full, training, and test feature frames
print(*(frame.shape for frame in (X, X_train, X_test)))

(1000, 23) (800, 23) (200, 23)


Model Training

Decision Tree Classifier

In [30]:
# A fixed random_state makes the fitted tree, and every metric reported
# below, reproducible across kernel restarts. Without it scikit-learn's
# feature permutation at each split can produce a different tree per run.
model = DecisionTreeClassifier(random_state=0)

In [31]:
# fit the Decision Tree classifier on the training split (X_train, Y_train)
model.fit(X_train, Y_train)

DecisionTreeClassifier()

Model Evaluation

Accuracy Score


In [37]:
# predicted class labels for the held-out test features
y_pred = model.predict(X_test)
# class-probability estimates for the same test rows
y_scores = model.predict_proba(X_test)

In [38]:
from sklearn.metrics import classification_report

# per-class precision, recall, and F1 on the test split
report_text = classification_report(Y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.87      0.75      0.80        88
           1       0.82      0.91      0.86       112

    accuracy                           0.84       200
   macro avg       0.85      0.83      0.83       200
weighted avg       0.84      0.84      0.84       200



In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# precision_score and recall_score default to average='binary': they score
# only the positive class (label 1) and are not averaged over both classes,
# so the printed labels name that class. pos_label=1 is the default, spelled
# out here to make the choice visible.
print('Accuracy:', accuracy_score(Y_test, y_pred))
print("Precision (class 1):", precision_score(Y_test, y_pred, pos_label=1))
print("Recall (class 1):", recall_score(Y_test, y_pred, pos_label=1))

Accuracy: 0.84
Overall Precision: 0.8225806451612904
Overall Recall: 0.9107142857142857


In [40]:
# Confusion matrix of the true labels (Y_test) against the predictions (y_pred)
from sklearn.metrics import confusion_matrix

mcm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
print(mcm)

[[ 66  22]
 [ 10 102]]


In [43]:
# accuracy on test data
# accuracy_score takes (y_true, y_pred): ground-truth labels go first.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.84


Building a Predictive System

In [27]:
# Feature values for a single patient, listed in the same order as the columns of X
input_data = (33,1,2,4,5,4,3,2,2,4,3,2,2,4,3,4,2,2,3,1,2,3,4)

# Wrap the sample in a one-row DataFrame carrying the training column names.
# The model was fitted on a DataFrame, so predicting on a bare NumPy array
# makes scikit-learn warn about missing feature names (hidden by the blanket
# warnings filter). Building the frame against X.columns also raises
# immediately if the number of values does not match the number of features.
input_frame = pd.DataFrame([input_data], columns=X.columns)

prediction = model.predict(input_frame)
print(prediction)

# Level 0 is reported as "no lung cancer"; any other label as lung cancer
if prediction[0] == 0:
    print('The Person does not have Lung Cancer')
else:
    print('The Person have Lung cancer')


[0]
The Person does not have Lung Cancer
