## Importing Libraries

In [242]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [243]:
data = pd.read_csv("../survey lung cancer.csv")
data

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,F,56,1,1,1,2,2,2,1,1,2,2,2,2,1,YES
305,M,70,2,1,1,1,1,2,2,2,2,2,2,1,2,YES
306,M,58,2,1,1,1,1,1,2,2,2,2,1,1,2,YES
307,M,67,2,1,2,1,1,2,2,1,2,2,2,1,2,YES


### Data Categorization
GENDER: M(male), F(female)
AGE: Age of the patient
SMOKING: YES=2 , NO=1.
YELLOW_FINGERS: YES=2 , NO=1.
ANXIETY: YES=2 , NO=1.
PEER_PRESSURE : YES=2 , NO=1.
CHRONIC DISEASE: YES=2 , NO=1.
FATIGUE : YES=2 , NO=1.
ALLERGY: YES=2 , NO=1.
WHEEZING: YES=2 , NO=1.
ALCOHOL CONSUMING: YES=2 , NO=1.
COUGHING: YES=2 , NO=1.
SHORTNESS OF BREATH: YES=2 , NO=1.
SWALLOWING DIFFICULTY:YES=2 , NO=1.
CHEST PAIN:YES=2 , NO=1.
LUNG_CANCER:YES=2 , NO=1.



In [244]:
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [245]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [246]:
data.isnull()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
305,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
306,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
307,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### Data Processing
#### Converting categorical variables of the dataset into numerical variables - using ONE HOT ENCODING technique

In [247]:
data.GENDER.unique()

array(['M', 'F'], dtype=object)

In [248]:
data.LUNG_CANCER.unique()

array(['YES', 'NO'], dtype=object)

In [249]:
data['GENDER']= data['GENDER'].apply({'M':1, 'F':0}.get)

In [250]:
data['LUNG_CANCER']= data['LUNG_CANCER'].apply({'YES':1, 'NO':0}.get)

In [251]:
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,1,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,0,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,1,63,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,0,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0


### Dividing the dataset into dependent and independent columns

In [252]:
X= data.drop('LUNG_CANCER',axis=1)
y=data['LUNG_CANCER']
X

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
0,1,69,1,2,2,1,1,2,1,2,2,2,2,2,2
1,1,74,2,1,1,1,2,2,2,1,1,1,2,2,2
2,0,59,1,1,1,2,1,2,1,2,1,2,2,1,2
3,1,63,2,2,2,1,1,1,1,1,2,1,1,2,2
4,0,63,1,2,1,1,1,1,1,2,1,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,0,56,1,1,1,2,2,2,1,1,2,2,2,2,1
305,1,70,2,1,1,1,1,2,2,2,2,2,2,1,2
306,1,58,2,1,1,1,1,1,2,2,2,2,1,1,2
307,1,67,2,1,2,1,1,2,2,1,2,2,2,1,2


In [253]:
y

0      1
1      1
2      0
3      0
4      0
      ..
304    1
305    1
306    1
307    1
308    1
Name: LUNG_CANCER, Length: 309, dtype: int64

### Splitting the dataset into training and testing set
#### 20% of the dataset will be used for testing(evaluation) and 80% of the data will be used for training purposes

In [254]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

### Logistic Regression

In [255]:
logmodel=LogisticRegression()
logmodel.fit(X_train,y_train)
y_pred=logmodel.predict(X_test)
#calculating accuracy of the Classification model
log_cm=confusion_matrix(y_test,y_pred)
log_ac=accuracy_score(y_test,y_pred)
print(log_cm)
print(log_ac)

[[ 3  3]
 [ 3 53]]
0.9032258064516129


### Decision Tree Algorithm

In [256]:
tree_classifier=DecisionTreeClassifier()
tree_classifier.fit(X_train,y_train)
predictions=tree_classifier.predict(X_test)
#calculating accuracy of the Classification model
dc_cm=confusion_matrix(y_test,predictions)
dc_ac=accuracy_score(y_test,predictions)
print(dc_cm)
print(dc_ac)

[[ 4  2]
 [ 5 51]]
0.8870967741935484


### KNN Algorithm

In [257]:
knn=KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train,y_train)
predictions=knn.predict(X_test)
#calculating accuracy of the Classification model
knn_cm=confusion_matrix(y_test,predictions)
knn_ac=accuracy_score(y_test,predictions)
print(knn_cm)
print(knn_ac)

[[ 0  6]
 [ 2 54]]
0.8709677419354839


### Random Forest Algorithm

In [258]:
classifier=RandomForestClassifier()
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
#calculating accuracy of the Classification model
rf_cm=confusion_matrix(y_test,y_pred)
rf_ac=accuracy_score(y_test,y_pred)
print(rf_cm)
print(rf_ac)

[[ 3  3]
 [ 3 53]]
0.9032258064516129


### Support Vector Machine

In [259]:
svc_model=SVC()
svc_model.fit(X_train,y_train)
predictions=svc_model.predict(X_test)
#calculating accuracy of the Classification model
svm_cm=confusion_matrix(y_test,predictions)
svm_ac=accuracy_score(y_test,predictions)
print(svm_cm)
print(svm_ac)

[[ 0  6]
 [ 0 56]]
0.9032258064516129


In [260]:
print(f"logistic regression Accuracy {log_ac*100}")
print(f"Decision Tree Accuracy {dc_ac*100}")
print(f"KNN Accuracy {knn_ac*100}")
print(f"Random Forest Accuracy {rf_ac*100}")
print(f"Support Vector Machine Accuracy {svm_ac*100}")

logistic regression Accuracy 90.32258064516128
Decision Tree Accuracy 88.70967741935483
KNN Accuracy 87.09677419354838
Random Forest Accuracy 90.32258064516128
Support Vector Machine Accuracy 90.32258064516128


### To find the Minimum and Maximum value of the dataset

In [266]:
# For Multiple columns to exclude use,
# cols = [x for x in df.columns if x not in ('mpg', 'name')]
# cols

cols = [X for X in data.columns if X not in ('LUNG_CANCER')]
cols

print("{")
for i,name in enumerate(cols):
    print(f'"{name}":{{"min":{data[name].min()},"max":{data[name].max()}}}{"," if i<(len(cols)-1) else ""}')
print("}")

{
"GENDER":{"min":0,"max":1},
"AGE":{"min":21,"max":87},
"SMOKING":{"min":1,"max":2},
"YELLOW_FINGERS":{"min":1,"max":2},
"ANXIETY":{"min":1,"max":2},
"PEER_PRESSURE":{"min":1,"max":2},
"CHRONIC DISEASE":{"min":1,"max":2},
"FATIGUE ":{"min":1,"max":2},
"ALLERGY ":{"min":1,"max":2},
"WHEEZING":{"min":1,"max":2},
"ALCOHOL CONSUMING":{"min":1,"max":2},
"COUGHING":{"min":1,"max":2},
"SHORTNESS OF BREATH":{"min":1,"max":2},
"SWALLOWING DIFFICULTY":{"min":1,"max":2},
"CHEST PAIN":{"min":1,"max":2}
}


In [261]:
X

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
0,1,69,1,2,2,1,1,2,1,2,2,2,2,2,2
1,1,74,2,1,1,1,2,2,2,1,1,1,2,2,2
2,0,59,1,1,1,2,1,2,1,2,1,2,2,1,2
3,1,63,2,2,2,1,1,1,1,1,2,1,1,2,2
4,0,63,1,2,1,1,1,1,1,2,1,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,0,56,1,1,1,2,2,2,1,1,2,2,2,2,1
305,1,70,2,1,1,1,1,2,2,2,2,2,2,1,2
306,1,58,2,1,1,1,1,1,2,2,2,2,1,1,2
307,1,67,2,1,2,1,1,2,2,1,2,2,2,1,2


In [262]:
data_new = {'GENDER': 0, 'AGE': 65, 'SMOKING': 1, 'YELLOW_FINGERS': 2, 'ANXIETY': 2,
            'PEER_PRESSURE': 1, 'CHRONIC DISEASE': 1, 'FATIGUE': 2, 'ALLERGY': 2, 'WHEEZING': 2,
            'ALCOHOL CONSUMING': 2, 'COUGHING': 2, 'SHORTNESS OF BREATH': 2, 'SWALLOWING DIFFICULTY':2, 'CHEST PAIN':1}
index = [1]  # serial number

In [263]:
my_data = pd.DataFrame(data_new, index)

In [264]:
print(my_data)

   GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
1       0   65        1               2        2              1   

   CHRONIC DISEASE  FATIGUE  ALLERGY  WHEEZING  ALCOHOL CONSUMING  COUGHING  \
1                1        2        2         2                  2         2   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN  
1                    2                      2           1  


In [265]:
medical_details = svc_model.predict(my_data)
print(f" The Lung Cancer prediction is :{medical_details}")

 The Lung Cancer prediction is :[1]
