In [37]:
import pandas as pd
# Load the raw survey file and strip stray leading/trailing whitespace from the
# header labels so that later cells can refer to columns by their clean names.
df = pd.read_csv('binarylung.csv').rename(columns=str.strip)

### Exploratory Data Analysis

In [6]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(309, 16)

In [7]:
# Preview the first five records (five is the default for head()).
df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [8]:
# List the column labels.
# NOTE(review): the saved output of this cell still shows 'FATIGUE ' and 'ALLERGY '
# with trailing spaces, so it apparently predates the header-stripping done when
# the file is loaded; re-run the notebook top to bottom to refresh it.
df.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')

In [10]:
# Storage type of every column; per the saved output only GENDER and LUNG_CANCER
# are non-numeric (object), everything else is int64.
df.dtypes

GENDER                   object
AGE                       int64
SMOKING                   int64
YELLOW_FINGERS            int64
ANXIETY                   int64
PEER_PRESSURE             int64
CHRONIC DISEASE           int64
FATIGUE                   int64
ALLERGY                   int64
WHEEZING                  int64
ALCOHOL CONSUMING         int64
COUGHING                  int64
SHORTNESS OF BREATH       int64
SWALLOWING DIFFICULTY     int64
CHEST PAIN                int64
LUNG_CANCER              object
dtype: object

In [11]:
# Number of missing values in each column.
df.isna().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

### Data Visualization

In [12]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import numpy as np

In [13]:
# Histogram of AGE over all records.
fig = px.histogram(
    data_frame=df,
    x='AGE',
    nbins=20,
    title='Distribution of AGE',
)
fig.show()


In [14]:
# Number of records per GENDER, with one bar per LUNG_CANCER outcome side by side.
# px.bar draws one mark per data row and does no aggregation, so without an
# explicit pre-computed y it cannot show a count distribution; px.histogram
# counts the rows in each GENDER category, which is what the title describes.
fig = px.histogram(
    df,
    x='GENDER',
    color='LUNG_CANCER',
    barmode='group',
    title='Distribution of GENDER by LUNG_CANCER',
)
fig.show()


In [16]:
# AGE spread for each GENDER, split by LUNG_CANCER outcome.
fig = px.box(
    data_frame=df,
    x='GENDER',
    y='AGE',
    color='LUNG_CANCER',
    title='Box Plot of AGE by GENDER and LUNG_CANCER',
)
fig.show()


In [18]:
# Pairwise scatter plots for a handful of features, coloured by outcome.
pair_plot_features = ['AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY']
fig = px.scatter_matrix(
    df,
    dimensions=pair_plot_features,
    color='LUNG_CANCER',
    title='Pair Plot of Selected Features',
)
fig.show()


In [21]:
# Individual AGE values for each LUNG_CANCER outcome, drawn as a strip plot.
fig = px.strip(
    data_frame=df,
    x='LUNG_CANCER',
    y='AGE',
    title='Swarm Plot of AGE by LUNG_CANCER',
)
fig.show()


In [22]:
# AGE against the SMOKING code, coloured by outcome.
fig = px.scatter(
    data_frame=df,
    x='AGE',
    y='SMOKING',
    color='LUNG_CANCER',
    title='Scatter Plot of AGE vs SMOKING',
)
fig.show()


In [23]:
# AGE spread for each LUNG_CANCER outcome.
# NOTE(review): the title says "Boxen" but px.box draws a standard box plot.
fig = px.box(
    data_frame=df,
    x='LUNG_CANCER',
    y='AGE',
    color='LUNG_CANCER',
    title='Boxen Plot of AGE by LUNG_CANCER',
)
fig.show()


In [31]:
# Hierarchical breakdown of the records: GENDER -> SMOKING code -> outcome.
sunburst_levels = ['GENDER', 'SMOKING', 'LUNG_CANCER']
fig = px.sunburst(
    df,
    path=sunburst_levels,
    title='Sunburst Chart of GENDER, SMOKING, and LUNG_CANCER',
)
fig.show()


In [40]:
# How many lung-cancer-positive patients of each gender report each symptom.
positive_cases = df[df['LUNG_CANCER'] == 'YES']

symptom_columns = ['SMOKING', 'YELLOW_FINGERS', 'ANXIETY', 'PEER_PRESSURE',
                   'CHRONIC DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
                   'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
                   'SWALLOWING DIFFICULTY', 'CHEST PAIN']

# The symptom columns hold the codes 1 and 2, so summing them directly yields
# (n_absent + 2 * n_present) rather than a count. Count the cells equal to the
# "present" code instead.
# NOTE(review): assumes 2 = YES / 1 = NO, as documented for the public
# survey-lung-cancer dataset -- confirm that this file uses the same encoding.
SYMPTOM_PRESENT_CODE = 2

symptom_summary = (
    positive_cases[symptom_columns]
    .eq(SYMPTOM_PRESENT_CODE)
    .groupby(positive_cases['GENDER'])
    .sum()
    .reset_index()
)

# Long format: one row per (GENDER, Symptom) pair, ready for a grouped bar chart.
symptom_summary_melted = symptom_summary.melt(id_vars='GENDER', var_name='Symptom', value_name='Count')

fig = px.bar(symptom_summary_melted, x='Symptom', y='Count', color='GENDER',
             title="Gender-wise Positive Cases' Symptoms", barmode='group')

fig.show()


In [39]:
# Sanity check: print the column labels; the saved output shows them without
# trailing spaces, which the symptom-summary cell relies on.
print(df.columns)


Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')


### Data Preprocessing

In [43]:
from scipy import stats


# Flag rows in which any numeric feature lies more than `threshold` standard
# deviations from its column mean (absolute z-score), then report them.
threshold = 3

X = df.drop(columns='LUNG_CANCER')
numeric_features = X.select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_features))
outliers = (z_scores > threshold).any(axis=1)  # one boolean per row

num_outliers = np.sum(outliers)
print(f"Number of outliers detected: {num_outliers}")

if num_outliers == 0:
    print("No outliers detected.")
else:
    outlier_entries = df[outliers]
    print("Outlier entries:")
    print(outlier_entries.head())

Number of outliers detected: 2
Outlier entries:
    GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
22       F   21        2               1        1              1   
238      F   38        1               2        1              1   

     CHRONIC DISEASE  FATIGUE  ALLERGY  WHEEZING  ALCOHOL CONSUMING  COUGHING  \
22                 2        2        2         1                  1         1   
238                2        2        2         2                  1         2   

     SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN LUNG_CANCER  
22                     2                      1           1          NO  
238                    2                      1           2         YES  


In [44]:
# Recompute the z-score outlier mask, drop the flagged rows and persist the
# cleaned frame so the modelling cells can start from the CSV on disk.
X = df.drop(columns='LUNG_CANCER')
y = df['LUNG_CANCER']

threshold = 3
numeric_features = X.select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_features))
outliers = (z_scores > threshold).any(axis=1)  # True where any feature exceeds the threshold
num_outliers = np.sum(outliers)

# Keep only the rows that were not flagged.
dfno = df[~outliers]

print(f"Number of outliers detected: {num_outliers}")
print(f"Size of the new dataset (without outliers): {dfno.shape}")

# index=False: the row index is not written, so it is renumbered on reload.
dfno.to_csv('binarylung_no_outliers.csv', index=False)

Number of outliers detected: 2
Size of the new dataset (without outliers): (307, 16)


In [45]:
# Reload the outlier-free data from disk and preview the first five rows.
dfno = pd.read_csv('binarylung_no_outliers.csv')
dfno.head()


Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


### Modelling

In [50]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score


# Compare four classifiers on the outlier-free data: tune each one with a
# 5-fold grid search on the training split, then score the refitted best
# estimator on the held-out test split.

# --- Features / target ------------------------------------------------------
X = dfno.drop('LUNG_CANCER', axis=1)
y = dfno['LUNG_CANCER']

# One encoder per column, so the GENDER mapping is not overwritten when the
# target is encoded. `label_encoder` keeps holding the target mapping, as before.
gender_encoder = LabelEncoder()
X['GENDER'] = gender_encoder.fit_transform(X['GENDER'])
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# --- Split first, scale second ----------------------------------------------
# Fitting the scaler on all rows before splitting leaks test-set means and
# variances into training and evaluation. Fit it on the training rows only and
# apply the fitted transform to the test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for any later cell that expects the full scaled matrix.
X_scaled = scaler.transform(X)

# NOTE(review): inside GridSearchCV the validation folds are still scaled with
# statistics from the whole training split; wrapping scaler + model in a
# Pipeline would remove that remaining (smaller) leak.

# --- Candidate models and their hyper-parameter grids -----------------------
param_grids = {
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10]
        }
    },
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs']
        }
    },
    'Support Vector Machine': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    },
    'K-Nearest Neighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    }
}

# --- Tune, evaluate, collect ------------------------------------------------
results = []

for model_name, grid in param_grids.items():
    grid_search = GridSearchCV(estimator=grid['model'], param_grid=grid['params'],
                               scoring='accuracy', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    # The report is keyed by the stringified class label. LabelEncoder numbers
    # the sorted classes, so with the NO/YES values seen in the data preview
    # '1' is the YES (positive) class.
    positive_metrics = report['1']

    results.append({
        'Model': model_name,
        'Best Parameters': best_params,
        'Accuracy': accuracy,
        'Precision': positive_metrics['precision'],
        'Recall': positive_metrics['recall'],
        'F1-Score': positive_metrics['f1-score']
    })

results_df = pd.DataFrame(results)

print("Model Comparison Results with GridSearch:")
print(results_df)


Model Comparison Results with GridSearch:
                    Model                                    Best Parameters  \
0           Random Forest  {'max_depth': None, 'min_samples_split': 2, 'n...   
1     Logistic Regression                    {'C': 1, 'solver': 'liblinear'}   
2  Support Vector Machine                       {'C': 1, 'kernel': 'linear'}   
3     K-Nearest Neighbors          {'n_neighbors': 5, 'weights': 'distance'}   

   Accuracy  Precision    Recall  F1-Score  
0  0.919355   0.948276  0.964912  0.956522  
1  0.919355   0.948276  0.964912  0.956522  
2  0.935484   0.964912  0.964912  0.964912  
3  0.919355   0.964286  0.947368  0.955752  


In [52]:

from joblib import dump

# Train the four selected classifiers on the training split and persist them,
# together with the fitted scaler, as .joblib files in the working directory.
dfno = pd.read_csv('binarylung_no_outliers.csv')

X = dfno.drop('LUNG_CANCER', axis=1)
y = dfno['LUNG_CANCER']

# One encoder per column, so the GENDER mapping is not overwritten when the
# target is encoded. `label_encoder` keeps holding the target mapping, as before.
gender_encoder = LabelEncoder()
X['GENDER'] = gender_encoder.fit_transform(X['GENDER'])
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split before scaling: the scaler saved below is the one used at inference
# time, so it must be fitted on the training rows only, not on rows that also
# serve as the test set.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for any later cell that expects the full scaled matrix.
X_scaled = scaler.transform(X)

# Define models with the hyper-parameters reported by the grid-search cell.
# Random Forest: the recorded best parameters show max_depth=None (the previous
# hard-coded 20 did not match). NOTE(review): the recorded n_estimators value is
# truncated in the saved output -- confirm that 100 was the selected value.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=42),
    'Logistic Regression': LogisticRegression(C=1, solver='liblinear'),
    'Support Vector Machine': SVC(C=1, kernel='linear'),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5, weights='distance')
}

# Train and save models; file names are the lower-cased model names with
# spaces replaced by underscores (e.g. 'random_forest.joblib').
for model_name, model in models.items():
    model.fit(X_train, y_train)
    dump(model, f'{model_name.replace(" ", "_").lower()}.joblib')

# Also save the scaler so that new inputs can be transformed the same way.
dump(scaler, 'scaler.joblib')


['scaler.joblib']