In [48]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sfunc import run_forest, one_hot,answer, intensity, quality

In [49]:
lung = pd.read_csv('Datasets/lung_cancer.csv')

In [50]:
lung = lung.sample(random_state=3, n=100000)
print(len(lung))

100000


In [51]:
pd.set_option('display.max_columns', 100)

In [52]:
lung.columns = lung.columns.str.strip().str.replace('_', ' ', regex=False)
print(lung.columns)
lung.head()

Index(['Country', 'Age', 'Gender', 'Smoking Status', 'Second Hand Smoke',
       'Air Pollution Exposure', 'Occupation Exposure', 'Rural or Urban',
       'Socioeconomic Status', 'Healthcare Access', 'Insurance Coverage',
       'Screening Availability', 'Stage at Diagnosis', 'Cancer Type',
       'Mutation Type', 'Treatment Access', 'Clinical Trial Access',
       'Language Barrier', 'Mortality Risk', '5 Year Survival Probability',
       'Delay in Diagnosis', 'Family History', 'Indoor Smoke Exposure',
       'Tobacco Marketing Exposure', 'Final Prediction'],
      dtype='object')


Unnamed: 0,Country,Age,Gender,Smoking Status,Second Hand Smoke,Air Pollution Exposure,Occupation Exposure,Rural or Urban,Socioeconomic Status,Healthcare Access,Insurance Coverage,Screening Availability,Stage at Diagnosis,Cancer Type,Mutation Type,Treatment Access,Clinical Trial Access,Language Barrier,Mortality Risk,5 Year Survival Probability,Delay in Diagnosis,Family History,Indoor Smoke Exposure,Tobacco Marketing Exposure,Final Prediction
453887,Bangladesh,38,Female,Former Smoker,No,Low,No,Rural,High,Limited,No,No,IV,NSCLC,KRAS,Partial,No,No,0.795372,0.332827,Yes,No,No,No,No
237148,South Korea,45,Female,Smoker,No,Low,No,Rural,Middle,Good,Yes,Yes,III,NSCLC,,Full,Yes,No,0.82535,0.510326,Yes,No,No,No,No
367503,Mexico,62,Male,Former Smoker,No,Medium,Yes,Urban,Middle,Poor,Yes,Yes,II,SCLC,KRAS,Full,No,No,0.392478,0.480098,No,Yes,Yes,No,No
437895,Italy,88,Male,Non-Smoker,No,Medium,No,Urban,Low,Good,Yes,Yes,IV,NSCLC,,Full,No,No,0.087066,0.118803,No,No,No,Yes,Yes
158040,South Africa,42,Female,Non-Smoker,Yes,Low,No,Urban,High,Poor,Yes,No,I,NSCLC,KRAS,Full,No,No,0.625993,0.577205,Yes,Yes,No,No,Yes


In [53]:
lung.drop(['Country'], axis=1, inplace=True)

In [54]:
lung.drop(columns = ['Stage at Diagnosis', 'Cancer Type', 'Mutation Type', 'Clinical Trial Access', 'Mortality Risk', 'Mutation Type', 'Language Barrier', 
                  'Mortality Risk', '5 Year Survival Probability', 'Delay in Diagnosis' 
                  ], axis = 1, inplace = True)
print(lung.columns)

Index(['Age', 'Gender', 'Smoking Status', 'Second Hand Smoke',
       'Air Pollution Exposure', 'Occupation Exposure', 'Rural or Urban',
       'Socioeconomic Status', 'Healthcare Access', 'Insurance Coverage',
       'Screening Availability', 'Treatment Access', 'Family History',
       'Indoor Smoke Exposure', 'Tobacco Marketing Exposure',
       'Final Prediction'],
      dtype='object')


In [55]:
lung = one_hot(lung, ['Rural or Urban'])

lung['Gender'] = lung['Gender'].map({'Male': 0, 'Female': 1})

lung['Smoking Status'] = lung['Smoking Status'].map({'Non-Smoker': 0, 'Former Smoker': 1, 'Current Smoker': 2})

lung['Second Hand Smoke'] = lung['Second Hand Smoke'].map(answer)

lung['Air Pollution Exposure'] = lung['Air Pollution Exposure'].map(intensity)

lung['Occupation Exposure'] = lung['Occupation Exposure'].map(answer)

lung['Socioeconomic Status'] = lung['Socioeconomic Status'].map({'Low' : 0, 'Medium': 1, 'High': 2})

lung['Healthcare Access'] = lung['Healthcare Access'].map({'Poor': 0, 'Limited': 1, 'Good': 2})

lung['Insurance Coverage'] = lung['Insurance Coverage'].map(answer)

lung['Screening Availability'] = lung['Screening Availability'].map(answer)

lung['Treatment Access'] = lung['Treatment Access'].map({'Partial': 0, 'Full' : 1})

lung['Indoor Smoke Exposure'] = lung['Indoor Smoke Exposure'].map(answer)

lung['Family History'] = lung['Family History'].map(answer)

lung['Tobacco Marketing Exposure'] = lung['Tobacco Marketing Exposure'].map(answer)
#target variable
lung['Final Prediction'] = lung['Final Prediction'].map(answer)

print(lung.head())

        Age  Gender  Smoking Status  Second Hand Smoke  \
453887   38       1             1.0                  0   
237148   45       1             NaN                  0   
367503   62       0             1.0                  0   
437895   88       0             0.0                  0   
158040   42       1             0.0                  1   

        Air Pollution Exposure  Occupation Exposure  Socioeconomic Status  \
453887                       0                    0                   2.0   
237148                       0                    0                   NaN   
367503                       1                    1                   NaN   
437895                       1                    0                   0.0   
158040                       0                    0                   2.0   

        Healthcare Access  Insurance Coverage  Screening Availability  \
453887                  1                   0                       0   
237148                  2                 

In [56]:
print(lung.corr()['Final Prediction'].sort_values(key = lambda x: x.abs(), ascending = False))

Final Prediction              1.000000
Smoking Status               -0.010391
Indoor Smoke Exposure         0.006055
Occupation Exposure          -0.005012
Screening Availability        0.004332
Healthcare Access             0.004292
Age                          -0.003557
Treatment Access             -0.002091
Rural or Urban_Urban         -0.001767
Rural or Urban_Rural          0.001767
Second Hand Smoke            -0.001657
Air Pollution Exposure       -0.001360
Gender                        0.000218
Tobacco Marketing Exposure    0.000217
Family History               -0.000169
Insurance Coverage            0.000156
Socioeconomic Status          0.000054
Name: Final Prediction, dtype: float64


In [47]:
test_forest = run_forest(lung, 'Final Prediction')

cleaned_lung = lung[['Age', 'Air Pollution Exposure', 'Healthcare Access', 'Second Hand Smoke', 'Final Prediction']]
lung_forest = run_forest(cleaned_lung, 'Final Prediction')

Age                           0.567063
Healthcare Access             0.056332
Air Pollution Exposure        0.048883
Treatment Access              0.037144
Smoking Status                0.033697
Screening Availability        0.029977
Insurance Coverage            0.029388
Second Hand Smoke             0.028768
Tobacco Marketing Exposure    0.028641
Occupation Exposure           0.028177
Socioeconomic Status          0.026939
Gender                        0.026697
Indoor Smoke Exposure         0.025032
Family History                0.024489
Rural or Urban_Rural          0.004401
Rural or Urban_Urban          0.004373
dtype: float64
0.7663454958233307
CV Accuracy: 0.766313552976227 (+/- 0.0012138601784550864)

[[0.47       0.53      ]
 [0.91779365 0.08220635]
 [0.66416667 0.33583333]
 [0.42728571 0.57271429]
 [0.84       0.16      ]
 [0.7385     0.2615    ]
 [0.74598932 0.25401068]
 [0.91833333 0.08166667]
 [0.689      0.311     ]
 [0.9643413  0.0356587 ]]
Age                       0.836