In [82]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [113]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("lung cancer survey.csv")
df_no_na = df.dropna()
df_no_na.sort_values(by = "AGE")


#21 year old observation is anomalous. Thus, dropped.
df_no_na = df_no_na[df_no_na["AGE"] > 21]

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
8813,0.0,21.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
8811,0.0,44.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
8959,1.0,44.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
4500,1.0,46.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
6546,1.0,46.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7227,0.0,78.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
806,1.0,78.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8995,1.0,79.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
8874,0.0,81.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0


The code below looks for potential "interaction features" between binary features (except for GENDER) that exhibit worthwhile correlations (threshold > 0.15) with LUNG_CANCER.

It finds that the only worthwhile ones are still those that feature YELLOW_FINGERS and ALCOHOL CONSUMING.

In [93]:
from itertools import combinations

def get_worthwhile_interactions(df, dfname):
    above0_15 = []
    above0_15_index = []
    for i in range(2, 14):
        combolist = list(combinations(df.columns[2:-1], i))
        for eachcombo in combolist:
            df[f"{eachcombo}"] = df[eachcombo[0]] * df[eachcombo[1]]
            if i > 2:
                for j in range(2, len(eachcombo)):
                    df[f"{eachcombo}"] = df[f"{eachcombo}"] * df[eachcombo[j]]
            if df["LUNG_CANCER"].corr(df[f"{eachcombo}"]) > 0.15:
                above0_15.append(df["LUNG_CANCER"].corr(df[f"{eachcombo}"]))
                above0_15_index.append(f"{eachcombo}")
            df.drop(f"{eachcombo}", axis = 1, inplace = True)
    above0_15_combos = pd.Series(above0_15, index = above0_15_index)
    print(f"{dfname} Dataset:\n{above0_15_combos}")

get_worthwhile_interactions(df_no_na, "Base")


Base Dataset:
('YELLOW_FINGERS', 'FATIGUE ')               0.157296
('YELLOW_FINGERS', 'SHORTNESS OF BREATH')    0.155044
('FATIGUE ', 'ALCOHOL CONSUMING')            0.171992
('ALLERGY ', 'ALCOHOL CONSUMING')            0.158326
dtype: float64


The code below creates subsets of the observations split by the following age groups: 40's, 50's, 60's, and 70-80's.

Not the most even split, but not too bad...

In [99]:
df_40s = df_no_na[df_no_na["AGE"] < 50]
df_50s = df_no_na[(df_no_na["AGE"] < 60) & (df_no_na["AGE"] > 50)]
df_60s = df_no_na[(df_no_na["AGE"] < 70) & (df_no_na["AGE"] > 60)]
df_7080s = df_no_na[(df_no_na["AGE"] < 90) & (df_no_na["AGE"] > 70)]

len(df_40s)
len(df_50s)
len(df_60s)
len(df_7080s)

def get_LUNG_CANCER_percentage(df, dfname):
    percentage = round(100*df["LUNG_CANCER"].sum()/len(df), 2)
    print(f"{dfname} Dataset Lung Cancer Percentage:\n{percentage}%")

get_LUNG_CANCER_percentage(df_40s, "40's")
get_LUNG_CANCER_percentage(df_50s, "50's")
get_LUNG_CANCER_percentage(df_60s, "60's")
get_LUNG_CANCER_percentage(df_7080s, "70-80's")


2355

1586

2131

2272

40's Dataset Lung Cancer Percentage:
77.62%
50's Dataset Lung Cancer Percentage:
81.08%
60's Dataset Lung Cancer Percentage:
79.73%
70-80's Dataset Lung Cancer Percentage:
84.11%


The code below looks for worthwhile correlations (threshold > 0.15) between variables for each age-split subset.

It appears that bad lifestyle habits tend to correlate more with LUNG_CANCER for those younger, particularly those in their 40's. This could be worth exploring further.

In [92]:
def get_worthwhile_correlations(df, dfname):
    corre = df.corr()
    vals = []
    indices = []
    for i in range(16):
        for j in range(16):
            if corre.iloc[i, j] > 0.15 and corre.iloc[i, j] < 1 and corre.iloc[i, j] not in vals:
                vals.append(corre.iloc[i, j])
                indices.append(corre.axes[0][i] + "-" + corre.axes[1][j])
    worthwhile = pd.Series(vals, index = indices)
    if len(worthwhile) > 0:
        print(f"{dfname} Dataset:\n{worthwhile}")
    else:
        print(f"{dfname} Dataset:\n–")

get_worthwhile_correlations(df_40s, "40's")
get_worthwhile_correlations(df_50s, "50's")
get_worthwhile_correlations(df_60s, "60's")
get_worthwhile_correlations(df_7080s, "70-80's")
    

40's Dataset:
YELLOW_FINGERS-LUNG_CANCER       0.288240
ALCOHOL CONSUMING-LUNG_CANCER    0.245348
dtype: float64
50's Dataset:
YELLOW_FINGERS-LUNG_CANCER       0.187003
ALCOHOL CONSUMING-LUNG_CANCER    0.206184
dtype: float64
60's Dataset:
YELLOW_FINGERS-LUNG_CANCER       0.204202
ALCOHOL CONSUMING-LUNG_CANCER    0.219129
dtype: float64
70-80's Dataset:
YELLOW_FINGERS-LUNG_CANCER    0.164413
dtype: float64


The code below creates subsets of the observations split by GENDER. 

It appears that women tend to be somewhat more prone to developing LUNG_CANCER.

In [100]:
df_women = df_no_na[df_no_na["GENDER"] == 0]
df_men = df_no_na[df_no_na["GENDER"] == 1]

len(df_women)
len(df_men)

get_LUNG_CANCER_percentage(df_women, "Women")
get_LUNG_CANCER_percentage(df_men, "Men")

4173

4827

Women Dataset Lung Cancer Percentage:
86.51%
Men Dataset Lung Cancer Percentage:
75.31%


The code below looks for worthwhile correlations (threshold > 0.15) between variables for each gender-split subset.

It appears that bad lifestyle habits tend to be quite strongly correlated with LUNG_CANCER for men, but not for women. Crucially, note that YELLOW_FINGERS and ALCOHOL CONSUMING have no strong correlation with GENDER, and also this is despite LUNG_CANCER being more prevalent in women. This is definitely worth discussing.

Bad lifestyle habits vis-a-vis LUNG_CANCER are yet again the only worthwhile correlations.

In [88]:
get_worthwhile_correlations(df_women, "Women")
get_worthwhile_correlations(df_men, "Men")

df_no_na[["GENDER", "YELLOW_FINGERS", "ALCOHOL CONSUMING"]].corr()
df_no_na["BAD LIFESTYLE"] = df_no_na["YELLOW_FINGERS"] * df_no_na["ALCOHOL CONSUMING"]
df_no_na[["GENDER", "BAD LIFESTYLE"]].corr()
df_no_na.drop("BAD LIFESTYLE", axis = 1, inplace= True)

Women Dataset:
–
Men Dataset:
YELLOW_FINGERS-LUNG_CANCER       0.293804
ALCOHOL CONSUMING-LUNG_CANCER    0.291756
dtype: float64


Unnamed: 0,GENDER,YELLOW_FINGERS,ALCOHOL CONSUMING
GENDER,1.0,-0.031982,0.040493
YELLOW_FINGERS,-0.031982,1.0,-0.075707
ALCOHOL CONSUMING,0.040493,-0.075707,1.0


Unnamed: 0,GENDER,BAD LIFESTYLE
GENDER,1.0,0.009919
BAD LIFESTYLE,0.009919,1.0


The code below creates subsets of the observations split by age group and GENDER.

In [89]:
df_women_40s = df_40s[df_40s["GENDER"] == 0]
df_men_40s = df_40s[df_40s["GENDER"] == 1]
df_women_50s = df_50s[df_50s["GENDER"] == 0]
df_men_50s = df_50s[df_50s["GENDER"] == 1]
df_women_60s = df_60s[df_60s["GENDER"] == 0]
df_men_60s = df_60s[df_60s["GENDER"] == 1]
df_women_7080s = df_7080s[df_7080s["GENDER"] == 0]
df_men_7080s = df_7080s[df_7080s["GENDER"] == 1]

len(df_women_40s)
len(df_men_40s)
len(df_women_50s)
len(df_men_50s)
len(df_women_60s)
len(df_men_60s)
len(df_women_7080s)
len(df_men_7080s)

1087

1268

736

850

993

1138

1048

1224

The code below looks for worthwhile correlations (threshold > 0.15) between variables for each gender-split subset.

The results further confirm our narrative. Bad lifestyle habits are most strongly correlated with LUNG_CANCER for men in their 40's.

In [90]:
get_worthwhile_correlations(df_women_40s, "Women 40's")
get_worthwhile_correlations(df_men_40s, "Men 40's")
get_worthwhile_correlations(df_women_50s, "Women 50's")
get_worthwhile_correlations(df_men_50s, "Men 50's")
get_worthwhile_correlations(df_women_60s, "Women 60's")
get_worthwhile_correlations(df_men_60s, "Men 60's")
get_worthwhile_correlations(df_women_7080s, "Women 70-80's")
get_worthwhile_correlations(df_men_7080s, "Men 70-80's")

Women 40's Dataset:
–
Men 40's Dataset:
YELLOW_FINGERS-LUNG_CANCER           0.387811
ALLERGY -LUNG_CANCER                 0.154652
ALCOHOL CONSUMING-LUNG_CANCER        0.341955
SWALLOWING DIFFICULTY-LUNG_CANCER    0.159349
dtype: float64
Women 50's Dataset:
WHEEZING-ALCOHOL CONSUMING    0.173804
dtype: float64
Men 50's Dataset:
YELLOW_FINGERS-LUNG_CANCER       0.244936
ALLERGY -LUNG_CANCER             0.216418
ALCOHOL CONSUMING-LUNG_CANCER    0.287947
dtype: float64
Women 60's Dataset:
–
Men 60's Dataset:
YELLOW_FINGERS-LUNG_CANCER           0.258366
ALLERGY -LUNG_CANCER                 0.182561
ALCOHOL CONSUMING-LUNG_CANCER        0.307768
SWALLOWING DIFFICULTY-LUNG_CANCER    0.155467
dtype: float64
Women 70-80's Dataset:
PEER_PRESSURE-SWALLOWING DIFFICULTY    0.153924
dtype: float64
Men 70-80's Dataset:
YELLOW_FINGERS-LUNG_CANCER       0.229933
ALCOHOL CONSUMING-LUNG_CANCER    0.219037
dtype: float64
