<a href="https://colab.research.google.com/github/b-richins92/Cancer_Prediction/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [69]:
import os
import requests
from zipfile import ZipFile
from urllib.request import urlretrieve
import pandas as pd
import numpy as np
import altair as alt
from vega_datasets import data
%matplotlib inline
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import statsmodels.api as sm
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score
#pd.set_option('display.max_columns', None)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [2]:
def download_file(function, filename, force=False):
    """
    Downloads a file using a function.

    :param function: Function to be used for downloading
    :param filename: File name for saving
    :param force: Whether to overwrite an existing file
    """
    if force or not os.path.exists(filename):
        function(filename)

def download_brfss(filename):
    """
    Downloads the cdc survey dataset.

    :param filename: File name for saving
    """
    def download(filename):
        """
        Actual download logic for the personal cdc survey dataset

        :param filename: File name for saving
        """
        url = 'https://www.cdc.gov/brfss/annual_data/2022/files/LLCP2022ASC.zip'
        urlretrieve(url, filename)

    download_file(download, filename)
    with ZipFile(filename, 'r') as zObject:
        zObject.extractall(path='./cdc_survey_unzipped/')


In [3]:
download_brfss('cdc_survey.zip')

In [4]:
def extract_data():
    """
    Reads the BRFSS survey input file and extracts the contents.

    :return: Dataframe.
    """
    rows = []
    # Dictionary to map the input values to the desired output values
    mapping = {'1': 1, '2': 0}

    with open('./cdc_survey_unzipped/LLCP2022.ASC ', 'r') as f:
        for line in f:
            row = {}

            gender = line[1979].strip()
            if gender in mapping:
                row['Gender'] = mapping[gender]

            age_group = line[1980:1982].strip()
            if age_group:
                if int(age_group) in [range(1,14)]:
                    row['Age Group'] = int(age_group)

            mental_health= line[103:105].strip()
            if mental_health:
                if int(mental_health) <=30:
                    row['Num of Bad Mental Health Days']= int(mental_health)
                elif int(mental_health) == 88:
                    row['Num of Bad Mental Health Days']= 0

            health_care_access = line[107:109].strip()
            if health_care_access:
                if health_care_access == '1':
                    row['Health Care Access'] = 'from_employer'
                elif health_care_access == '2':
                    row['Health Care Access'] = 'from_own_pocket'
                elif health_care_access == '3':
                    row['Health Care Access'] = 'medicare'
                elif health_care_access == '4':
                    row['Health Care Access'] = 'medigap'
                elif health_care_access == '5':
                    row['Health Care Access'] = 'medicaid'
                elif health_care_access == '6':
                    row['Health Care Access'] = 'children_health_insurance_program'
                elif health_care_access == '7':
                    row['Health Care Access'] = 'military'
                elif health_care_access == '8':
                    row['Health Care Access'] = 'indian_health'
                elif health_care_access == '9':
                    row['Health Care Access'] = 'state_sponsored'
                elif health_care_access == '10':
                    row['Health Care Access'] = 'other_government'
                elif health_care_access == '88':
                    row['Health Care Access'] = 'no_coverage'

            could_afford_doctor = line[110].strip()
            if could_afford_doctor in mapping:
              row['Could Afford Doctor'] = mapping[could_afford_doctor]

            # years_since_last_checkup = line[111].strip()
            # if years_since_last_checkup:
            #     if int(years_since_last_checkup) == 1:
            #         row['Years Since Last Checkup'] = 'within_past_year'
            #     elif int(years_since_last_checkup) == 2:
            #         row['Years Since Last Checkup'] = 'within_past_two_years'
            #     elif int(years_since_last_checkup) == 3:
            #         row['Years Since Last Checkup'] = 'within_past_five_years'
            #     elif int(years_since_last_checkup) == 4:
            #         row['Years Since Last Checkup'] = 'five_or_more_years'


            years_since_last_checkup = line[111].strip()
            if years_since_last_checkup:
                if int(years_since_last_checkup) == 1:
                    row['Years Since Last Checkup'] = 1
                elif int(years_since_last_checkup) == 2:
                    row['Years Since Last Checkup'] = 2
                elif int(years_since_last_checkup) == 3:
                    row['Years Since Last Checkup'] = 3
                elif int(years_since_last_checkup) == 4:
                    row['Years Since Last Checkup'] = 4

            exercise_past_30_days = line[112].strip()
            if exercise_past_30_days in mapping:
              row['Exercise in Past 30 Days'] = mapping[exercise_past_30_days]

            hours_of_sleeping = line[113:115].strip()
            if hours_of_sleeping:
                if int(hours_of_sleeping) <= 24:
                    row['Hours of Sleeping'] = int(hours_of_sleeping)

            heart_attack = line[117].strip()
            if heart_attack in mapping:
                row['Heart Attack'] = mapping[heart_attack]

            heart_disease = line[118].strip()
            if heart_disease in mapping:
                row['Heart Disease'] = mapping[heart_disease]

            stroke = line[119].strip()
            if stroke in mapping:
                row['Stroke'] = mapping[stroke]

            asthma = line[120].strip()
            if asthma in mapping:
                row['Asthma'] = mapping[asthma]

            cancer=line[123].strip()
            if cancer in mapping:
                row['Cancer'] = mapping[cancer]

            depression = line[125].strip()
            if depression in mapping:
                row['Depression'] = mapping[depression]

            kidney_disease = line[126].strip()
            if kidney_disease in mapping:
               row['Kidney Disease'] = mapping[kidney_disease]

            arthritis = line[1911].strip()
            if arthritis in mapping:
                row['Arthritis'] = mapping[arthritis]

            diabetes = line[128].strip()
            if diabetes:
                if diabetes == '1':
                    row['Diabetes'] = 1
                elif diabetes in ['2', '3', '4']:
                    row['Diabetes'] = 0

            marital_status =  line[167].strip()
            if marital_status:
                if marital_status == '1':
                    row['Married'] = 1
                elif marital_status in ['2','3','4','5','6']:
                    row["Married"] = 0

            income = line[185:187].strip()
            if income:
                if int(income) in list(range(1,12)):
                    row['Income'] = int(income)

            weight_pounds = line[188:192].strip()
            if weight_pounds:
                if (int(weight_pounds) >= 50) and (int(weight_pounds) <= 776):
                    row['Weight in Pounds'] = int(weight_pounds)

            height_in_inches = line[1986:1989].strip()
            if height_in_inches:
                row['Height in Inches'] = int(height_in_inches)

            deaf = line[196].strip()
            if deaf in mapping:
                row['Deaf'] = mapping[deaf]

            blind = line[197].strip()
            if blind in mapping:
                row['Blind'] = mapping[blind]

            mammogram = line[202].strip()
            if gender == '1':           #making certain answers dependent on gender
                row['Mammogram'] = 0
            elif mammogram in mapping:
                row['Mammogram'] = mapping[mammogram]

            cervical_screening = line[204].strip()
            if gender == '1':           #making certain answers dependent on gender
                row['Cervical Screening'] = 0
            elif cervical_screening in mapping:
                row['Cervical_screening'] = mapping[cervical_screening]

            colonoscopy_or_sigmoidoscopy = line[209].strip()
            if colonoscopy_or_sigmoidoscopy in mapping:
                row['Colonoscopy/Sigmoidoscopy'] = mapping[colonoscopy_or_sigmoidoscopy]

            smoked_100 = line[222].strip()
            if smoked_100 in mapping:
                row['Smoked 100'] = mapping[smoked_100]

            smoke_currently = line[223].strip()
            if smoked_100 == '2':               #making certain answers dependent on smoked_100
                row['Currently Smoke'] = 0
            elif smoke_currently:
                if (smoke_currently == 1) or (smoke_currently == 2):
                    row['Currently Smoke'] == 1

            smokeless_tobacco = line[224].strip()
            if smokeless_tobacco:
                if (smokeless_tobacco == '1') or (smokeless_tobacco == '2'):
                    row['Smokeless Tobacco'] = 1
                elif smokeless_tobacco == '3':
                    row['Smokeless Tobacco'] = 0

            vape_products = line[225].strip()
            if vape_products:
                if (smokeless_tobacco == '2') or (smokeless_tobacco == '3'):
                    row['Smokeless Tobacco'] = 1
                elif (smokeless_tobacco == '1') or (smokeless_tobacco == '4'):
                    row['Smokeless Tobacco'] = 0

            started_smoking_age = line[226:229].strip()
            if smoked_100 == '2':                #making certain answers dependent on smoked_100
                row['Age Started Smoking'] = 0
            elif started_smoking_age:
                if int(started_smoking_age) in [range(1,101)]:
                    row['Age Started Smoking'] = int(started_smoking_age)

            cigarettes_per_day = line[232:235].strip()
            if smoked_100 == '2':                #making certain answers dependent on smoked_100
                row['Cigarettes per Day'] = 0
            elif cigarettes_per_day:
                if int(cigarettes_per_day) <= 300:
                    row['Cigarettes per Day'] = int(cigarettes_per_day)

            ct_scan = line[235].strip()
            if ct_scan in mapping:
               row['CT Scan'] = mapping[ct_scan]

            days_drinking = line[238:241].strip()
            if days_drinking:
                if int(days_drinking) in [range(101,200)]:
                    row['Days Drinking'] = int(days_drinking)-100
                elif int(days_drinking) in [range(201,300)]:
                    row['Days Drinking'] = (int(days_drinking)-200)*7/30
                elif int(days_drinking) == 888:
                    row['Days Drinking'] = 0

            num_drinks_per_sesh = line[241:243].strip()
            if days_drinking == '888':       # making certain answer dependent on drinks_last_week
                row['Drinks per Session'] = 0
            elif num_drinks_per_sesh:
                if int(num_drinks_per_sesh) <= 76:
                    row['Drinks per Session'] = int(num_drinks_per_sesh)
                elif int(num_drinks_per_sesh) == 88:
                    row['Drinks per Session'] = 0

            flu_vaccine = line[247].strip()
            if flu_vaccine in mapping:
                row["Flu Vaccine"] = mapping[flu_vaccine]

            pneumonia_vaccine = line[254].strip()
            if pneumonia_vaccine in mapping:
                row["Pneumonia Vaccine"] = mapping[pneumonia_vaccine]

            tetanus_last_10_years = line[255].strip()
            if tetanus_last_10_years:
                if tetanus_last_10_years in ['1','2','3']:
                    row["Tetanus Last 10 Years"] = 1
                elif tetanus_last_10_years == '4':
                    row['Tetanus Last 10 Years'] = 0

            had_COVID = line[264].strip()
            if had_COVID:
                if (had_COVID == '1') or (had_COVID == '3'):
                    row["Had COVID"] = 1
                elif had_COVID == '2':
                    row['Had COVID'] = 0

            long_COVID = line[265].strip()
            if had_COVID == '2':         #making certain answers dependent on had_covid
                long_COVID == 0
            elif long_COVID in mapping:
                row['Long COVID'] = mapping[long_COVID]

            insulin = line[271].strip()
            # making certain answers dependent on diabetes
            if (diabetes == '2') or (diabetes == '3') or (diabetes == '4'):
                row['Insulin'] = 0
            elif insulin in mapping:
                row['Insulin'] = mapping[insulin]

            hpv_vac = line[283].strip()
            if int(age_group)< 7:
                row['HPV Vaccine'] = 0
            elif hpv_vac in mapping:
                row['HPV Vaccine'] = mapping[hpv_vac]

            shingles_or_zoster_vac = line[286].strip()
            if int(age_group)<7:
                row['Shingles/Zoster Vaccine'] = 0
            elif shingles_or_zoster_vac in mapping:
                row['Shingles/Zoster Vaccine'] = mapping[shingles_or_zoster_vac]

            COVID_vac = line[287].strip()
            if COVID_vac in mapping:
                row['COVID Vaccine'] = mapping[COVID_vac]

            num_covid_vac = line[289].strip()
            if COVID_vac == '2':        # making answer dependent on COVID_vac
                row['Number of COVID Vaccines'] = 0
            elif num_covid_vac:
                if num_covid_vac in ['1','2','3','4']:
                    row['Number of COVID Vaccines'] = int (num_covid_vac)

            type_of_cancer = line[312:314].strip()
            if cancer == '2':            # making answer dependent on cancer
                type_of_cancer = 0
            elif type_of_cancer:
                if int(type_of_cancer) in [range(1,31)]:
                    type_of_cancer = int(type_of_cancer)

            #prostate_screen = line[325].strip()

            confusion_or_memory_loss = line[330].strip()
            if confusion_or_memory_loss in mapping:
                row['Confusion/Memory Loss'] = mapping[confusion_or_memory_loss]

            #Start of Adverse childhood section
            lived_with_mentally_ill = line[347].strip()
            if lived_with_mentally_ill in mapping:
                row['Lived w/ Mentally Ill'] = mapping[lived_with_mentally_ill]

            lived_with_alcoholic = line[348].strip()
            if lived_with_alcoholic in mapping:
                row['Lived w/ Alcoholic'] = mapping[lived_with_alcoholic]

            lived_with_drug_addict = line[349].strip()
            if lived_with_drug_addict in mapping:
                row['Lived w/ Drug Addict'] = mapping[lived_with_drug_addict]

            lived_jailed_person = line[350].strip()
            if lived_jailed_person in mapping:
                row['Lived w/ Jailed Person'] = mapping[lived_jailed_person]

            divorced_parents = line[351].strip()
            if divorced_parents in mapping:
                row['Divorced Parents'] = mapping[divorced_parents]

            parents_hit_each_other = line[352].strip()
            if parents_hit_each_other in mapping:
                row['Parents Hit Each Other'] = mapping[parents_hit_each_other]

            hurt_by_parent = line[353].strip()
            if hurt_by_parent in mapping:
                row['Hurt by Parent'] = mapping[hurt_by_parent]

            parent_swore_at_child = line[354].strip()
            if parent_swore_at_child in mapping:
                row['Swore at Child'] = mapping[parent_swore_at_child]

            sexually_abused = line[355].strip()
            if sexually_abused in mapping:
                row['Sexually Abused'] = mapping[sexually_abused]
            # End adverse childhood section

            life_satisfaction = line[360].strip()
            if life_satisfaction:
                if life_satisfaction == '1':
                    row['Life Satisfaction'] = 1
                elif life_satisfaction == '2':
                    row['Life Satisfaction'] = 2
                elif life_satisfaction == '3':
                    row['Life Satisfaction'] = 3
                elif life_satisfaction == '4':
                    row['Life Satisfaction'] = 4

            emotional_support = line[361].strip()
            if emotional_support:
                if int(emotional_support) in [range(1,6)]:
                    row['Emotional Support'] = int(emotional_support)

            days_smoked_marijuana = line[370:372].strip()
            if days_smoked_marijuana:
                if int(days_smoked_marijuana) <=30:
                    row['Marijuana, Last 30 Days'] = int(days_smoked_marijuana)
                elif int(days_smoked_marijuana) == 88:
                    row['Marijuana, Last 30 Days'] = 0

            ethnicity = line[1975].strip()
            if ethnicity:
                if ethnicity == '1':
                    row['Ethnicity'] = 'White'
                elif ethnicity == '2':
                    row['Ethnicity'] = 'Black'
                elif ethnicity == '3':
                    row['Ethnicity'] = 'American Indian and Alaskan Native'
                elif ethnicity in ['4', '5']:
                    row['Ethnicity'] = 'Asian and Pacific Islander'
                elif ethnicity == '8':
                    row['Ethnicity'] = 'Hispanic'
                elif ethnicity == '7':
                    row['Ethnicity'] = 'Multiracial, non-Hispanic'

            metropolitan = line[1401].strip()
            if metropolitan in mapping:
                row['Metropolitan'] = mapping[metropolitan]

            bmi_category = line[2001].strip()
            if bmi_category:
                if bmi_category == '1':
                    row['BMI Category'] = 'underweight'
                elif bmi_category == '2':
                    row['BMI Category'] = 'normal_weight'
                elif bmi_category == '3':
                    row['BMI Category'] = 'over_weight'
                elif bmi_category == '4':
                    row['BMI Category'] = 'obese'

            education = line[2004].strip()
            if education:
                if education == '1':
                    row['Education'] = 'did_not_graduate_high_school'
                elif education == '2':
                    row['Education'] = 'graduated_high_school'
                elif education == '3':
                    row['Education'] = 'attended_college'
                elif education == '4':
                    row['Education'] = 'graduated_college'

            income_level = line[2005].strip()
            if income_level:
                if income_level == '1':
                    row['Income Level'] = 'less_than_15K'
                elif income_level == '2':
                    row['Income Level'] = 'between_15K_and_25K'
                elif income_level == '3':
                    row['Income Level'] = 'between_25K_and_35K'
                elif income_level == '4':
                    row['Income Level'] = 'between_35K_and_50K'
                elif income_level == '5':
                    row['Income Level'] = 'between_50K_and_100K'
                elif income_level == '6':
                    row['Income Level'] = 'between_100K_and_200K'
                elif income_level == '7':
                    row['Income Level'] = 'more_than_200K'

            rows.append(row)

    df = pd.DataFrame.from_dict(rows)

    return df

In [5]:
df = extract_data()
#pd.set_option('display.max_columns',None)
#pd.set_option('display.max_rows', None)
df.head(10)

Unnamed: 0,Gender,General Health,Num of Bad Mental Health Days,Could Afford Doctor,Years Since Last Checkup,Exercise in Past 30 Days,Hours of Sleeping,Heart Attack,Heart Disease,Stroke,...,Lived w/ Alcoholic,Lived w/ Drug Addict,Lived w/ Jailed Person,Divorced Parents,Parents Hit Each Other,Hurt by Parent,Swore at Child,Sexually Abused,"Marijuana, Last 30 Days",Confusion/Memory Loss
0,0,Very good,0.0,0.0,1.0,0.0,8.0,0.0,0.0,0.0,...,,,,,,,,,,
1,0,Excellent,0.0,0.0,,0.0,6.0,0.0,0.0,0.0,...,,,,,,,,,,
2,0,Very good,3.0,0.0,1.0,1.0,5.0,0.0,0.0,0.0,...,,,,,,,,,,
3,0,Excellent,0.0,0.0,1.0,1.0,7.0,0.0,0.0,0.0,...,,,,,,,,,,
4,0,Fair,0.0,0.0,1.0,1.0,9.0,0.0,0.0,0.0,...,,,,,,,,,,
5,1,Poor,0.0,0.0,1.0,0.0,7.0,1.0,0.0,1.0,...,,,,,,,,,,
6,0,Very good,0.0,0.0,1.0,1.0,7.0,0.0,0.0,0.0,...,,,,,,,,,,
7,0,Good,0.0,0.0,1.0,0.0,8.0,0.0,0.0,0.0,...,,,,,,,,,,
8,0,Good,0.0,0.0,1.0,1.0,6.0,0.0,0.0,0.0,...,,,,,,,,,,
9,0,Good,0.0,0.0,1.0,1.0,7.0,0.0,0.0,0.0,...,,,,,,,,,,


In [None]:
def write_df_to_csv(df, filename):
  """
  Writes a pandas df to a CSV file
  Parameters:
  df(pd.Dataframe): the dataframe to be written to csv file
  filename(str): the name of the file to write to CSV
  """
  try:
    df.to_csv(filename, index=False)
  except Exception as e:
    print(f'Error occured: {e}')

In [None]:
#filename = 'copy_output.csv'
#write_df_to_csv(df, filename)

In [None]:
def load_data():
  df = pd.read_csv('copy_output.csv')
  return df

In [None]:
#new_df = load_data()
#new_df.head(10)

In [6]:
completion_percentage = df.isnull().sum()/len(df) * 100
complete_columns_percentage = completion_percentage[completion_percentage <=20]
complete_columns = complete_columns_percentage.index
len(complete_columns)

36

In [7]:
dropped_df= df[complete_columns].copy()

In [111]:
one_hot_df= pd.get_dummies(dropped_df.dropna())

important_features = ['Weight in Pounds','Cigarettes per Day','Height in Inches','Hours of Sleeping','Num of Bad Mental Health Days',\
                      'Drinks per Session','General Health_Poor','Ethnicity_White','Pneumonia Vaccine','Cancer']
one_hot_df = one_hot_df[important_features]

In [None]:
filename = 'clean_df.csv'
write_df_to_csv(one_hot_df, filename)

In [None]:
#imputer = IterativeImputer(max_iter = 3, random_state=0)
#imputed_df = imputer.fit_transform(one_hot_df)

In [112]:
X=one_hot_df.drop('Cancer', axis=1)
y=one_hot_df.Cancer

X_train, X_test, y_train, y_test = train_test_split(X,y)

smote=SMOTE()
X_train, y_train = smote.fit_resample(X_train,y_train)


In [31]:
y_train.shape

(167337,)

In [113]:
rf=RandomForestClassifier(n_estimators=2,max_depth= 1, max_leaf_nodes = 2,random_state=0)
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
recall_score(y_test,y_predict), precision_score(y_test,y_predict), accuracy_score(y_test,y_predict), f1_score(y_test, y_predict)

(0.7153575256469147,
 0.12286653517422748,
 0.368722996109647,
 0.2097136188167699)

In [110]:
len(y_predict)

55779