<a href="https://colab.research.google.com/github/angelahjhong/project-2/blob/main/CHD_traincleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw training split. The path is absolute under /content, so the CSV
# has to exist at exactly that location for this cell to run.
df = pd.read_csv(filepath_or_buffer='/content/fhs_train.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1267,1,58,1.0,0,0.0,0.0,0,0,0,220.0,143.0,104.0,29.85,75,87.0,1
1,1209,0,40,1.0,1,15.0,0.0,0,0,0,199.0,122.0,82.0,22.16,85,77.0,0
2,2050,0,52,1.0,0,0.0,0.0,0,0,0,275.0,112.0,71.0,25.68,80,,0
3,1183,1,38,2.0,1,43.0,0.0,0,1,0,170.0,130.0,94.0,23.9,110,75.0,0
4,3225,0,43,1.0,0,0.0,0.0,0,0,0,202.0,124.0,92.0,21.26,75,74.0,0


In [3]:
# List the column labels of the raw frame before any cleaning.
df.columns

Index(['Unnamed: 0', 'sex', 'age', 'education', 'currentSmoker', 'cigsPerDay',
       'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol',
       'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [4]:
# (rows, columns) of the raw frame.
df.shape

(3180, 17)

**Dropping unused variables**

In [5]:
# The 'Unnamed: 0' column is not needed in our data set.
# Reassign instead of dropping in place, and tolerate the column already being
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

**Cleaning sex variable**

In [6]:
# sex has no NaN values; it is stored as a 0/1 integer and is given readable
# labels in the next cell (1 = Male, 0 = Female).
df['sex'].info()
df['sex'].value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 3180 entries, 0 to 3179
Series name: sex
Non-Null Count  Dtype
--------------  -----
3180 non-null   int64
dtypes: int64(1)
memory usage: 25.0 KB


sex
0    1803
1    1377
Name: count, dtype: int64

In [7]:
# Recode sex from its 0/1 codes to labels in a single pass: 0 -> Female, 1 -> Male.
df['sex'] = df['sex'].replace({0: 'Female', 1: 'Male'})

**Cleaning age variable**

In [8]:
# age needs no cleaning: it has no NaN values, the column name is already fine,
# and the unique values are all whole-year ages.
df['age'].info()
df['age'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 3180 entries, 0 to 3179
Series name: age
Non-Null Count  Dtype
--------------  -----
3180 non-null   int64
dtypes: int64(1)
memory usage: 25.0 KB


array([58, 40, 52, 38, 43, 56, 54, 47, 61, 59, 64, 45, 51, 63, 57, 48, 46,
       60, 39, 53, 49, 50, 44, 68, 41, 65, 55, 34, 35, 37, 66, 42, 62, 67,
       36, 33, 32, 70, 69])

**Cleaning currentSmoker variable**

In [9]:
# currentSmoker has no NaN values; it is stored as a 0/1 integer and is given
# readable labels in the next cell (1 = Yes, 0 = No).
df['currentSmoker'].info()
df['currentSmoker'].value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 3180 entries, 0 to 3179
Series name: currentSmoker
Non-Null Count  Dtype
--------------  -----
3180 non-null   int64
dtypes: int64(1)
memory usage: 25.0 KB


currentSmoker
0    1619
1    1561
Name: count, dtype: int64

In [10]:
# Recode currentSmoker from its 0/1 codes to labels in a single pass: 0 -> No, 1 -> Yes.
df['currentSmoker'] = df['currentSmoker'].replace({0: 'No', 1: 'Yes'})

**Cleaning cigsPerDay variable**

In [11]:
# cigsPerDay has 24 NaN values (3156 of the 3180 entries are non-null).
df['cigsPerDay'].info()
df['cigsPerDay'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 3180 entries, 0 to 3179
Series name: cigsPerDay
Non-Null Count  Dtype  
--------------  -----  
3156 non-null   float64
dtypes: float64(1)
memory usage: 25.0 KB


array([ 0., 15., 43., 20.,  9.,  3., 10., 40.,  5., nan, 25., 18.,  6.,
        2., 30.,  1.,  8.,  7., 12., 23., 35., 29., 11., 16., 14., 60.,
       70., 19.,  4., 45., 50., 17., 38., 13.])

In [12]:
# Median of the observed cigsPerDay values (NaN entries are skipped); it is used
# by the imputation cell that follows.
cigs_median = df['cigsPerDay'].median(skipna=True)
print(cigs_median)

0.0


In [13]:
# Fill missing cigsPerDay with the median of the row's smoking-status group
# instead of the overall median. The overall median is 0.0 because about half of
# the participants are non-smokers, so using it would record any current smoker
# with a missing count as smoking 0 cigarettes per day.
status_median = df.groupby('currentSmoker')['cigsPerDay'].transform('median')
# cigs_median (the overall median) stays as the fallback in case a smoking-status
# group has no observed values to take a median from.
df['cigsPerDay'] = df['cigsPerDay'].fillna(status_median).fillna(cigs_median)

**Cleaning prevalentStroke variable**

In [14]:
# prevalentStroke has no NaN values; it is stored as a 0/1 integer and is given
# readable labels in the next cell (1 = Yes, 0 = No).
df['prevalentStroke'].info()
df['prevalentStroke'].value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 3180 entries, 0 to 3179
Series name: prevalentStroke
Non-Null Count  Dtype
--------------  -----
3180 non-null   int64
dtypes: int64(1)
memory usage: 25.0 KB


prevalentStroke
0    3159
1      21
Name: count, dtype: int64

In [15]:
# Recode prevalentStroke from its 0/1 codes to labels in a single pass: 0 -> No, 1 -> Yes.
df['prevalentStroke'] = df['prevalentStroke'].replace({0: 'No', 1: 'Yes'})

**Cleaning prevalentHyp variable**

In [16]:
# prevalentHyp has no NaN values; it is stored as a 0/1 integer and is given
# readable labels in the next cell (1 = Yes, 0 = No).
df['prevalentHyp'].info()
df['prevalentHyp'].value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 3180 entries, 0 to 3179
Series name: prevalentHyp
Non-Null Count  Dtype
--------------  -----
3180 non-null   int64
dtypes: int64(1)
memory usage: 25.0 KB


prevalentHyp
0    2159
1    1021
Name: count, dtype: int64

In [17]:
# Recode prevalentHyp from its 0/1 codes to labels in a single pass: 0 -> No, 1 -> Yes.
df['prevalentHyp'] = df['prevalentHyp'].replace({0: 'No', 1: 'Yes'})

**Cleaning diabetes variable**

In [18]:
# diabetes has no NaN values; it is stored as a 0/1 integer and is given
# readable labels in the next cell (1 = Yes, 0 = No).
df['diabetes'].info()
df['diabetes'].value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 3180 entries, 0 to 3179
Series name: diabetes
Non-Null Count  Dtype
--------------  -----
3180 non-null   int64
dtypes: int64(1)
memory usage: 25.0 KB


diabetes
0    3097
1      83
Name: count, dtype: int64

In [19]:
# Recode diabetes from its 0/1 codes to labels in a single pass: 0 -> No, 1 -> Yes.
df['diabetes'] = df['diabetes'].replace({0: 'No', 1: 'Yes'})

**Cleaning BMI variable**

In [20]:
# BMI has 15 NaN values (3165 of the 3180 entries are non-null).
df['BMI'].info()
df['BMI'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 3180 entries, 0 to 3179
Series name: BMI
Non-Null Count  Dtype  
--------------  -----  
3165 non-null   float64
dtypes: float64(1)
memory usage: 25.0 KB


array([29.85, 22.16, 25.68, ..., 37.04, 38.31, 26.78])

In [21]:
# Median of the observed BMI values (NaN entries are skipped); it is used by the
# imputation cell that follows.
bmi_median = df['BMI'].median(skipna=True)
print(bmi_median)

25.49


In [22]:
# Impute the missing BMI entries with the median; the median is used so that
# extreme values do not skew the filled-in number.
bmi_missing = df['BMI'].isna()
df.loc[bmi_missing, 'BMI'] = bmi_median

**Cleaning glucose variable**

In [23]:
# glucose has 285 NaN values (2895 of the 3180 entries are non-null).
# NOTE(review): unlike cigsPerDay and BMI, no imputation follows this check, so
# the NaN values are still present in the cleaned frame — confirm this is intended.
df['glucose'].info()
df['glucose'].unique()

<class 'pandas.core.series.Series'>
RangeIndex: 3180 entries, 0 to 3179
Series name: glucose
Non-Null Count  Dtype  
--------------  -----  
2895 non-null   float64
dtypes: float64(1)
memory usage: 25.0 KB


array([ 87.,  77.,  nan,  75.,  74., 104.,  69.,  90.,  91.,  81.,  89.,
        86.,  84.,  73.,  95.,  65., 103.,  76.,  85.,  92.,  72.,  62.,
        93.,  68.,  67., 122., 102.,  79.,  66.,  83.,  78.,  64.,  63.,
        58.,  71., 115.,  70.,  96.,  60.,  82.,  94.,  88., 167., 117.,
       215.,  48., 108., 268., 100.,  80.,  99., 135.,  97.,  55., 274.,
        40., 118.,  47.,  56., 112., 248., 107., 105.,  61., 260., 170.,
       148., 120., 225.,  98.,  59., 332., 110., 206., 129.,  57., 114.,
       101., 113., 111., 127., 121.,  45., 147.,  50., 244., 106., 394.,
       173., 116., 137., 145., 123.,  44., 172.,  54., 142., 348., 119.,
       320.,  53., 136., 210.,  43., 144., 292., 125., 294., 130., 131.,
       255., 160.,  52., 207., 297., 216., 166., 109., 126., 235., 140.,
       270., 250., 193., 256., 150., 202., 254., 370., 191., 186.])

**Cleaning TenYearCHD variable**

In [24]:
# TenYearCHD has no NaN values; it is stored as a 0/1 integer and is given
# readable labels in the next cell (1 = At Risk, 0 = Not at Risk).
df['TenYearCHD'].info()
df['TenYearCHD'].value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 3180 entries, 0 to 3179
Series name: TenYearCHD
Non-Null Count  Dtype
--------------  -----
3180 non-null   int64
dtypes: int64(1)
memory usage: 25.0 KB


TenYearCHD
0    2693
1     487
Name: count, dtype: int64

In [25]:
# Recode TenYearCHD from its 0/1 codes to labels in a single pass:
# 0 -> Not at Risk, 1 -> At Risk.
df['TenYearCHD'] = df['TenYearCHD'].replace({0: 'Not at Risk', 1: 'At Risk'})

In [26]:
# Keep only the variables that were inspected/cleaned above and copy them into
# their own frame, so later edits to cleaned_df do not touch df.
clean_columns = [
    'sex',
    'age',
    'currentSmoker',
    'cigsPerDay',
    'prevalentStroke',
    'prevalentHyp',
    'diabetes',
    'BMI',
    'glucose',
    'TenYearCHD',
]
cleaned_df = df.loc[:, clean_columns].copy()

In [27]:
# Preview the first five rows of the cleaned frame.
cleaned_df.head(n=5)

Unnamed: 0,sex,age,currentSmoker,cigsPerDay,prevalentStroke,prevalentHyp,diabetes,BMI,glucose,TenYearCHD
0,Male,58,No,0.0,No,No,No,29.85,87.0,At Risk
1,Female,40,Yes,15.0,No,No,No,22.16,77.0,Not at Risk
2,Female,52,No,0.0,No,No,No,25.68,,Not at Risk
3,Male,38,Yes,43.0,No,Yes,No,23.9,75.0,Not at Risk
4,Female,43,No,0.0,No,No,No,21.26,74.0,Not at Risk


In [28]:
# Write the cleaned frame to the working directory; index=False keeps the row
# index out of the file.
cleaned_df.to_csv(path_or_buf='cleaned_train.csv', index=False)