<a href="https://colab.research.google.com/github/angelahjhong/project-2/blob/main/CHD_testcleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
import pandas as pd
import numpy as np

In [61]:
# Read the test CSV from Colab's /content directory.
# NOTE(review): this absolute path only resolves inside Colab — confirm before running elsewhere.
df = pd.read_csv('/content/fhs_test.csv')
# Preview the first five rows to sanity-check the load
df.head()

Unnamed: 0.1,Unnamed: 0,sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,674,0,58,1.0,1,20.0,0.0,0,0,0,,126.0,77.0,30.08,78.0,,0
1,4070,0,51,3.0,0,0.0,0.0,0,0,0,264.0,135.0,83.0,26.68,60.0,74.0,0
2,3150,0,44,2.0,1,9.0,0.0,0,1,0,,147.5,96.0,30.57,78.0,,1
3,1695,0,40,2.0,1,20.0,0.0,0,0,0,271.0,138.5,88.0,27.24,80.0,,1
4,2692,1,58,2.0,1,20.0,0.0,0,0,0,207.0,110.0,80.0,23.55,78.0,78.0,0


In [62]:
# List the column labels to see which variables are available
df.columns

Index(['Unnamed: 0', 'sex', 'age', 'education', 'currentSmoker', 'cigsPerDay',
       'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol',
       'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [63]:
# (rows, columns) of the raw frame
df.shape

(1060, 17)

**Dropping unused variables**

In [64]:
# The 'Unnamed: 0' column is not needed in our data set
# (presumably a row index left over from an earlier export — TODO confirm).
# Reassign instead of mutating with inplace=True, and tolerate a missing label,
# so re-running this cell does not raise KeyError once the column is gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

**Cleaning sex variable**

In [65]:
# sex has no missing values, but it is stored as a 0/1 integer;
# it should read as a label instead (1 = male, 0 = female)
sex_col = df['sex']
sex_col.info()
sex_col.value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 1060 entries, 0 to 1059
Series name: sex
Non-Null Count  Dtype
--------------  -----
1060 non-null   int64
dtypes: int64(1)
memory usage: 8.4 KB


sex
0    617
1    443
Name: count, dtype: int64

In [66]:
# Recode sex in a single pass: 0 -> 'Female', 1 -> 'Male'
df['sex'] = df['sex'].replace({0: 'Female', 1: 'Male'})

**Cleaning age variable**

In [67]:
# age needs no cleaning: there are no missing values and the
# values are already well-formed integers
age_col = df['age']
age_col.info()
age_col.unique()

<class 'pandas.core.series.Series'>
RangeIndex: 1060 entries, 0 to 1059
Series name: age
Non-Null Count  Dtype
--------------  -----
1060 non-null   int64
dtypes: int64(1)
memory usage: 8.4 KB


array([58, 51, 44, 40, 53, 41, 50, 42, 45, 52, 57, 46, 38, 47, 48, 56, 66,
       68, 59, 54, 49, 39, 61, 62, 67, 43, 64, 37, 34, 35, 55, 36, 63, 65,
       60, 69])

**Cleaning currentSmoker variable**

In [68]:
# currentSmoker has no missing values, but it is stored as a 0/1 integer;
# it should read as a label instead (1 = yes, 0 = no)
smoker_col = df['currentSmoker']
smoker_col.info()
smoker_col.value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 1060 entries, 0 to 1059
Series name: currentSmoker
Non-Null Count  Dtype
--------------  -----
1060 non-null   int64
dtypes: int64(1)
memory usage: 8.4 KB


currentSmoker
1    534
0    526
Name: count, dtype: int64

In [69]:
# Recode currentSmoker in a single pass: 0 -> 'No', 1 -> 'Yes'
df['currentSmoker'] = df['currentSmoker'].replace({0: 'No', 1: 'Yes'})

**Cleaning cigsPerDay variable**

In [70]:
# cigsPerDay is missing in 5 rows (1055 of 1060 entries are non-null)
cigs_col = df['cigsPerDay']
cigs_col.info()
cigs_col.unique()

<class 'pandas.core.series.Series'>
RangeIndex: 1060 entries, 0 to 1059
Series name: cigsPerDay
Non-Null Count  Dtype  
--------------  -----  
1055 non-null   float64
dtypes: float64(1)
memory usage: 8.4 KB


array([20.,  0.,  9.,  5., 35., 30., 10., 40.,  1., 15.,  3., 45., 25.,
       43., 17., 23., 13., 18., 50.,  4.,  2.,  6., nan, 60.,  8., 12.,
        7., 14.])

In [71]:
# Median of the observed cigsPerDay values (NaN entries are skipped by default)
cigs_median = df['cigsPerDay'].median()
print(cigs_median)

1.0


In [72]:
# Impute the missing cigsPerDay entries with the median, which is less
# sensitive to the skew in this column than the mean would be.
# NOTE(review): the overall median is pulled down by the many zero values;
# if the missing rows belong to current smokers, a smoker-only median may be
# a better fill value — confirm.
cigs_missing = df['cigsPerDay'].isna()
df.loc[cigs_missing, 'cigsPerDay'] = cigs_median

**Cleaning prevalentStroke variable**

In [73]:
# prevalentStroke has no missing values, but it is stored as a 0/1 integer;
# it should read as a label instead (1 = yes, 0 = no)
stroke_col = df['prevalentStroke']
stroke_col.info()
stroke_col.value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 1060 entries, 0 to 1059
Series name: prevalentStroke
Non-Null Count  Dtype
--------------  -----
1060 non-null   int64
dtypes: int64(1)
memory usage: 8.4 KB


prevalentStroke
0    1056
1       4
Name: count, dtype: int64

In [74]:
# Recode prevalentStroke in a single pass: 0 -> 'No', 1 -> 'Yes'
df['prevalentStroke'] = df['prevalentStroke'].replace({0: 'No', 1: 'Yes'})

**Cleaning prevalentHyp variable**

In [75]:
# prevalentHyp has no missing values, but it is stored as a 0/1 integer;
# it should read as a label instead (1 = yes, 0 = no)
hyp_col = df['prevalentHyp']
hyp_col.info()
hyp_col.value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 1060 entries, 0 to 1059
Series name: prevalentHyp
Non-Null Count  Dtype
--------------  -----
1060 non-null   int64
dtypes: int64(1)
memory usage: 8.4 KB


prevalentHyp
0    764
1    296
Name: count, dtype: int64

In [76]:
# Recode prevalentHyp in a single pass: 0 -> 'No', 1 -> 'Yes'
df['prevalentHyp'] = df['prevalentHyp'].replace({0: 'No', 1: 'Yes'})

**Cleaning diabetes variable**

In [77]:
# diabetes has no missing values, but it is stored as a 0/1 integer;
# it should read as a label instead (1 = yes, 0 = no)
diabetes_col = df['diabetes']
diabetes_col.info()
diabetes_col.value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 1060 entries, 0 to 1059
Series name: diabetes
Non-Null Count  Dtype
--------------  -----
1060 non-null   int64
dtypes: int64(1)
memory usage: 8.4 KB


diabetes
0    1034
1      26
Name: count, dtype: int64

In [78]:
# Recode diabetes in a single pass: 0 -> 'No', 1 -> 'Yes'
df['diabetes'] = df['diabetes'].replace({0: 'No', 1: 'Yes'})

**Cleaning BMI variable**

In [79]:
# BMI is missing in 4 rows (1056 of 1060 entries are non-null)
bmi_col = df['BMI']
bmi_col.info()
bmi_col.unique()

<class 'pandas.core.series.Series'>
RangeIndex: 1060 entries, 0 to 1059
Series name: BMI
Non-Null Count  Dtype  
--------------  -----  
1056 non-null   float64
dtypes: float64(1)
memory usage: 8.4 KB


array([30.08, 26.68, 30.57, 27.24, 23.55, 25.82, 27.8 , 21.98, 29.8 ,
       25.77, 27.3 , 22.32, 33.32, 27.66, 28.57, 24.19, 39.6 , 29.84,
       23.45, 29.08, 20.89, 22.83, 25.09, 28.3 , 23.26, 25.54, 27.56,
       25.5 , 26.48, 25.94, 24.68, 27.22, 26.83, 36.12, 21.02, 32.84,
       23.87, 27.93, 20.68, 22.18, 21.67, 19.72, 24.22, 22.14, 30.74,
       25.65, 23.33, 21.3 , 27.89, 29.5 , 29.38, 20.52, 28.27, 22.35,
       25.75, 28.04, 22.19, 26.07, 26.89, 24.42, 26.84, 20.85, 23.64,
       24.32, 21.35, 20.39, 20.72, 24.35, 30.34, 35.35, 29.48, 26.91,
       26.87, 28.44, 20.02, 28.33, 21.5 , 21.19, 26.59, 25.67, 23.19,
       21.14, 25.48, 28.55, 25.71, 23.09, 21.45, 24.24, 30.16, 23.22,
       20.51, 32.52, 21.11, 26.13, 24.47, 28.4 , 24.71, 32.35, 28.67,
       24.72, 29.19, 29.07, 25.84, 24.49, 20.12, 26.18, 22.85, 36.21,
       27.1 , 24.69, 25.11, 29.43, 21.34, 34.84, 30.43, 25.04, 22.34,
       21.93, 29.42, 17.5 , 23.67, 24.1 , 23.27, 27.45, 24.77, 24.94,
       25.18, 29.35,

In [80]:
# Median of the observed BMI values (NaN entries are skipped by default)
bmi_median = df['BMI'].median()
print(bmi_median)

25.14


In [81]:
# Impute the missing BMI entries with the median, which is less
# sensitive to extreme values than the mean would be
bmi_missing = df['BMI'].isna()
df.loc[bmi_missing, 'BMI'] = bmi_median

**Cleaning glucose variable**

In [82]:
# glucose is missing in 103 rows (957 of 1060 entries are non-null)
glucose_col = df['glucose']
glucose_col.info()
glucose_col.unique()

<class 'pandas.core.series.Series'>
RangeIndex: 1060 entries, 0 to 1059
Series name: glucose
Non-Null Count  Dtype  
--------------  -----  
957 non-null    float64
dtypes: float64(1)
memory usage: 8.4 KB


array([ nan,  74.,  78.,  83.,  73.,  82.,  70.,  87., 205.,  64.,  69.,
        80.,  79.,  62.,  90.,  85.,  58., 119.,  67., 163.,  66.,  77.,
        72., 143.,  71.,  99.,  75.,  91.,  89.,  63., 113., 122., 114.,
       100., 112.,  92., 102., 104.,  61., 108.,  81., 177.,  76.,  88.,
        60.,  68., 386.,  93., 117.,  95.,  59., 124., 107.,  65., 103.,
       137.,  86.,  84.,  45.,  98.,  57., 132., 140.,  94., 110.,  97.,
        55., 115., 325., 368.,  96.,  52.,  44., 170., 120., 123., 101.,
       394.,  53., 130., 223.,  56., 126., 105., 156., 136., 183., 173.,
       155., 106., 145.])

In [83]:
# Median of the observed glucose values (NaN entries are skipped by default)
glucose_median = df['glucose'].median()
print(glucose_median)

78.0


In [84]:
# Impute the missing glucose entries with the median, which is less
# sensitive to the high outliers in this column than the mean would be
glucose_missing = df['glucose'].isna()
df.loc[glucose_missing, 'glucose'] = glucose_median

**Cleaning TenYearCHD variable**

In [85]:
# TenYearCHD has no missing values, but it is stored as a 0/1 integer;
# it should read as a label instead (1 = at risk, 0 = not at risk)
chd_col = df['TenYearCHD']
chd_col.info()
chd_col.value_counts()

<class 'pandas.core.series.Series'>
RangeIndex: 1060 entries, 0 to 1059
Series name: TenYearCHD
Non-Null Count  Dtype
--------------  -----
1060 non-null   int64
dtypes: int64(1)
memory usage: 8.4 KB


TenYearCHD
0    903
1    157
Name: count, dtype: int64

In [86]:
# Recode TenYearCHD in a single pass: 0 -> 'Not at Risk', 1 -> 'At Risk'
df['TenYearCHD'] = df['TenYearCHD'].replace({0: 'Not at Risk', 1: 'At Risk'})

In [87]:
# Collect the cleaned variables into their own frame. Columns not listed here
# (education, BPMeds, totChol, sysBP, diaBP, heartRate) are left out.
clean_columns = [
    'sex',
    'age',
    'currentSmoker',
    'cigsPerDay',
    'prevalentStroke',
    'prevalentHyp',
    'diabetes',
    'BMI',
    'glucose',
    'TenYearCHD',
]
# .copy() gives cleaned_df its own data, independent of df
cleaned_df = df.loc[:, clean_columns].copy()

In [88]:
# Preview the first five rows of the cleaned frame
cleaned_df.head()

Unnamed: 0,sex,age,currentSmoker,cigsPerDay,prevalentStroke,prevalentHyp,diabetes,BMI,glucose,TenYearCHD
0,Female,58,Yes,20.0,No,No,No,30.08,78.0,Not at Risk
1,Female,51,No,0.0,No,No,No,26.68,74.0,Not at Risk
2,Female,44,Yes,9.0,No,Yes,No,30.57,78.0,At Risk
3,Female,40,Yes,20.0,No,No,No,27.24,78.0,At Risk
4,Male,58,Yes,20.0,No,No,No,23.55,78.0,Not at Risk


In [89]:
# Write the cleaned frame to the current working directory;
# index=False keeps the row index out of the file
cleaned_df.to_csv('cleaned_test.csv', index=False)