# Cleaning Practice
Let's first practice handling missing values and duplicate data using the `cancer_data_means.csv` file.

In [2]:
# import pandas  
import pandas as pd 

In [3]:
# Read the cancer dataset from its CSV file, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer="cancer_data_means.csv")
df.head(n=5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean
0,842302,M,17.99,,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,84348301,M,11.42,20.38,77.58,386.1,,0.2839,0.2414,0.1052,0.2597,0.09744
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [15]:
# summarize every column's dtype and non-null count to spot columns with gaps
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      569 non-null    int64  
 1   diagnosis               569 non-null    object 
 2   radius_mean             569 non-null    float64
 3   texture_mean            548 non-null    float64
 4   perimeter_mean          569 non-null    float64
 5   area_mean               569 non-null    float64
 6   smoothness_mean         521 non-null    float64
 7   compactness_mean        569 non-null    float64
 8   concavity_mean          569 non-null    float64
 9   concave_points_mean     569 non-null    float64
 10  symmetry_mean           504 non-null    float64
 11  fractal_dimension_mean  569 non-null    float64
dtypes: float64(10), int64(1), object(1)
memory usage: 53.5+ KB


In [11]:
# Tally the missing entries per column; a non-zero count marks a column to clean.
missing_values = df.isnull().sum(axis=0)
missing_values

id                         0
diagnosis                  0
radius_mean                0
texture_mean              21
perimeter_mean             0
area_mean                  0
smoothness_mean           48
compactness_mean           0
concavity_mean             0
concave_points_mean        0
symmetry_mean             65
fractal_dimension_mean     0
dtype: int64

In [14]:
# Flag the columns that contain at least one missing value.
is_missing = missing_values > 0
# End the cell with the Series itself so the boolean mask is actually displayed;
# a bare assignment produces no output on a fresh run.
is_missing

id                        False
diagnosis                 False
radius_mean               False
texture_mean               True
perimeter_mean            False
area_mean                 False
smoothness_mean            True
compactness_mean          False
concavity_mean            False
concave_points_mean       False
symmetry_mean              True
fractal_dimension_mean    False
dtype: bool

In [13]:
# Gather the names of the flagged columns into a plain Python list.
missing_cols = [column for column, has_gap in is_missing.items() if has_gap]
missing_cols

['texture_mean', 'smoothness_mean', 'symmetry_mean']

In [16]:
# Use each column's own mean to fill in its missing values.
# The column list comes from `missing_cols` (computed above) instead of being
# typed out a second time, so the fill always matches the columns that were
# actually detected as incomplete.
values = df[missing_cols].mean().to_dict()

# Rebind rather than mutate in place: fillna returns a new frame.
df = df.fillna(value=values)

# confirm your correction: every column should now report 0 missing values
df.isnull().sum()

id                        0
diagnosis                 0
radius_mean               0
texture_mean              0
perimeter_mean            0
area_mean                 0
smoothness_mean           0
compactness_mean          0
concavity_mean            0
concave_points_mean       0
symmetry_mean             0
fractal_dimension_mean    0
dtype: int64

In [19]:
# Count the rows that repeat an earlier row exactly (the first occurrence is not counted).
df.duplicated(keep="first").sum()

5

In [20]:
# Drop duplicate rows, keeping the first occurrence of each.
# Rebinding the result (instead of inplace=True) is the form pandas recommends.
df = df.drop_duplicates()

In [21]:
# Re-run the duplicate count to verify the cleanup; the expected result is 0.
df.duplicated(keep="first").sum()

0

## Renaming Columns
Since we previously reduced our dataset to include only the means of the tumor features, the "_mean" suffix at the end of each feature name is unnecessary; it just adds extra typing in our later analysis. Rename the columns of the dataframe to remove "_mean".

In [22]:
# Rename the columns of the dataframe: strip the trailing "_mean" from each
# column name that ends with it. Only the suffix is removed, so a "_mean"
# occurring in the middle of a name would be left untouched.
MEAN_SUFFIX = "_mean"
df = df.rename(
    columns=lambda name: name[: -len(MEAN_SUFFIX)] if name.endswith(MEAN_SUFFIX) else name
)

In [24]:
# List the column names to confirm that the "_mean" suffix is gone.
list(df.columns)

['id',
 'diagnosis',
 'radius',
 'texture',
 'perimeter',
 'area',
 'smoothness',
 'compactness',
 'concavity',
 'concave_points',
 'symmetry',
 'fractal_dimension']

In [26]:
# Save the cleaned data for later use as "cancer_data_edited.csv".
# index=False leaves the dataframe's row index out of the saved CSV file.
df.to_csv(path_or_buf='cancer_data_edited.csv', index=False)