# Cleaning Practice
Let's first practice handling missing values and duplicate data using the `cancer_data_means.csv` file.

In [15]:
# Import pandas, then load cancer_data_means.csv into the `cancer_data` DataFrame.
import pandas as pd

cancer_data = pd.read_csv(filepath_or_buffer="cancer_data_means.csv")

In [16]:
# Tally the null entries in every column, then report only the columns that have any.
missing = cancer_data.isna().sum()
print("Columns with missing values:", missing.loc[missing.gt(0)], sep="\n")



Columns with missing values:
texture_mean       21
smoothness_mean    48
symmetry_mean      65
dtype: int64


In [17]:
# Fill each missing value with the mean of its own column.
# Only numeric columns are mean-filled; non-numeric columns are left untouched
# because a mean cannot be computed for them.
numeric_columns = cancer_data.select_dtypes(include=['number']).columns
column_means = cancer_data[numeric_columns].mean()
cancer_data[numeric_columns] = cancer_data[numeric_columns].fillna(column_means)

# confirm your correction: no numeric column may still contain a missing value
remaining_missing = cancer_data[numeric_columns].isnull().sum()
assert remaining_missing.sum() == 0, (
    f"Numeric columns still contain missing values:\n{remaining_missing[remaining_missing > 0]}"
)
cancer_data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean
0,842302,M,17.99,19.293431,122.80,1001.0,0.118400,0.27760,0.30010,0.14710,0.2419,0.07871
1,842517,M,20.57,17.770000,132.90,1326.0,0.084740,0.07864,0.08690,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.250000,130.00,1203.0,0.109600,0.15990,0.19740,0.12790,0.2069,0.05999
3,84348301,M,11.42,20.380000,77.58,386.1,0.096087,0.28390,0.24140,0.10520,0.2597,0.09744
4,84358402,M,20.29,14.340000,135.10,1297.0,0.100300,0.13280,0.19800,0.10430,0.1809,0.05883
...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.390000,142.00,1479.0,0.111000,0.11590,0.24390,0.13890,0.1726,0.05623
565,926682,M,20.13,28.250000,131.20,1261.0,0.097800,0.10340,0.14400,0.09791,0.1752,0.05533
566,926954,M,16.60,28.080000,108.30,858.1,0.084550,0.10230,0.09251,0.05302,0.1590,0.05648
567,927241,M,20.60,29.330000,140.10,1265.0,0.117800,0.27700,0.35140,0.15200,0.2397,0.07016


In [24]:
# how many duplicates are there ?
# duplicated() flags every row that repeats an earlier row; summing the flags counts them
is_repeat_row = cancer_data.duplicated(keep="first")
duplicates_count = is_repeat_row.sum()
print(duplicates_count)

5


In [25]:
# drop duplicates: keep the first occurrence of each repeated row and
# store the result under a new name so `cancer_data` itself is not modified
cancer_data_cleaned = cancer_data.drop_duplicates(keep="first")
cancer_data_cleaned

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean
0,842302,M,17.99,19.293431,122.80,1001.0,0.118400,0.27760,0.30010,0.14710,0.2419,0.07871
1,842517,M,20.57,17.770000,132.90,1326.0,0.084740,0.07864,0.08690,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.250000,130.00,1203.0,0.109600,0.15990,0.19740,0.12790,0.2069,0.05999
3,84348301,M,11.42,20.380000,77.58,386.1,0.096087,0.28390,0.24140,0.10520,0.2597,0.09744
4,84358402,M,20.29,14.340000,135.10,1297.0,0.100300,0.13280,0.19800,0.10430,0.1809,0.05883
...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.390000,142.00,1479.0,0.111000,0.11590,0.24390,0.13890,0.1726,0.05623
565,926682,M,20.13,28.250000,131.20,1261.0,0.097800,0.10340,0.14400,0.09791,0.1752,0.05533
566,926954,M,16.60,28.080000,108.30,858.1,0.084550,0.10230,0.09251,0.05302,0.1590,0.05648
567,927241,M,20.60,29.330000,140.10,1265.0,0.117800,0.27700,0.35140,0.15200,0.2397,0.07016


In [26]:
# confirm correction by rechecking for duplicates in the data
remaining_repeats = cancer_data_cleaned.duplicated(keep="first")
duplicates_count_after = remaining_repeats.sum()
print(duplicates_count_after)

0


## Renaming Columns
Since we previously reduced our dataset to include only the means of the tumor features, the "_mean" suffix at the end of each feature name is unnecessary; it just takes extra time to type in our analysis later. Rename the columns of the dataframe to remove "_mean".

In [28]:
# rename the columns of the dataframe (remove a trailing "_mean" from each column name if found)
# Start from the de-duplicated frame (`cancer_data_cleaned`), not `cancer_data`,
# so the duplicate rows dropped earlier are not carried into the renamed data.
MEAN_SUFFIX = "_mean"
cancer_data_renamed = cancer_data_cleaned.rename(
    # strip the suffix only at the end of the name; leave every other column name as is
    columns=lambda name: name[:-len(MEAN_SUFFIX)] if name.endswith(MEAN_SUFFIX) else name
)
cancer_data_renamed

Unnamed: 0,id,diagnosis,radius,texture,perimeter,area,smoothness,compactness,concavity,concave_points,symmetry,fractal_dimension
0,842302,M,17.99,19.293431,122.80,1001.0,0.118400,0.27760,0.30010,0.14710,0.2419,0.07871
1,842517,M,20.57,17.770000,132.90,1326.0,0.084740,0.07864,0.08690,0.07017,0.1812,0.05667
2,84300903,M,19.69,21.250000,130.00,1203.0,0.109600,0.15990,0.19740,0.12790,0.2069,0.05999
3,84348301,M,11.42,20.380000,77.58,386.1,0.096087,0.28390,0.24140,0.10520,0.2597,0.09744
4,84358402,M,20.29,14.340000,135.10,1297.0,0.100300,0.13280,0.19800,0.10430,0.1809,0.05883
...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.390000,142.00,1479.0,0.111000,0.11590,0.24390,0.13890,0.1726,0.05623
565,926682,M,20.13,28.250000,131.20,1261.0,0.097800,0.10340,0.14400,0.09791,0.1752,0.05533
566,926954,M,16.60,28.080000,108.30,858.1,0.084550,0.10230,0.09251,0.05302,0.1590,0.05648
567,927241,M,20.60,29.330000,140.10,1265.0,0.117800,0.27700,0.35140,0.15200,0.2397,0.07016


In [29]:
# display first few rows of the dataframe to confirm changes
first_rows = cancer_data_renamed.head(n=5)
print(first_rows)

         id diagnosis  radius    texture  perimeter    area  smoothness  \
0    842302         M   17.99  19.293431     122.80  1001.0    0.118400   
1    842517         M   20.57  17.770000     132.90  1326.0    0.084740   
2  84300903         M   19.69  21.250000     130.00  1203.0    0.109600   
3  84348301         M   11.42  20.380000      77.58   386.1    0.096087   
4  84358402         M   20.29  14.340000     135.10  1297.0    0.100300   

   compactness  concavity  concave_points  symmetry  fractal_dimension  
0      0.27760     0.3001         0.14710    0.2419            0.07871  
1      0.07864     0.0869         0.07017    0.1812            0.05667  
2      0.15990     0.1974         0.12790    0.2069            0.05999  
3      0.28390     0.2414         0.10520    0.2597            0.09744  
4      0.13280     0.1980         0.10430    0.1809            0.05883  


In [30]:
# save this for later as a csv file named "cancer_data_edited.csv"
# index=False: otherwise the row index is written as an extra unnamed first column,
# which comes back as an "Unnamed: 0" column when the file is read again with read_csv
cancer_data_renamed.to_csv("cancer_data_edited.csv", index=False)