# Task: Perform data cleaning, handling missing values, and normalization on the dataset.


In [3]:
 import pandas as pd
 import numpy as np
 from sklearn.preprocessing import MinMaxScaler


In [8]:
 # load Dataset
df = pd.read_csv(r'dataset.data')
 # 842302  M  17.99  10.38  122.8    1001   0.1184   0.2776  0.3001   0.1471 -> 

##### Handling Missing Values

In [9]:
 # 1. Checking for null values
 df.isnull().sum()
 # 2. Droping rows with null values
 df.dropna()
 # 3. Fill missing values with mean/median/mode/global constant
 df['M'].fillna(df['M'].mode())

0      M
1      M
2      M
3      M
4      M
      ..
563    M
564    M
565    M
566    M
567    B
Name: M, Length: 568, dtype: object

######  Data Cleaning (Handling missing values also comes under Data Cleaning)

In [10]:
 from scipy.stats import zscore
 # 1. Remove duplicate values
 df.drop_duplicates()
 # 2. Outliers Detection and removing
 df = df[zscore(df['17.99']).abs() <=3]
 # 3. Variable Encoding | Handling Categorical data 
pd.get_dummies(df)
 df["M"] = df["M"].apply(lambda x : 1 if x == "M" else 0)
 df

Unnamed: 0,842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
0,842517,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
1,84300903,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
2,84348301,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
3,84358402,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
4,843786,1,12.45,15.70,82.57,477.1,0.12780,0.17000,0.15780,0.08089,...,15.470,23.75,103.40,741.6,0.17910,0.52490,0.5355,0.1741,0.3985,0.12440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
563,926424,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
564,926682,1,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
565,926954,1,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
566,927241,1,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


###### Normalization

In [13]:
from sklearn.preprocessing import MinMaxScaler
normal = MinMaxScaler().fit_transform(df)
df = pd.DataFrame(normal,columns=df.columns)
df
 

Unnamed: 0,842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
0,0.000915,1.0,0.769959,0.272574,0.727963,0.696613,0.289880,0.181768,0.203608,0.348757,...,0.694625,0.303571,0.662571,0.545399,0.347553,0.154563,0.192971,0.639175,0.233590,0.222878
1,0.092495,1.0,0.720097,0.390260,0.704273,0.624153,0.514309,0.431017,0.462512,0.635686,...,0.636808,0.360075,0.624060,0.469324,0.483590,0.385375,0.359744,0.835052,0.403706,0.213433
2,0.092547,1.0,0.251516,0.360839,0.276040,0.142916,0.811321,0.811361,0.565604,0.522863,...,0.284202,0.385928,0.296228,0.117808,0.915472,0.814012,0.548642,0.884880,1.000000,0.773711
3,0.092559,1.0,0.754094,0.156578,0.745936,0.679529,0.430351,0.347893,0.463918,0.518390,...,0.594870,0.123934,0.622226,0.428052,0.437364,0.172415,0.319489,0.558419,0.157500,0.142595
4,0.000916,1.0,0.309876,0.202570,0.316804,0.196524,0.678613,0.461996,0.369728,0.402038,...,0.307003,0.312633,0.323920,0.171369,0.712739,0.482784,0.427716,0.598282,0.477035,0.454939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558,0.001007,1.0,0.826052,0.428813,0.802304,0.786745,0.526948,0.296055,0.571462,0.690358,...,0.713355,0.383262,0.707195,0.567266,0.461137,0.178527,0.328035,0.761512,0.097575,0.105667
559,0.001007,1.0,0.745028,0.626987,0.714076,0.658321,0.407782,0.257714,0.337395,0.486630,...,0.641694,0.699094,0.639342,0.476100,0.300007,0.159997,0.256789,0.559450,0.198502,0.074315
560,0.001008,1.0,0.545017,0.621238,0.526999,0.420972,0.288165,0.254340,0.216753,0.263519,...,0.449919,0.589019,0.466349,0.289146,0.282177,0.273705,0.271805,0.487285,0.128721,0.151909
561,0.001008,1.0,0.771658,0.663510,0.786782,0.660677,0.588336,0.790197,0.823336,0.755467,...,0.725163,0.730277,0.820282,0.503819,0.619626,0.815758,0.749760,0.910653,0.497142,0.452315
