# DATA CLEANING 

## Missing Values

In [2]:
import pandas as pd

## Example DataFrame

In [4]:
data = {'A': [1, 2, None, 4], 'B': [None, 2, 3, 4], 'C': [1, None, None, 4], 'D': [8, 0, 10, 3]}
df = pd.DataFrame(data)

## Original DataFrame

In [5]:
print("Original DataFrame:")
print(df)

Original DataFrame:
     A    B    C   D
0  1.0  NaN  1.0   8
1  2.0  2.0  NaN   0
2  NaN  3.0  NaN  10
3  4.0  4.0  4.0   3


## Dropping rows with any missing values

In [6]:
df_cleaned_rows = df.dropna()
print("\nDataFrame after dropping rows with any missing values:")
print(df_cleaned_rows)


DataFrame after dropping rows with any missing values:
     A    B    C  D
3  4.0  4.0  4.0  3


## Dropping columns with any missing values

In [8]:
df_cleaned_columns = df.dropna(axis=1) #axis = 0 is for rows
print("\nDataFrame after dropping columns with any missing values:")
print(df_cleaned_columns)


DataFrame after dropping columns with any missing values:
     A    B    C  D
3  4.0  4.0  4.0  3


## Filling missing values with a specific value (e.g., 0)

In [9]:
df_filled = df.fillna(0)
print("\nDataFrame after filling missing values with 0:")
print(df_filled)


DataFrame after filling missing values with 0:
     A    B    C   D
0  1.0  0.0  1.0   8
1  2.0  2.0  0.0   0
2  0.0  3.0  0.0  10
3  4.0  4.0  4.0   3


## Filling missing values with the mean of the column

In [20]:
df_filled_mean = df.fillna(df.mean())
print("\nDataFrame after filling missing values with the mean of the column:")
print(df_filled_mean)
#For categorical data use mode
#For nunerical data use all 3 methods 


DataFrame after filling missing values with the mean of the column:
          A    B    C   D
0  1.000000  3.0  1.0   8
1  2.000000  2.0  2.5   0
2  2.333333  3.0  2.5  10
3  4.000000  4.0  4.0   3


In [21]:
df.duplicated

<bound method DataFrame.duplicated of      A    B    C   D
0  1.0  NaN  1.0   8
1  2.0  2.0  NaN   0
2  NaN  3.0  NaN  10
3  4.0  4.0  4.0   3>

In [22]:
df.drop_duplicates

<bound method DataFrame.drop_duplicates of      A    B    C   D
0  1.0  NaN  1.0   8
1  2.0  2.0  NaN   0
2  NaN  3.0  NaN  10
3  4.0  4.0  4.0   3>

# OUTLIERS

Identifying Outliers Before handling outliers, you need to identify them. Common methods include:

Z-Score: Data points with a Z-score greater than 3 or less than -3 are often considered outliers.

IQR (Interquartile Range): Data points that fall below 𝑄 1 − 1.5 × 𝐼 𝑄 𝑅 Q1−1.5×IQR or above 𝑄 3 + 1.5 × 𝐼 𝑄 𝑅 Q3+1.5×IQR are considered outliers.

Visualization: Use box plots, scatter plots, or histograms to visually detect outliers.

In [24]:
import numpy as np
from scipy import stats

## Example DataFrame

In [25]:
data = {'A': [1, 2, 3, 1000], 'B': [2, 3, 4, 5]}
df = pd.DataFrame(data)

## Original DataFrame

In [26]:
print("Original DataFrame:")
print(df)

Original DataFrame:
      A  B
0     1  2
1     2  3
2     3  4
3  1000  5


## Detecting outliers using the Z-score

In [27]:
z_scores = np.abs(stats.zscore(df)) #The function np.abs() is used to get the absolute values of the Z-scores
print("\nZ-scores for each data point:")
print(z_scores)


Z-scores for each data point:
          A         B
0  0.579664  1.341641
1  0.577349  0.447214
2  0.575035  0.447214
3  1.732048  1.341641


In [28]:
df.describe()

Unnamed: 0,A,B
count,4.0,4.0
mean,251.5,3.5
std,499.000668,1.290994
min,1.0,2.0
25%,1.75,2.75
50%,2.5,3.5
75%,252.25,4.25
max,1000.0,5.0


## Removing outliers (where Z-score > 3)

In [29]:
df_no_outliers = df[(z_scores < 3).all(axis=1)]
print("\nDataFrame after removing outliers:")
print(df_no_outliers)


DataFrame after removing outliers:
      A  B
0     1  2
1     2  3
2     3  4
3  1000  5


# IQR

In [30]:
import pandas as pd

# Example DataFrame
data = {'A': [10, 12, 14, 16, 18, 20, 1000], 'B': [5, 6, 7, 8, 9, 10, 500]}
df = pd.DataFrame(data)

In [31]:
# Identify outliers using IQR
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1

In [32]:
# Filtering out outliers
#lower bound Q1- 1.5 * IQR
#upper bound Q3 + 1.5 * IQR
df_no_outliers = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

In [33]:
# Original DataFrame
print("Original DataFrame:")
print(df)

# DataFrame after removing outliers
print("\nDataFrame after removing outliers:")
print(df_no_outliers)

Original DataFrame:
      A    B
0    10    5
1    12    6
2    14    7
3    16    8
4    18    9
5    20   10
6  1000  500

DataFrame after removing outliers:
    A   B
0  10   5
1  12   6
2  14   7
3  16   8
4  18   9
5  20  10


# Handling Inconsistent Data Types

In [1]:
import pandas as pd

# Example DataFrame
data = {'A': ['1', '2', 'three', '4'], 'B': ['1.1', '2.2', 'three', '4.4']}
df = pd.DataFrame(data)

In [2]:
# Original DataFrame
print("Original DataFrame:")
print(df)

Original DataFrame:
       A      B
0      1    1.1
1      2    2.2
2  three  three
3      4    4.4


In [8]:
# Converting to numeric, forcing invalid parsing to NaN
df['A'] = pd.to_numeric(df['A'], errors='coerce') #errors='coerce': Converts invalid parsing to NaN (Not a Number).
df['B'] = pd.to_numeric(df['B'], errors='coerce')
print("\nDataFrame after converting columns to numeric (forcing invalid to NaN):")
print(df)


DataFrame after converting columns to numeric (forcing invalid to NaN):
     A    B
0  1.0  1.1
1  2.0  2.2
2  NaN  NaN
3  4.0  4.4


In [9]:
# Dropping or filling rows where conversion resulted in NaN
df_cleaned = df.dropna()
print("\nDataFrame after dropping rows with NaN values:")
print(df_cleaned)


DataFrame after dropping rows with NaN values:
     A    B
0  1.0  1.1
1  2.0  2.2
3  4.0  4.4


# Dropping Irrelevant Columns

In [10]:
import pandas as pd

In [11]:
# Example DataFrame
data = {'A': [1, 2, 3, 4], 'B': [1, 2, 3, 4], 'C': ['NotNeeded1', 'NotNeeded2', 'NotNeeded3', 'NotNeeded4']}
df = pd.DataFrame(data)

In [12]:
# Original DataFrame
print("Original DataFrame:")
print(df)

Original DataFrame:
   A  B           C
0  1  1  NotNeeded1
1  2  2  NotNeeded2
2  3  3  NotNeeded3
3  4  4  NotNeeded4


In [13]:
# Dropping irrelevant columns
df_cleaned = df.drop(columns=['C'])
print("\nDataFrame after dropping irrelevant columns:")
print(df_cleaned)


DataFrame after dropping irrelevant columns:
   A  B
0  1  1
1  2  2
2  3  3
3  4  4


# Handling Inconsistent Data Formats

In [14]:
import pandas as pd

In [15]:
# Example DataFrame
data = {'Date': ['01/01/2020', '2020-02-01', 'March 3, 2020']}
df = pd.DataFrame(data)

In [16]:
# Original DataFrame
print("Original DataFrame:")
print(df)

Original DataFrame:
            Date
0     01/01/2020
1     2020-02-01
2  March 3, 2020


In [17]:
# Converting to a consistent date format
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
print("\nDataFrame after converting to a consistent date format:")
print(df)


DataFrame after converting to a consistent date format:
        Date
0 2020-01-01
1 2020-02-01
2 2020-03-03


# Dropping one column

In [6]:
data = {'A': [1, 2, None, 4], 'B': [None, 2, 3, 4], 'C': [1, None, None, 4], 'D': [8, 0, 10, 3]}
df = pd.DataFrame(data)

In [7]:
df.drop('C', axis = 1)

Unnamed: 0,A,B,D
0,1.0,,8
1,2.0,2.0,0
2,,3.0,10
3,4.0,4.0,3


# Dropping multiple column

In [8]:
df.drop(['D','B'], axis = 1)

Unnamed: 0,A,C
0,1.0,1.0
1,2.0,
2,,
3,4.0,4.0


# Dropping one row

In [9]:
df.drop(2, axis = 0)

Unnamed: 0,A,B,C,D
0,1.0,,1.0,8
1,2.0,2.0,,0
3,4.0,4.0,4.0,3


# Dropping multiple rows

In [12]:
df.drop([2, 1, 0], axis = 0)

Unnamed: 0,A,B,C,D
3,4.0,4.0,4.0,3


# Rename the column

In [13]:
df.rename(columns = {'D':'Z'})

Unnamed: 0,A,B,C,Z
0,1.0,,1.0,8
1,2.0,2.0,,0
2,,3.0,,10
3,4.0,4.0,4.0,3


# Rename multiple column

In [14]:
df.rename(columns = {'D':'Z', 'A':'W', 'B':'X', 'C':'Y'})

Unnamed: 0,W,X,Y,Z
0,1.0,,1.0,8
1,2.0,2.0,,0
2,,3.0,,10
3,4.0,4.0,4.0,3
