# Functions for Cleaning DataFrames

In [61]:
import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the whole session.
pd.options.mode.chained_assignment = None

In [62]:
# Sample employee records, one tuple per row.
# The first and last rows are identical, which the .drop_duplicates() section relies on.
column_names = ["Name", "Department", "Employee Number", "Email"]
employee_rows = [
    ("Anna Smith", "Human Resources", 1335, "anna.smith@data.com"),
    ("John Doe", "Sales", 1532, "john.doe@data.com"),
    ("Juan Cruz", "Finance", 1234, "juan.cruz@data.com"),
    ("Amy Sanchez", "Business Intelligence", 1664, "amy.sanchez@data.com"),
    ("Anna Smith", "Human Resources", 1335, "anna.smith@data.com"),
]
df = pd.DataFrame(employee_rows, columns=column_names)

df

Unnamed: 0,Name,Department,Employee Number,Email
0,Anna Smith,Human Resources,1335,anna.smith@data.com
1,John Doe,Sales,1532,john.doe@data.com
2,Juan Cruz,Finance,1234,juan.cruz@data.com
3,Amy Sanchez,Business Intelligence,1664,amy.sanchez@data.com
4,Anna Smith,Human Resources,1335,anna.smith@data.com


## .str.replace()

In [63]:
## .str.replace() substitutes every occurrence of the search text with the replacement text.
## It is applied to every row of the column.
## regex=False makes "@" a literal string, so the result does not depend on the
## default value of `regex`, which differs between pandas versions.

df["Email"] = df["Email"].str.replace("@", "+", regex=False) ## replaces each '@' in the Email column with '+'

df

Unnamed: 0,Name,Department,Employee Number,Email
0,Anna Smith,Human Resources,1335,anna.smith+data.com
1,John Doe,Sales,1532,john.doe+data.com
2,Juan Cruz,Finance,1234,juan.cruz+data.com
3,Amy Sanchez,Business Intelligence,1664,amy.sanchez+data.com
4,Anna Smith,Human Resources,1335,anna.smith+data.com


## .astype()

In [64]:
## pandas inferred an integer dtype (int64) for the Employee Number column,
## as the Dtype column of the summary printed by .info() shows.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Name             5 non-null      object
 1   Department       5 non-null      object
 2   Employee Number  5 non-null      int64 
 3   Email            5 non-null      object
dtypes: int64(1), object(3)
memory usage: 288.0+ bytes


In [65]:
## .astype() returns the column converted to the requested type.
## The conversion is applied to every row of the column; here the integer
## employee numbers become strings.

employee_number_text = df["Employee Number"].astype("str")
df["Employee Number"] = employee_number_text

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Name             5 non-null      object
 1   Department       5 non-null      object
 2   Employee Number  5 non-null      object
 3   Email            5 non-null      object
dtypes: object(4)
memory usage: 288.0+ bytes


## .drop_duplicates()

In [66]:
## .drop_duplicates() removes rows that are identical to an earlier row in every column.
## The first occurrence is kept and the remaining rows keep their original index labels.
## .copy() makes the result an independent DataFrame, so the column assignments in the
## following cells do not write into a filtered selection of the original frame.

df = df.drop_duplicates().copy()

df

Unnamed: 0,Name,Department,Employee Number,Email
0,Anna Smith,Human Resources,1335,anna.smith+data.com
1,John Doe,Sales,1532,john.doe+data.com
2,Juan Cruz,Finance,1234,juan.cruz+data.com
3,Amy Sanchez,Business Intelligence,1664,amy.sanchez+data.com


## .str.lower(), .str.upper(), .str.title()

In [67]:
## .str.lower() converts every character of each string to lowercase.
## It is applied to every row of the column.

department_lowercase = df["Department"].str.lower()
df["Department"] = department_lowercase

df

Unnamed: 0,Name,Department,Employee Number,Email
0,Anna Smith,human resources,1335,anna.smith+data.com
1,John Doe,sales,1532,john.doe+data.com
2,Juan Cruz,finance,1234,juan.cruz+data.com
3,Amy Sanchez,business intelligence,1664,amy.sanchez+data.com


In [68]:
## The same lowercase conversion, this time selecting and assigning the column
## with .loc (all rows, "Department" column) instead of plain bracket indexing.

department_lowercase = df.loc[:, "Department"].str.lower()
df.loc[:, "Department"] = department_lowercase

df

Unnamed: 0,Name,Department,Employee Number,Email
0,Anna Smith,human resources,1335,anna.smith+data.com
1,John Doe,sales,1532,john.doe+data.com
2,Juan Cruz,finance,1234,juan.cruz+data.com
3,Amy Sanchez,business intelligence,1664,amy.sanchez+data.com


In [69]:
## .str.upper() converts every character of each string to uppercase.
## It is applied to every row of the column.

department_uppercase = df.loc[:, "Department"].str.upper()
df.loc[:, "Department"] = department_uppercase

df

Unnamed: 0,Name,Department,Employee Number,Email
0,Anna Smith,HUMAN RESOURCES,1335,anna.smith+data.com
1,John Doe,SALES,1532,john.doe+data.com
2,Juan Cruz,FINANCE,1234,juan.cruz+data.com
3,Amy Sanchez,BUSINESS INTELLIGENCE,1664,amy.sanchez+data.com


In [70]:
## .str.title() capitalizes the first letter of each word and lowercases the rest.
## It is applied to every row of the column.

department_titlecase = df.loc[:, "Department"].str.title()
df.loc[:, "Department"] = department_titlecase

df

Unnamed: 0,Name,Department,Employee Number,Email
0,Anna Smith,Human Resources,1335,anna.smith+data.com
1,John Doe,Sales,1532,john.doe+data.com
2,Juan Cruz,Finance,1234,juan.cruz+data.com
3,Amy Sanchez,Business Intelligence,1664,amy.sanchez+data.com
