# Steps for Data Preprocessing

* Step 1: Import important libraries
* Step 2: Import dataset
* Step 3: Preprocessing: Find duplicates, Missing value treatment, Encoding, Handling data types, Outlier treatment, Feature scaling, Data balancing

In [1]:
## import important Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Import Important Libraries
Purpose of Libraries
 * os: Functions to interact with the operating system.
 * Example Usage: os.listdir() lists files and directories in the specified path.
 * numpy: Support for arrays, matrices, and mathematical functions.
 * pandas: Data manipulation and analysis.
 * matplotlib & seaborn: Data visualization.
 * sns.set(): Automatically sets the seaborn plot aesthetics to a default theme.
 * warnings: Manage warning messages in code.
 * %matplotlib inline: A magic command used in Jupyter notebooks to display matplotlib plots inline within the notebook.

In [3]:
# Load the Titanic passenger data from the notebook-relative `Datasets` folder.
# Forward slashes are used because they resolve on Windows, Linux and macOS
# alike; a backslash separator is only understood as a path separator on Windows.
dataset = pd.read_csv('Datasets/titanic3.csv')
# Preview the first five rows as a quick sanity check of the load.
dataset.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [6]:
# Count rows that repeat an earlier row across all columns.
# For this dataset the count is zero.
duplicate_row_count = dataset.duplicated().sum()
print(duplicate_row_count)

0


In [7]:
# Frame dimensions as (rows, columns) before duplicate rows are removed.
dataset.shape

(1310, 14)

In [12]:
# Remove duplicated rows, keeping the first occurrence of each.
# The result is rebound to `dataset` instead of using inplace=True, so the
# cell reads as a plain assignment and no frame is mutated behind the scenes.
dataset = dataset.drop_duplicates(keep='first')

In [13]:
# Frame dimensions as (rows, columns) after the duplicate-removal step.
dataset.shape

(1310, 14)

### Handling missing values
- For categorical variable fill it with mode
- For numerical fill it with mean or median
- If a feature has a large share of missing values (here: more than 40%), drop the entire feature
- If the dataset contains only a small number of missing values, drop the entire row

In [14]:
# Number of missing entries in every column of the raw frame.
dataset.isna().sum()

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64

In [18]:
# Share of missing values per column, in percent, computed for the whole frame at once.
missing_percentages = dataset.isnull().sum() / len(dataset) * 100

# Columns with more than 40% of their values missing are earmarked for removal.
columns_to_be_removed = [
    name for name, share in missing_percentages.items() if share > 40
]

# Report the missing share of every column, then the columns selected for removal.
for column, percentage in missing_percentages.items():
    print(f"{column} has {percentage} percentage of missing values\n")
print(columns_to_be_removed)

pclass has 0.07633587786259542 percentage of missing values

survived has 0.07633587786259542 percentage of missing values

name has 0.07633587786259542 percentage of missing values

sex has 0.07633587786259542 percentage of missing values

age has 20.15267175572519 percentage of missing values

sibsp has 0.07633587786259542 percentage of missing values

parch has 0.07633587786259542 percentage of missing values

ticket has 0.07633587786259542 percentage of missing values

fare has 0.15267175572519084 percentage of missing values

cabin has 77.48091603053436 percentage of missing values

embarked has 0.22900763358778628 percentage of missing values

boat has 62.90076335877862 percentage of missing values

body has 90.76335877862596 percentage of missing values

home.dest has 43.12977099236641 percentage of missing values

['cabin', 'boat', 'body', 'home.dest']


In [20]:
# Drop the columns flagged as having too many missing values.
# - The result is rebound rather than mutated in place, and errors='ignore'
#   skips labels that are already gone, so re-running this cell does not
#   raise a KeyError.
# - `columns=` already selects the column axis, so no `axis` argument is needed.
dataset = dataset.drop(columns=columns_to_be_removed, errors='ignore')
# Preview the remaining columns.
dataset.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,S
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,S
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,S
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,S
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,S


In [21]:
# Missing entries per column once the sparse columns have been dropped.
dataset.isna().sum()

pclass        1
survived      1
name          1
sex           1
age         264
sibsp         1
parch         1
ticket        1
fare          2
embarked      3
dtype: int64

In [22]:
## Impute missing values column by column:
##  - the categorical features listed below are filled with their mode
##  - the numerical features listed below are filled with their mean
## NOTE(review): 'embarked' is categorical as well, but it is not in the list
## below, so its missing values are left untouched by this cell; 'age' and
## 'ticket' are not handled here either - confirm they are treated elsewhere.

categorical_features = ['pclass', 'survived', 'name', 'sex']
numerical_features = ['sibsp', 'parch', 'fare']

for feature in categorical_features:
    # mode() returns a Series; its first entry is used as the fill value
    most_frequent = dataset[feature].mode()[0]
    dataset[feature] = dataset[feature].fillna(most_frequent)

for feature in numerical_features:
    column_mean = dataset[feature].mean()
    dataset[feature] = dataset[feature].fillna(column_mean)

# How many features of each kind were processed.
categorical_sum = len(categorical_features)
numerical_sum = len(numerical_features)
categorical_sum, numerical_sum

(4, 3)

In [23]:
# Missing entries per column after the mode / mean imputation step.
dataset.isna().sum()

pclass        0
survived      0
name          0
sex           0
age         264
sibsp         0
parch         0
ticket        1
fare          0
embarked      3
dtype: int64