<a href="https://colab.research.google.com/github/usshaa/BK_Birla/blob/main/06-Data%20Preprocessing%20and%20Feature%20Engineering/Data_Preprocessing_Titanic_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing (Pandas & Scikit-learn)
- **What is Data Preprocessing?** Importance in ML pipeline
- **Data Cleaning:** Handling missing values
- **Feature Scaling:** Normalization vs Standardization
- **Feature Engineering:** Encoding categorical variables, feature selection

In [499]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# 1. Data Loading

In [500]:
import pandas as pd
# Load the dataset directly from the Kaggle link (replace with the actual link)
data = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 2. Data Cleaning

In [501]:
# Handle Duplicates
data.drop_duplicates(inplace=True)

In [502]:
# Handling Missing Values
# Use SimpleImputer to fill missing values with the mean (you can change the strategy)
numerical_cols = data.select_dtypes(include=np.number).columns  # Identify numerical columns
categorical_cols = data.select_dtypes(exclude=np.number).columns # Identify categorical columns

In [503]:
imputer_num = SimpleImputer(strategy='mean') # use mean for numerical columns
imputer_cat = SimpleImputer(strategy='most_frequent') # use most frequent for categorical columns

In [504]:
data[numerical_cols] = imputer_num.fit_transform(data[numerical_cols])
data[categorical_cols] = imputer_cat.fit_transform(data[categorical_cols])
data[numerical_cols]
data[categorical_cols]

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,B96 B98,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,B96 B98,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,B96 B98,S
...,...,...,...,...,...
886,"Montvila, Rev. Juozas",male,211536,B96 B98,S
887,"Graham, Miss. Margaret Edith",female,112053,B42,S
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607,B96 B98,S
889,"Behr, Mr. Karl Howell",male,111369,C148,C


In [505]:
data.count()

Unnamed: 0,0
PassengerId,891
Survived,891
Pclass,891
Name,891
Sex,891
Age,891
SibSp,891
Parch,891
Ticket,891
Fare,891


In [506]:
from scipy import stats

z_scores = stats.zscore(data[numerical_cols])
data = data[(abs(z_scores) < 3).all(axis=1)]  # Keeping only values within 3 standard deviations

In [507]:
# new_data = data.dropna()
data.count()

Unnamed: 0,0
PassengerId,820
Survived,820
Pclass,820
Name,820
Sex,820
Age,820
SibSp,820
Parch,820
Ticket,820
Fare,820


In [508]:
# 3. Feature Scaling

In [509]:
# Normalization (Min-Max Scaling)
scaler_minmax = MinMaxScaler()
data[numerical_cols] = scaler_minmax.fit_transform(data[numerical_cols])

In [510]:
# Standardization (Z-score normalization)
scaler_standard = StandardScaler()
data[numerical_cols] = scaler_standard.fit_transform(data[numerical_cols])

In [511]:
# 4. Feature Engineering

In [512]:
# # Encoding categorical variables (One-hot encoding)
# encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # Handle unknown categories
# encoded_features = encoder.fit_transform(data[['Sex','Embarked']])
# encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['Sex','Embarked'])) # Provide feature names directly
# data = data.drop(['Sex', 'Embarked'], axis=1) # Drop original columns after encoding
# data = pd.concat([data, encoded_df], axis=1)
# data

In [513]:
# LabeEncoder for Sex and Embarked column

# Encoding categorical variables (Label encoding)
encoder = LabelEncoder()
data['Sex'] = encoder.fit_transform(data['Sex'])
data['Embarked'] = encoder.fit_transform(data['Embarked'])
data


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,-1.743605,-0.80,0.833995,"Braund, Mr. Owen Harris",1,-0.641702,1.051956,-0.451141,A/5 21171,-0.623707,B96 B98,2
1,-1.739685,1.25,-1.586804,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,0.675821,1.051956,-0.451141,PC 17599,1.525109,C85,0
2,-1.735764,1.25,0.833995,"Heikkinen, Miss. Laina",0,-0.312321,-0.563407,-0.451141,STON/O2. 3101282,-0.601055,B96 B98,2
3,-1.731843,1.25,-1.586804,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,0.428785,1.051956,-0.451141,113803,0.914918,C123,2
4,-1.727923,-0.80,0.833995,"Allen, Mr. William Henry",1,0.428785,-0.563407,-0.451141,373450,-0.596861,B96 B98,2
...,...,...,...,...,...,...,...,...,...,...,...,...
886,1.730122,-0.80,-0.376405,"Montvila, Rev. Juozas",1,-0.229976,-0.563407,-0.451141,211536,-0.430749,B96 B98,2
887,1.734043,1.25,-1.586804,"Graham, Miss. Margaret Edith",0,-0.888737,-0.563407,-0.451141,112053,0.139733,B42,2
888,1.737964,-0.80,0.833995,"Johnston, Miss. Catherine Helen ""Carrie""",0,-0.007717,1.051956,3.088912,W./C. 6607,-0.080071,B96 B98,2
889,1.741884,1.25,-1.586804,"Behr, Mr. Karl Howell",1,-0.312321,-0.563407,-0.451141,111369,0.139733,C148,0


In [514]:
# select input feature and target data

# Define features (X) and target (y)
X = data.drop(['Survived','Name','Ticket','Cabin','PassengerId','SibSp','Parch'], axis=1)  # Features (all columns except 'Survived')
y = data['Survived']  # Target variable ('Survived')


In [515]:
#Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train


Unnamed: 0,Pclass,Sex,Age,Fare,Embarked
194,-1.586804,0,1.169891,0.063248,0
269,-1.586804,0,0.428785,3.684553,2
490,0.833995,1,-0.007717,-0.196962,2
390,-1.586804,1,0.511130,3.159934,2
538,0.833995,1,-0.007717,-0.380413,2
...,...,...,...,...,...
79,0.833995,0,0.017059,-0.448367,2
120,-0.376405,1,-0.724047,1.599497,2
296,0.833995,1,-0.518184,-0.624405,0
471,0.833995,1,0.675821,-0.576306,2
