In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = pd.read_csv(url)

df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df.drop(columns=['Cabin'], inplace=True)

Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR
df.loc[df['Fare'] > upper_bound, 'Fare'] = upper_bound

df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,65.6344,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [4]:
#Exercise 1— Encoding Categorical Data
df_encoded = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)
df_encoded.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,True,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,65.6344,False,False,False
2,3,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,False,False,True
4,5,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,True,False,True


In [5]:
le = LabelEncoder()
df_encoded['Pclass'] = le.fit_transform(df_encoded['Pclass'])
print(df_encoded['Pclass'].value_counts())
df_encoded.head()

Pclass
2    491
0    216
1    184
Name: count, dtype: int64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,2,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,True,False,True
1,2,1,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,65.6344,False,False,False
2,3,1,2,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,False,False,True
3,4,1,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,False,False,True
4,5,0,2,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,True,False,True


In [6]:
#Exercise 2— Scaling Numerical Data
scaler_std = StandardScaler()
scaled_std = scaler_std.fit_transform(df_encoded[['Age','Fare']])
df_std = pd.DataFrame(scaled_std, columns=['Age_std','Fare_std'])
df_std.describe()

Unnamed: 0,Age_std,Fare_std
count,891.0,891.0
mean,2.27278e-16,9.968332e-17
std,1.000562,1.000562
min,-2.224156,-1.174727
25%,-0.5657365,-0.7882908
50%,-0.1046374,-0.4686152
75%,0.4333115,0.3396748
max,3.891554,2.031623


In [7]:
scaler_mm = MinMaxScaler()
scaled_mm = scaler_mm.fit_transform(df_encoded[['Age','Fare']])
df_mm = pd.DataFrame(scaled_mm, columns=['Age_norm','Fare_norm'])
df_mm.describe()

Unnamed: 0,Age_norm,Fare_norm
count,891.0,891.0
mean,0.363679,0.366375
std,0.163605,0.312056
min,0.0,0.0
25%,0.271174,0.120522
50%,0.346569,0.220223
75%,0.434531,0.472313
max,1.0,1.0


1. neighborhood– One-Hot Encoding, because this is a nominal categorical variable with no natural order.
2. num_rooms– StandardScaler or MinMaxScaler, because this is a numerical variable and can be standardized/normalized for machine learning.
3. building_condition– Label Encoding, because there is a natural order (Poor < Average < Good < Excellent).
4. price_amd– StandardScaler or MinMaxScaler, to bring the numerical values to a comparable scale.