# Data Preprocessing and feature engineering

In [1]:
import pandas as pd
url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
data = pd.read_csv(url)
print(data.columns)
print(data.head())


Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'Siblings/Spouses Aboard',
       'Parents/Children Aboard', 'Fare'],
      dtype='object')
   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0            

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Step 1: Handle missing values

In [3]:
#Separate numerical and categorical columns
numerical_cols = data.select_dtypes(include=['number']).columns.tolist()
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()

In [4]:
if 'Survived' in numerical_cols:
    numerical_cols.remove('Survived')
if 'Survived' in categorical_cols:
    categorical_cols.remove('Survived')


In [5]:
numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

In [6]:
data[numerical_cols] = numerical_imputer.fit_transform(data[numerical_cols])
data[categorical_cols] = categorical_imputer.fit_transform(data[categorical_cols])

Step 2: Encoding categorical variables

In [7]:
onehot = OneHotEncoder(sparse=False, drop='first')  # drop='first' to avoid multicollinearity

In [8]:
# Create a ColumnTransformer to apply transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Scale numerical columns
        ('cat', onehot, categorical_cols)  # One-hot encode categorical columns
    ]
)

In [9]:
X = data.drop('Survived', axis=1)  # 'Survived' is the target variable in the Titanic dataset
y = data['Survived']

In [10]:
X_preprocessed = preprocessor.fit_transform(X)



Step 3: Feature Engineering

In [11]:
X['NewFeature'] = data['Age'] + data['Fare']
X['NewFeature'] = StandardScaler().fit_transform(X[['NewFeature']])

In [12]:
X_preprocessed = np.hstack([X_preprocessed, X[['NewFeature']].values])

Step 4: Split the data into training and testing sets

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

In [14]:
# Now, X_train and X_test are ready to be used for training and testing your machine learning model

print("Preprocessing and feature engineering complete.")

Preprocessing and feature engineering complete.


In [15]:
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (709, 893)
Shape of X_test: (178, 893)
Shape of y_train: (709,)
Shape of y_test: (178,)
