In [115]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [169]:
# Small hand-written frame with deliberate gaps: two missing ages and one
# missing salary, next to two fully populated categorical columns.
data = pd.DataFrame(
    dict(
        age=[25, np.nan, 30, 35, np.nan],
        salary=[50000, 60000, 70000, np.nan, 80000],
        city=['New York', 'Los Angeles', 'New York', 'San Francisco', 'Los Angeles'],
        gender=['Male', 'Female', 'Male', 'Female', 'Male'],
    )
)
print(data)


    age   salary           city  gender
0  25.0  50000.0       New York    Male
1   NaN  60000.0    Los Angeles  Female
2  30.0  70000.0       New York    Male
3  35.0      NaN  San Francisco  Female
4   NaN  80000.0    Los Angeles    Male


In [195]:
# Missing-value count per column (isna is the same check as isnull).
data.isna().sum()

age       2
salary    1
city      0
gender    0
dtype: int64

In [197]:
# Grand total of missing cells across the whole frame.
data.isna().sum().sum()

3

In [199]:
# Per-column dtypes: age/salary are float64, city/gender are object in the
# recorded output, which is what the numeric/categorical split below relies on.
data.dtypes

age       float64
salary    float64
city       object
gender     object
dtype: object

In [201]:
# (rows, columns) of the frame.
data.shape

(5, 4)

In [203]:
# Summary statistics; the recorded output covers only the numeric columns, and
# `count` excludes NaN (3 ages, 4 salaries).
data.describe()

Unnamed: 0,age,salary
count,3.0,4.0
mean,30.0,65000.0
std,5.0,12909.944487
min,25.0,50000.0
25%,27.5,57500.0
50%,30.0,65000.0
75%,32.5,72500.0
max,35.0,80000.0


In [211]:
# Header and preview emitted by a single print call, one item per line.
print("Show The Data:", data.head(), sep="\n")

Show The Data:
    age   salary           city  gender
0  25.0  50000.0       New York    Male
1   NaN  60000.0    Los Angeles  Female
2  30.0  70000.0       New York    Male
3  35.0      NaN  San Francisco  Female
4   NaN  80000.0    Los Angeles    Male


In [209]:
# Column groups that the ColumnTransformer routes to separate branches.
categorical_features = ['city', 'gender']  # imputed, then one-hot encoded
numerical_features = ['age', 'salary']     # imputed, then scaled

In [173]:
# Numeric branch: fill NaN with the column mean, then standardise the result.
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [175]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform from raising on categories
# that were not present when the encoder was fitted.
categorical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [177]:
# Route each column group through its own sub-pipeline and concatenate the
# results column-wise ('num' columns first, then 'cat').
# sparse_threshold=0 forces a dense ndarray: OneHotEncoder emits sparse output
# by default, and once the one-hot block grows the combined matrix can fall
# under the default 0.3 density threshold and come back as a sparse matrix,
# which the PCA step and the array print further down are not written for.
# For the frame used in this notebook the result is unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    sparse_threshold=0,
)

In [179]:
# Learn the imputation/scaling/encoding parameters from `data` and apply them
# in the same call. In the recorded output the first two columns are the
# scaled numerics and the remaining five are one-hot indicators.
processed_data = preprocessor.fit_transform(X=data)
print(f"Processed Data:\n {processed_data}")

Processed Data:
 [[-1.58113883 -1.5         0.          1.          0.          0.
   1.        ]
 [ 0.         -0.5         1.          0.          0.          1.
   0.        ]
 [ 0.          0.5         0.          1.          0.          0.
   1.        ]
 [ 1.58113883  0.          0.          0.          1.          1.
   0.        ]
 [ 0.          1.5         1.          0.          0.          0.
   1.        ]]


In [181]:
# Project the preprocessed matrix onto two principal components.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(X=processed_data)
print(f"Reduced Data:\n {reduced_data}")

Reduced Data:
 [[ 2.33414772 -0.36986247]
 [-0.11631348 -0.70805625]
 [ 0.14801908  0.69740118]
 [-1.60103067 -1.15851186]
 [-0.76482265  1.5390294 ]]


In [183]:
# NOTE(review): this cell repeats the PCA step from the previous cell on the
# same `processed_data`; it only rebinds `pca` and `reduced_data`, and the
# recorded output is identical. Candidate for deletion.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(processed_data)
print("Reduced Data:\n", reduced_data)

Reduced Data:
 [[ 2.33414772 -0.36986247]
 [-0.11631348 -0.70805625]
 [ 0.14801908  0.69740118]
 [-1.60103067 -1.15851186]
 [-0.76482265  1.5390294 ]]


In [185]:
# End-to-end model: preprocessing -> 2-component PCA -> random forest.
# random_state pins the forest's bootstrap sampling and per-split feature
# selection so that Restart & Run All reproduces the same fitted model;
# without it every run trains a different forest.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=2)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [187]:
# Binary labels, one per row of `data` (5 rows, 5 labels).
# NOTE(review): these labels coincide with gender == 'Female' in `data`, so the
# label is visible to the model through the one-hot gender columns — confirm
# this is intended for the demo.
target = np.array([0, 1, 0, 1, 0])

In [189]:
# Fit every stage in sequence on the raw frame: the preprocessor and PCA are
# fitted and applied, and the classifier is trained on the reduced features.
pipeline.fit(X=data, y=target)
print("Pipeline trained.")

Pipeline trained.
