!['ml_map.png'](ml_map.png)

#### Sklearn
- Build machine learning models to make predictions or learn patterns within the data and evaluate those predictions 
- It has many in-built ML models

#### Workflow:
1. Get data ready
2. Pick a model (to suit the problem)
3. Fit the model to the data and make a prediction
4. Evaluate the model
5. Improve through experimentations
6. Save and reload your trained model

In [44]:
import numpy as np
import pandas as pd
heart_disease = pd.read_csv('heart-disease.csv')

In [45]:
heart_disease.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
186,60,1,0,130,253,0,1,144,1,1.4,2,1,3,0
101,59,1,3,178,270,0,0,145,0,4.2,0,0,3,1
131,49,0,1,134,271,0,1,162,0,0.0,1,0,2,1
129,74,0,1,120,269,0,0,121,1,0.2,2,1,2,1
106,69,1,3,160,234,1,0,131,0,0.1,1,1,2,1


In [46]:
heart_disease.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [47]:
# Create X (feature matrix)
X = heart_disease.drop('target',axis=1)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [48]:
# Create y (labels)
y = heart_disease['target']
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [49]:
# Choose the right model and hyper-parameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# Keep the default parameters here
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

##### Fit the model to the training data

In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2) # test_size=0.2 => 80% data is used for training, 20% for testing

In [51]:
clf.fit(X_train,y_train);

In [52]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
187,54,1,0,124,266,0,0,109,1,2.2,1,1,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
35,46,0,2,142,177,0,0,160,1,1.4,0,0,2
70,54,1,2,120,258,0,0,147,0,0.4,1,0,3
171,48,1,1,110,229,0,1,168,0,1.0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,58,1,0,114,318,0,2,140,0,4.4,0,3,1
131,49,0,1,134,271,0,1,162,0,0.0,1,0,2
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
216,62,0,2,130,263,0,1,97,0,1.2,1,1,3


##### Make predictions

In [53]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
220,63,0,0,150,407,0,0,154,0,4.0,1,3,3
194,60,1,2,140,185,0,0,155,0,3.0,1,0,2
30,41,0,1,105,198,0,1,168,0,0.0,2,1,2
295,63,1,0,140,187,0,0,144,1,4.0,2,2,3
294,44,1,0,120,169,0,1,144,1,2.8,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,74,0,1,120,269,0,0,121,1,0.2,2,1,2
14,58,0,3,150,283,1,0,162,0,1.0,2,0,2
149,42,1,2,130,180,0,1,150,0,0.0,2,0,2
213,61,0,0,145,307,0,0,146,1,1.0,1,0,3


In [65]:
y_preds = clf.predict(X_test) # prediction can be done on data which looks similar  shpae to train so X_test

In [55]:
y_preds

array([0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0], dtype=int64)

In [56]:
y_test

220    0
194    0
30     1
295    0
294    0
      ..
129    1
14     1
149    1
213    0
120    1
Name: target, Length: 61, dtype: int64

##### Evaluate the model

In [57]:
clf.score(X_train, y_train) # Result on training data

1.0

In [58]:
clf.score(X_test,y_test) # Result on test data

0.7868852459016393

In [59]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.78      0.75      0.76        28
           1       0.79      0.82      0.81        33

    accuracy                           0.79        61
   macro avg       0.79      0.78      0.78        61
weighted avg       0.79      0.79      0.79        61



In [60]:
confusion_matrix(y_test,y_preds)

array([[21,  7],
       [ 6, 27]], dtype=int64)

In [61]:
accuracy_score(y_test, y_preds)

0.7868852459016393

##### Improve a model

In [62]:
# Try different amount of n_estimators
np.random.seed(42)
for i in range(10,100,10):
    print(f'Trying model with {i} estimators...')
    clf = RandomForestClassifier(n_estimators=i).fit(X_train,y_train)
    print(f'Model accuracy on test data set:{clf.score(X_test,y_test)*100:2f}%')
    print("")

Trying model with 10 estimators...
Model accuracy on test data set:77.049180%

Trying model with 20 estimators...
Model accuracy on test data set:81.967213%

Trying model with 30 estimators...
Model accuracy on test data set:77.049180%

Trying model with 40 estimators...
Model accuracy on test data set:77.049180%

Trying model with 50 estimators...
Model accuracy on test data set:77.049180%

Trying model with 60 estimators...
Model accuracy on test data set:77.049180%

Trying model with 70 estimators...
Model accuracy on test data set:81.967213%

Trying model with 80 estimators...
Model accuracy on test data set:78.688525%

Trying model with 90 estimators...
Model accuracy on test data set:80.327869%



##### Save a model and load it

In [63]:
import pickle
pickle.dump(clf,open('random_forest_model1.pkl','wb'))

In [64]:
loaded_model = pickle.load(open('random_forest_model1.pkl','rb'))
loaded_model.score(X_test,y_test)

0.8032786885245902

### Estimators
- Object that <b>can learn from data</b>
- Has <b>fit</b> method
- Types of estimators : Predictors and Transformers

##### Predictors : 
- <b>Can do prediction on new data</b> after learning from data
- Has <b>predict</b> method
- Example : decision tree, logistic regression etc.
##### Transformers : 
- Can <b>apply some transformation</b> after learning from the data
- Has <b>transform/fit_transform</b> method
- Example : One hot encoder, Standard scaler, PCA etc.

Clustering algorithms are pure estimators neither predictors nor transformers

##### Predictors (types)
1. Regression predictors : Predict function's output is numerical
2. Classification predictors : Has two methods -> (a) predict (b) predict_proba -> gives output in probability 

<b>Q:</b> What if I want to use an algorithm which is not is sklearn?
<b>A:</b> Custom Estimators

In [66]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y

In [68]:
# This is how you create a machine learning algorithm as an estimator

class MostFrequentClassClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.most_frequent_ = None

    def fit(self,X,y):  # This fit method is mandatory
        # Validate input X and target vector y
        X,y = check_X_y(X,y)

        # Ensure y is 1D
        y = np.ravel(y)
    
        # Manually compute the most frequent class
        unique_classes, counts = np.unique(y,return_counts=True)
        self.most_frequent_ = unique_classes[np.argmax(counts)]
        
        return self
        
    def predict(self,X):
        if self.most_frequent_ is None:
            raise ValueError('This classifier instance is not fitted yet...')
        # Predict the most frequent class for each input sample
        return np.full(shape=(X.shape[0],),fill_value=self.most_frequent_)

In [70]:
from sklearn.datasets import load_iris

# Load data
iris = load_iris()
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [71]:
X,y = iris.data,iris.target

In [72]:
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [73]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [74]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

# Initialize and fit the custom estimator
classifier = MostFrequentClassClassifier()
classifier.fit(X_train,y_train)

# Make prediction
predictions = classifier.predict(X_test)

# Evaluate the custom estimator
print(f'Predicted class for all test instances: {predictions[0]}')

Predicted class for all test instances: 1


In [77]:
from sklearn.model_selection import cross_val_score
cross_val_score(classifier, X_train, y_train) # to use other things of sklearn, we inherited everything from BaseEstimator and ClassifierMixin in above class. It makes our estimator compatible with sklearn

array([0.34782609, 0.34782609, 0.31818182, 0.36363636, 0.36363636])

#### Mixins
- auxiliary classes
- provide additional methods and functionality to custom estimators

- <b>Scoring</b>: The <b>ClassifierMixin</b> and <b>RegressorMixin</b> provide a default implementation of the <b>score method</b> for classification and regression estimators, respectively
  
- <b>Transformation</b>: The <b>TransformerMixin</b> provides a <b>fit_transform</b> method, allowing transformation estimators to both learn and apply a transformation in a single step.
  
- <b>Fitting and Predicting</b>: The <b>ClusterMixin</b> adds the <b>fit_predict</b> method to clustering estimators, enabling them to fit to the data and then return cluster labels

#### Transformers:
- Used for transforming datasets.
- Designed to pre-process data, which include tasks such as scaling, encoding categorical variables, handling missing values, and feature extraction.
- Their main goal is to modify or create new features from the original dataset in a way that makes the data more suitable for modeling by ML algorithms.
- Contains <b>fit</b> and <b>transform(fit_transform)</b> method.
##### Disadvantages:
1. Can be applied only to the entire dataset
2. Can't cover every use case

In [78]:
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [81]:
# Generate some data
X,y=make_regression(n_samples=100, n_features=2, noise=0.1, random_state=42)

# Use the transformer directly
X_transformed = StandardScaler().fit_transform(X)
LinearRegression().fit(X_transformed,y)

#### Custom Transformers
- User defined transformers
- designed to carry out specific data transformation or pre-processing that are not available in the built-in sklearn transformers
##### Ways to create custom transformers:
1. <b>Function Approach (Function Transformers)</b> - for simpler transformers where you don't need to learn from data(stateless -> these transformers don't need to learn from data). Example: square, log, cos, exp etc.
2. <b>Class Approach (From BaseEstimator and TransformerMixin)</b> - for complex transformation where you need to learn from data

In [82]:
# StandardScaler needs to learn from data which is mean and std. deviation of data

##### Function transformer

In [83]:
def cube_transform(x):
    return np.power(x,3)

In [84]:
from sklearn.preprocessing import FunctionTransformer
# Create custom transformer
cube_transformer = FunctionTransformer(cube_transform)

In [89]:
# Generate some data
X,y=make_regression(n_samples=100, n_features=2, noise=0.1, random_state=42)

# Use the transformer directly
X_transformed = cube_transformer.transform(X)

# IMPORTANT::::
# X_transformed = cube_transform(X) : We could have done this as well na, like pass value in the function directly why to create a transformer?
# Answer: To integrate it with sklearn universe, to achieve other sklearn features

LinearRegression().fit(X_transformed,y)

##### Class Approach

In [92]:
from sklearn.base import TransformerMixin

In [98]:
# Constructor -> fit -> fit_transform -> transform 
class MedianQRScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.medians_ = None
        self.iqr_ = None
        
    def fit(self,X,y=None):
        # Calculate medians and interquartile range for each features 
        self.medians_ = np.median(X, axis=0)
        Q1 = np.percentile(X, 25, axis=0)
        Q3 = np.percentile(X, 75, axis=0)
        self.iqr_ = Q3-Q1

        # Handle case where IQR is 0 to avoid division by zero during transform
        self.iqr_[self.iqr_ == 0]=1
        return self
    
    def transform(self,X):
        # Check if fit has been called
        if self.medians_ is None or self.iqr_ is None:
            raise RuntimeError("The transformer has not been fitted yet")
        # Scale features using median and IQR learned during fit
        return(X-self.medians_)/self.iqr_

In [99]:
from sklearn.datasets import make_blobs

X,_=make_blobs(n_samples=100, n_features=2, centers=3, random_state=42)

# Initialize the transformer
scaler = MedianQRScaler()

# Fit the scaler to the data
scaler.fit(X)

# Transform the data
X_scaled = scaler.transform(X)

# Check the first few rows of the transformed data
print('Transformed data (first 5 rows):')
print(X_scaled[:5])

Transformed data (first 5 rows):
[[-0.49872679 -0.71613207]
 [ 0.78423675 -0.08192868]
 [-0.03656645  0.52987512]
 [ 0.84159877 -0.09379661]
 [-0.3814692  -0.57206564]]


#### Composite Transformers
- Built from multiple other transformers or estimators, combining their functionalities to apply a series of transformation or processing steps in a specific way
- Types:
  1. Column Transformer
  2. Pipeline
  3. Feature Union

##### Column Transformer
- allows different columns or column subsets of the input dataset to be transformed separately and the features generated by each transformer to be concatenated into a single feature space.
- particularly useful for datasets that contain various types of data requiring different preprocessing steps, such as numerical data that needs scaling and categorical data that needs to be encoded.

In [100]:
# Define the data with numeric labels for sentiment
data = {
    "Social Media Platform": ["Twitter", "Facebook", "Instagram", "Twitter", "Facebook",
                              "Instagram", "Twitter", "Facebook", "Instagram", "Twitter"],
    "Review": ["Love the new update!", "Too many ads now", "Great for sharing photos",
               "Newsfeed algorithm is biased", "Privacy concerns with latest update",
               "Amazing filters!", "Too much spam", "Easy to connect with friends",
               "Stories feature is fantastic", "Customer support lacking"],
    "age": [21, 19, np.nan, 17, 24, np.nan, 30, 19, 16, 31],
    "Sentiment": [1, 0, 1, 0, 0, 1, 0, 1, 1, 0]  # Numeric labels: 1 for Positive, 0 for Negative
}

# Create a DataFrame
df = pd.DataFrame(data)

print(df)

  Social Media Platform                               Review   age  Sentiment
0               Twitter                 Love the new update!  21.0          1
1              Facebook                     Too many ads now  19.0          0
2             Instagram             Great for sharing photos   NaN          1
3               Twitter         Newsfeed algorithm is biased  17.0          0
4              Facebook  Privacy concerns with latest update  24.0          0
5             Instagram                     Amazing filters!   NaN          1
6               Twitter                        Too much spam  30.0          0
7              Facebook         Easy to connect with friends  19.0          1
8             Instagram         Stories feature is fantastic  16.0          1
9               Twitter             Customer support lacking  31.0          0


In [102]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [104]:
column_transformer = ColumnTransformer( # Takes a list as input
    transformers=[
        ('Platform_ohe',OneHotEncoder(),['Social Media Platform']), # name_of_transformer, the_transformation_to_be_applied, Column_name
        ('review_bow',CountVectorizer(),'Review'),
        ('age_impute',SimpleImputer(),['age'])
    ],
    remainder='drop' # Drop other columns not specified in transformers
)

In [105]:
pd.DataFrame(column_transformer.fit_transform(df).toarray(), columns=column_transformer.get_feature_names_out())

Unnamed: 0,Platform_ohe__Social Media Platform_Facebook,Platform_ohe__Social Media Platform_Instagram,Platform_ohe__Social Media Platform_Twitter,review_bow__ads,review_bow__algorithm,review_bow__amazing,review_bow__biased,review_bow__concerns,review_bow__connect,review_bow__customer,...,review_bow__sharing,review_bow__spam,review_bow__stories,review_bow__support,review_bow__the,review_bow__to,review_bow__too,review_bow__update,review_bow__with,age_impute__age
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,21.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.125
3,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,24.0
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.125
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,30.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,19.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,31.0


##### Feature Union
- applies multiple transformers -> <b>parallelly</b> -> on the entire dataset

In [106]:
# Generating a random dataset with 10 rows and 4 columns
np.random.seed(42)  # For reproducibility
data = np.random.randn(10, 4)

# Creating a DataFrame and naming the columns
df = pd.DataFrame(data, columns=['f1', 'f2', 'f3', 'y'])

df

Unnamed: 0,f1,f2,f3,y
0,0.496714,-0.138264,0.647689,1.52303
1,-0.234153,-0.234137,1.579213,0.767435
2,-0.469474,0.54256,-0.463418,-0.46573
3,0.241962,-1.91328,-1.724918,-0.562288
4,-1.012831,0.314247,-0.908024,-1.412304
5,1.465649,-0.225776,0.067528,-1.424748
6,-0.544383,0.110923,-1.150994,0.375698
7,-0.600639,-0.291694,-0.601707,1.852278
8,-0.013497,-1.057711,0.822545,-1.220844
9,0.208864,-1.95967,-1.328186,0.196861


In [107]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA

feature_union = FeatureUnion(
    [
        ('scaler',StandardScaler()), # Apply StandardScaler
        ('pca',PCA(n_components=2)) # Apply PCA, reduce to 2 components
    ]
)

In [108]:
X_transformed = feature_union.fit_transform(df.drop(columns=['y']))
pd.DataFrame(X_transformed,columns=feature_union.get_feature_names_out())

Unnamed: 0,scaler__f1,scaler__f2,scaler__f3,pca__pca0,pca__pca1
0,0.815293,0.41836,0.947878,-1.025659,-0.425413
1,-0.282292,0.302777,1.873701,-1.772532,-0.358223
2,-0.635686,1.239158,-0.156427,-0.327888,1.038742
3,0.432718,-1.721587,-1.410206,1.911072,-0.68996
4,-1.451676,0.963905,-0.598312,0.193153,1.371662
5,2.270396,0.312856,0.371269,-0.51176,-0.891133
6,-0.74818,0.718778,-0.839795,0.48428,1.020731
7,-0.832663,0.233387,-0.29387,0.191723,0.583958
8,0.04908,-0.690119,1.121664,-0.726878,-0.811461
9,0.383011,-1.777515,-1.015903,1.584488,-0.838903


##### Pipeline
- applies multiple transformation -> <b>sequentially</b> -> on the entire dataset 

In [109]:
# Generating a random dataset with 10 rows and 4 columns
np.random.seed(42)  # For reproducibility
data = np.random.randn(10, 4)

# Creating a DataFrame and naming the columns
df = pd.DataFrame(data, columns=['f1', 'f2', 'f3', 'y'])

df

Unnamed: 0,f1,f2,f3,y
0,0.496714,-0.138264,0.647689,1.52303
1,-0.234153,-0.234137,1.579213,0.767435
2,-0.469474,0.54256,-0.463418,-0.46573
3,0.241962,-1.91328,-1.724918,-0.562288
4,-1.012831,0.314247,-0.908024,-1.412304
5,1.465649,-0.225776,0.067528,-1.424748
6,-0.544383,0.110923,-1.150994,0.375698
7,-0.600639,-0.291694,-0.601707,1.852278
8,-0.013497,-1.057711,0.822545,-1.220844
9,0.208864,-1.95967,-1.328186,0.196861


In [110]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Apply StandardScaler
    ('pca', PCA(n_components=2))
])

In [111]:
pd.DataFrame(pipeline.fit_transform(X), columns=pipeline.get_feature_names_out())

Unnamed: 0,pca0,pca1
0,1.967289,0.189197
1,-0.980541,1.140597
2,-0.696192,-1.069669
3,-1.049030,1.246446
4,1.562347,0.140628
...,...,...
95,-0.673056,-1.107907
96,-0.960615,-0.890179
97,-1.014953,0.817791
98,1.401202,0.194052
