# ML Pipeline generation and execution on train data and testing the accuracy.
### This pipeline was generated by Google Gemini for data that calls for logistic regression.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# --- 1. Create Sample Data ---
print("--- Generating Sample Data ---")
# Draw the synthetic data from a dedicated, seeded generator so the dataset
# (and therefore the reported accuracy) is the same on every run. The split
# and the classifier already pin random_state=42; without a seed here the
# script was still non-reproducible.
rng = np.random.default_rng(42)
n_samples = 100
data = {
    'Age': rng.integers(20, 60, n_samples),            # ints in [20, 60)
    'Salary': rng.integers(30000, 120000, n_samples),  # ints in [30000, 120000)
    'City': rng.choice(['New York', 'London', 'Paris', 'Tokyo'], n_samples),
    'ExperienceLevel': rng.choice(['Junior', 'Mid', 'Senior'], n_samples),  # Ordinal feature
    'Target': rng.choice([0, 1], n_samples),           # binary label
}
df = pd.DataFrame(data)

# Separate the label column (y) from the predictor columns (X)
y = df['Target']
X = df.drop(columns='Target')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition repeatable for a given DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print("-" * 30)


# --- 2. Define Feature Types ---
# Column groups; each group receives its own transformation in the
# preprocessing step.

# Continuous columns -> standardized
numerical_features = ["Age", "Salary"]

# Unordered categories -> one-hot encoded
nominal_categorical_features = ["City"]

# Ordered categories -> ordinal encoded. The ranking is listed by hand in the
# order the encoder should number it (Junior=0, Mid=1, Senior=2).
ordinal_categorical_features = ["ExperienceLevel"]
experience_categories = ["Junior", "Mid", "Senior"]


# --- 3. Create Preprocessing Steps (ColumnTransformer) ---
# Build each encoder first, then route column groups to them.

# One-hot encoder for nominal columns: dense output, and categories not seen
# during fit are ignored instead of raising.
city_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Ordinal encoder that follows the hand-specified category order.
experience_encoder = OrdinalEncoder(categories=[experience_categories])

preprocessor = ColumnTransformer(
    transformers=[
        # (name, transformer, columns it applies to)
        ('num', StandardScaler(), numerical_features),             # 1. Standardization
        ('cat_ohe', city_encoder, nominal_categorical_features),   # 2. One-Hot Encoding (nominal)
        ('cat_ord', experience_encoder, ordinal_categorical_features),  # 3. Ordinal Encoding (ordinal)
    ],
    # Columns not named above are forwarded unchanged.
    remainder='passthrough',
)


# --- 4. Create the Full ML Pipeline ---
# The pipeline chains the preprocessor with the final estimator (model).
#
# The notebook showed a diagram at this point. Its placeholder text is kept
# as a comment because, as a bare line, it is a syntax error that prevents
# the file from being parsed at all.
# [Image of ML Pipeline diagram with steps: Preprocessing -> Model]

ml_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),                      # column-wise feature transforms
    ('classifier', LogisticRegression(random_state=42))  # final estimator
])

# Show the assembled pipeline so the configured steps can be inspected
print("--- ML Pipeline Structure ---")
print(ml_pipeline)
print("-" * 30)

# --- 5. Train the Pipeline ---
print("--- Training the Pipeline on X_train and y_train ---")
# A single fit() call drives the whole chain on the training split:
#   - the scaler and encoders are fitted on X_train (mean/std, categories, ...)
#   - X_train is transformed with those fitted parameters
#   - LogisticRegression is fitted on the transformed features
ml_pipeline.fit(X_train, y_train)
print("Training complete.", "-" * 30, sep="\n")

# --- 6. Use the Pipeline for Prediction on Test Data ---
print("--- Making Predictions on X_test ---")
# predict() re-applies the transforms learned from X_train to X_test and then
# asks the trained classifier for labels.
y_pred = ml_pipeline.predict(X_test)
print("Prediction complete.")

# --- 7. Evaluate the Model ---
# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("-" * 30, f"**Test Data Accuracy: {accuracy:.4f}**", sep="\n")

# In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# --- 1. Create Sample Data ---
print("--- Generating Sample Data ---")
# Draw the synthetic data from a dedicated, seeded generator so the dataset
# (and therefore the reported accuracy) is the same on every run. The split
# and the classifier already pin random_state=42; without a seed here the
# script was still non-reproducible.
rng = np.random.default_rng(42)
n_samples = 100
data = {
    'Age': rng.integers(20, 60, n_samples),            # ints in [20, 60)
    'Salary': rng.integers(30000, 120000, n_samples),  # ints in [30000, 120000)
    'City': rng.choice(['New York', 'London', 'Paris', 'Tokyo'], n_samples),
    'ExperienceLevel': rng.choice(['Junior', 'Mid', 'Senior'], n_samples),  # Ordinal feature
    'Target': rng.choice([0, 1], n_samples),           # binary label
}
df = pd.DataFrame(data)

# Separate the label column (y) from the predictor columns (X)
y = df['Target']
X = df.drop(columns='Target')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition repeatable for a given DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print("-" * 30)


# --- 2. Define Feature Types ---
# Column groups; each group receives its own transformation in the
# preprocessing step.

# Continuous columns -> standardized
numerical_features = ["Age", "Salary"]

# Unordered categories -> one-hot encoded
nominal_categorical_features = ["City"]

# Ordered categories -> ordinal encoded. The ranking is listed by hand in the
# order the encoder should number it (Junior=0, Mid=1, Senior=2).
ordinal_categorical_features = ["ExperienceLevel"]
experience_categories = ["Junior", "Mid", "Senior"]


# --- 3. Create Preprocessing Steps (ColumnTransformer) ---
# Build each encoder first, then route column groups to them.

# One-hot encoder for nominal columns: dense output, and categories not seen
# during fit are ignored instead of raising.
city_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Ordinal encoder that follows the hand-specified category order.
experience_encoder = OrdinalEncoder(categories=[experience_categories])

preprocessor = ColumnTransformer(
    transformers=[
        # (name, transformer, columns it applies to)
        ('num', StandardScaler(), numerical_features),             # 1. Standardization
        ('cat_ohe', city_encoder, nominal_categorical_features),   # 2. One-Hot Encoding (nominal)
        ('cat_ord', experience_encoder, ordinal_categorical_features),  # 3. Ordinal Encoding (ordinal)
    ],
    # Columns not named above are forwarded unchanged.
    remainder='passthrough',
)


# --- 4. Create the Full ML Pipeline ---
# Preprocessing and the model are chained so that one object handles both
# the feature transforms and the final estimator.

# [Image of ML Pipeline diagram with steps: Preprocessing -> Model]

pipeline_steps = [
    ('preprocessor', preprocessor),                        # column-wise feature transforms
    ('classifier', LogisticRegression(random_state=42)),   # final estimator
]
ml_pipeline = Pipeline(steps=pipeline_steps)

# Show the assembled pipeline so the configured steps can be inspected
print("--- ML Pipeline Structure ---")
print(ml_pipeline)
print("-" * 30)

# --- 5. Train the Pipeline ---
print("--- Training the Pipeline on X_train and y_train ---")
# A single fit() call drives the whole chain on the training split:
#   - the scaler and encoders are fitted on X_train (mean/std, categories, ...)
#   - X_train is transformed with those fitted parameters
#   - LogisticRegression is fitted on the transformed features
ml_pipeline.fit(X_train, y_train)
print("Training complete.", "-" * 30, sep="\n")

# --- 6. Use the Pipeline for Prediction on Test Data ---
print("--- Making Predictions on X_test ---")
# predict() re-applies the transforms learned from X_train to X_test and then
# asks the trained classifier for labels.
y_pred = ml_pipeline.predict(X_test)
print("Prediction complete.")

# --- 7. Evaluate the Model ---
# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("-" * 30, f"**Test Data Accuracy: {accuracy:.4f}**", sep="\n")

--- Generating Sample Data ---
X_train shape: (80, 4)
X_test shape: (20, 4)
------------------------------
--- ML Pipeline Structure ---
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  ['Age', 'Salary']),
                                                 ('cat_ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse_output=False),
                                                  ['City']),
                                                 ('cat_ord',
                                                  OrdinalEncoder(categories=[['Junior',
                                                                              'Mid',
                                                                              'Senior']]),
   