In [5]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import plotly as pl 
import seaborn as sns 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [6]:
# Sample data
data = {
    'age': [25, 30, None, 40],
    'income': [50000, 60000, 70000, np.nan],
    'gender': ['M', 'F', 'M', 'F'],
    'city': ['NY', 'LA', 'NY', 'SF'],
    'review': ['Great product!', 'Not bad', 'Amazing', 'Could be better'],
    'target': [1, 0, 1, 0]
}
df = pd.DataFrame(data)
df

Unnamed: 0,age,income,gender,city,review,target
0,25.0,50000.0,M,NY,Great product!,1
1,30.0,60000.0,F,LA,Not bad,0
2,,70000.0,M,NY,Amazing,1
3,40.0,,F,SF,Could be better,0


In [7]:
# Split data into features (X) and target (y)
X = df.iloc[:, :-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [8]:
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='mean')),
    ("scalar", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

text_transformer = Pipeline(steps=[
    ("tfidf", TfidfVectorizer())
])

In [9]:
preprocessor = ColumnTransformer(
    transformers= [
        ("num", numerical_transformer, ['age', 'income']),
        ("cat", categorical_transformer, ["gender", "city"]),
        ("text", text_transformer, "review")
    ],
    remainder="drop"
)

In [10]:
# full Pipeline
model_pipeline1 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier())
])

model_pipeline1.fit(X_train, y_train)
y_pred = model_pipeline1.predict(X_test)
print(f"Model1 Output: {y_pred}")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Model1 Output: [1]
Accuracy: 0.00


In [11]:
from sklearn.linear_model import LogisticRegression
model_pipeline2 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

model_pipeline2.fit(X_train, y_train)
y_pred = model_pipeline2.predict(X_test)
print(f"Model1 Output: {y_pred}")


Model1 Output: [0.54258675]


## Example 2

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

# Sample data
data = {
    'age': [25, 30, None, 40, 50, 60, 70, 80],
    'income': [50000, 60000, 70000, None, 80000, 90000, 100000, 110000],
    'gender': ['M', 'F', 'M', 'F', 'M', 'F', 'M', 'F'],
    'city': ['NY', 'LA', 'NY', 'SF', 'NY', 'LA', 'NY', 'SF'],
    'review': ['Great product!', 'Not bad', 'Amazing', 'Could be better', 'Loved it!', 'Okay', 'Fantastic', 'Disappointing'],
    'target': [1, 0, 1, 0, 1, 0, 1, 0]  # Imbalanced target (4:4 ratio in this small example)
}
df = pd.DataFrame(data)

# Split data
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
df

Unnamed: 0,age,income,gender,city,review,target
0,25.0,50000.0,M,NY,Great product!,1
1,30.0,60000.0,F,LA,Not bad,0
2,,70000.0,M,NY,Amazing,1
3,40.0,,F,SF,Could be better,0
4,50.0,80000.0,M,NY,Loved it!,1
5,60.0,90000.0,F,LA,Okay,0
6,70.0,100000.0,M,NY,Fantastic,1
7,80.0,110000.0,F,SF,Disappointing,0


In [13]:
num_trans = Pipeline(steps=[
    ("num", SimpleImputer(strategy="mean")),
    ("scalar", StandardScaler())
])

cat_trans = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

text_trans = Pipeline(steps=[
    ("tfidf", TfidfVectorizer())
])

# Let's do preprocessing
preprocessor = ColumnTransformer(
    transformers= [
        ("num", num_trans, ["age", "income"]),
        ("cat", cat_trans, ["gender", "city"]),
        ("text", text_trans, "review")
    ], 
    remainder= "drop"
)

# Create full pipeline
model1 = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier())
])

# data1 = preprocessor.fit_transform(pd.DataFrame({
#     'age': [60],
#     'income': [90000],
#     'gender': ['F'],
#     'city': ['LA'],
#     'review': ['Okay']
# }))
# data1
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)
print(f"Model Prediction: {y_pred}")

Model Prediction: [1 0]


## Example

In [17]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
# Sample data
data = {
    'age': [25, 30, None, 40, 50, 60, 70, 80],
    'income': [50000, 60000, 70000, None, 80000, 90000, 100000, 110000],
    'gender': ['M', 'F', 'M', 'F', 'M', 'F', 'M', 'F'],
    'city': ['NY', 'LA', 'NY', 'SF', 'NY', 'LA', 'NY', 'SF'],
    'review': ['Great product!', 'Not bad', 'Amazing', 'Could be better', 'Loved it!', 'Okay', 'Fantastic', 'Disappointing'],
    'target': [1, 0, 1, 0, 1, 0, 1, 0]  # Imbalanced target (4:4 ratio in this small example)
}
df = pd.DataFrame(data)
df

Unnamed: 0,age,income,gender,city,review,target
0,25.0,50000.0,M,NY,Great product!,1
1,30.0,60000.0,F,LA,Not bad,0
2,,70000.0,M,NY,Amazing,1
3,40.0,,F,SF,Could be better,0
4,50.0,80000.0,M,NY,Loved it!,1
5,60.0,90000.0,F,LA,Okay,0
6,70.0,100000.0,M,NY,Fantastic,1
7,80.0,110000.0,F,SF,Disappointing,0


In [18]:
# Numerical Transform
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Categorical transformer
categorical_transformer = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scalar", OneHotEncoder(handle_unknown="ignore")) 
])
# Text transformer
text_transformer = Pipeline(steps=[
    ("tfidf", TfidfVectorizer())
])

# Create ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, ["age", "income"]),
        ("cat", categorical_transformer, ["gender", "city"]),
        ("text", text_transformer, "review")
],
    remainder="drop"
)

In [26]:
model_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    # ("smote", SMOTE(random_state=42)),
    ("classifier", RandomForestClassifier())
])

In [29]:
# Now siplit train and test data

# Split data
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)




# Fit the pipeline
model_pipeline.fit(X_train, y_train)

# # Make predictions
y_pred = model_pipeline.predict(X_test)

# # Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[0 2]
 [0 0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
