In [None]:
!pip install -q pandas numpy matplotlib seaborn scikit-learn missingno category_encoders

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

In [None]:
import io
from google.colab import files

# Open Colab's upload dialog; the result maps each uploaded file name to its bytes.
uploaded = files.upload()

# Parse the first uploaded file as CSV and preview the top rows.
first_uploaded_name = list(uploaded.keys())[0]
raw_bytes = uploaded[first_uploaded_name]
df = pd.read_csv(io.BytesIO(raw_bytes))
df.head()

In [None]:
# Quick overview of the raw data: dtypes/non-null counts, summary statistics,
# and the number of missing values per column.
df.info()  # prints its report directly

# Only the last expression of a cell is rendered automatically, so the summary
# table has to be displayed explicitly or it is silently discarded.
display(df.describe(include='all'))

df.isnull().sum()

Visualize missing values

In [None]:
# Draw missingno's matrix plot for the whole DataFrame, then show the figure.
msno.matrix(df)
plt.show()

Impute missing numeric values (mean)

In [None]:
# Mean-impute missing values in every numeric column.
num_cols = df.select_dtypes(include=np.number).columns

# SimpleImputer discards columns that contain no observed value at all, which
# would make the assignment below fail with a shape mismatch. Such columns carry
# no information, so drop them before imputing.
empty_num_cols = [col for col in num_cols if df[col].isna().all()]
if empty_num_cols:
    df = df.drop(columns=empty_num_cols)
    num_cols = num_cols.drop(empty_num_cols)

imputer = SimpleImputer(strategy='mean')
if len(num_cols) > 0:  # fitting on a frame with zero columns raises
    df[num_cols] = imputer.fit_transform(df[num_cols])

Outlier removal with Isolation Forest

In [None]:
# Flag roughly 1% of the rows as outliers based on the numeric columns.
# random_state is fixed so the same rows are removed on every run.
iso = IsolationForest(contamination=0.01, random_state=42)
yhat = iso.fit_predict(df[num_cols])  # -1 marks an outlier, 1 an inlier

# Count the outliers from the predictions themselves, before filtering.
outlier_count = int(np.sum(yhat == -1))
print(f"Number of outliers detected: {outlier_count}")

# Keep only the inliers. The explicit copy lets later cells assign columns on
# `df` without pandas treating it as a slice of the unfiltered frame.
df = df[yhat != -1].copy()

Label-encode binary categorical columns, then one-hot encode the rest

In [None]:
# Integer-encode every object column that has exactly two distinct values.
cat_cols = df.select_dtypes(include='object').columns
le = LabelEncoder()

binary_cols = [name for name in cat_cols if df[name].nunique() == 2]
for name in binary_cols:
    # The same encoder instance is refitted for each column.
    df[name] = le.fit_transform(df[name])

In [None]:
# One-hot encode the remaining categorical columns; drop_first=True omits the
# first level of each encoded column.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Standardize the numeric columns (fit the scaler, then apply it to the same data).
scaler = StandardScaler()
numeric_block = df[num_cols]
df[num_cols] = scaler.fit(numeric_block).transform(numeric_block)

In [None]:
# When a 'date' column is present, parse it and derive calendar features from it.
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
    date_accessor = df['date'].dt
    # Created in this order so the new columns land in the same positions.
    for part in ('year', 'month', 'day', 'weekday'):
        df[part] = getattr(date_accessor, part)

In [None]:
# Name of the column to predict.
target = 'your_target_column_here'  # <-- change this!

# Fail early with an actionable message if the placeholder was not replaced.
if target not in df.columns:
    raise KeyError(
        f"Target column {target!r} not found in df. "
        f"Set `target` to one of: {list(df.columns)}"
    )

# Separate the features from the target.
X = df.drop(columns=target)
y = df[target]

# Hold out 20% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Automated preprocessing and model comparison (PyCaret)

In [None]:
!pip install -q pycaret

from pycaret.classification import setup, compare_models


setup(data=df, target=target, session_id=123)
best = compare_models()


Imputing Missing Values in Categorical Columns


In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values in the object (string) columns with each column's most
# frequent value.
cat_cols = df.select_dtypes(include='object').columns
cat_imputer = SimpleImputer(strategy='most_frequent')

# If the frame has already been one-hot encoded there are no object columns
# left, and fitting the imputer on a zero-column frame raises. Skip in that case.
if len(cat_cols) > 0:
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

Multicollinearity Detection (Remove Redundant Features)


In [None]:
import numpy as np

# Correlate features only:
#  * the target is excluded so it can neither be dropped itself nor cause its
#    best predictor to be dropped;
#  * only numeric/boolean columns are used, because DataFrame.corr() raises on
#    non-numeric columns in recent pandas versions.
feature_frame = (
    df.drop(columns=[target], errors='ignore')
      .select_dtypes(include=[np.number, 'bool'])
)
corr_matrix = feature_frame.corr().abs()

# Keep the strict upper triangle so each pair is inspected once and a column is
# only compared against the columns that precede it.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
high_corr_cols = [col for col in upper.columns if (upper[col] > 0.9).any()]

print("Highly correlated columns to consider dropping:", high_corr_cols)

# Optional: Drop them (rebinding instead of mutating in place)
df = df.drop(columns=high_corr_cols)

Handle Class Imbalance (with SMOTE)

In [None]:
!pip install imbalanced-learn --quiet
from imblearn.over_sampling import SMOTE

# Split features and target first (if not already done)
X = df.drop('target_column', axis=1)  # replace 'target_column'
y = df['target_column']

# Balance the classes using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Optional: confirm new class counts
pd.Series(y_resampled).value_counts()


Use Scikit-learn Pipelines (for clean, chainable preprocessing)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Separate numeric and categorical columns.
# Boolean columns are selected together with the numeric ones: pandas >= 2.0
# get_dummies() emits bool dummies, which np.number alone does not match, so
# they would otherwise fall into the ColumnTransformer remainder and be dropped.
num_cols = X.select_dtypes(include=[np.number, 'bool']).columns
cat_cols = X.select_dtypes(include='object').columns

# Numeric branch: mean-impute, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from failing on unseen categories.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Fit the preprocessing on X and transform it in one step.
X_transformed = preprocessor.fit_transform(X)


Automated Feature Selection (SelectKBest)


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the top 10 features (you can change this number), capped at the number
# of available features so small datasets do not trigger an invalid-k error.
n_features_to_keep = min(10, X_transformed.shape[1])
selector = SelectKBest(score_func=f_classif, k=n_features_to_keep)

# X_transformed was built from X, so it must be paired with y (same rows).
# y_resampled comes from SMOTE and has a different number of samples.
X_selected = selector.fit_transform(X_transformed, y)

# To see which features were selected:
selected_indices = selector.get_support(indices=True)
selected_feature_names = np.array(preprocessor.get_feature_names_out())[selected_indices]
print("Top selected features:", selected_feature_names)