In [1]:
!pip install kaggle

import kagglehub

# Download latest version
path = kagglehub.dataset_download("swaptr/layoffs-2022")

print("Path to dataset files:", path)


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable


  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/swaptr/layoffs-2022?dataset_version_number=385...


100%|██████████| 213k/213k [00:00<00:00, 747kB/s]

Extracting files...
Path to dataset files: C:\Users\user\.cache\kagglehub\datasets\swaptr\layoffs-2022\versions\385





In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import os

file_name = 'layoffs.csv'
file_path = os.path.join(path, file_name)

# Check if the file exists before attempting to read it
if os.path.exists(file_path):
    df = pd.read_csv(file_path)
    print("DataFrame created successfully:")
    print(df.head()) # Print the first few rows to verify
else:
    print(f"File not found at: {file_path}")
    print("Please check the contents of the downloaded directory:")
    !ls {path}

# Show shape and first rows
print("Shape:", df.shape)
df.head()

DataFrame created successfully:
            company       location  total_laid_off       date  \
0         ConsenSys  New York City            47.0  7/22/2025   
1              Zeen    SF Bay Area             NaN  7/21/2025   
2  Rocket Companies        Detroit             NaN  7/18/2025   
3            Amazon        Seattle             NaN  7/17/2025   
4           Amicole  New York City             NaN  7/17/2025   

  percentage_laid_off     industry  \
0                  7%       Crypto   
1                100%     Consumer   
2                  2%  Real Estate   
3                 NaN       Retail   
4                100%       Retail   

                                              source     stage funds_raised  \
0  https://www.bloomberg.com/news/articles/2025-0...  Series D         $726   
1  https://www.businessinsider.com/social-media-c...   Unknown           $9   
2  https://www.housingwire.com/articles/rocket-la...  Post-IPO        $5200   
3  https://www.reuters.com/busin

Unnamed: 0,company,location,total_laid_off,date,percentage_laid_off,industry,source,stage,funds_raised,country,date_added
0,ConsenSys,New York City,47.0,7/22/2025,7%,Crypto,https://www.bloomberg.com/news/articles/2025-0...,Series D,$726,United States,7/22/2025
1,Zeen,SF Bay Area,,7/21/2025,100%,Consumer,https://www.businessinsider.com/social-media-c...,Unknown,$9,United States,7/22/2025
2,Rocket Companies,Detroit,,7/18/2025,2%,Real Estate,https://www.housingwire.com/articles/rocket-la...,Post-IPO,$5200,United States,7/22/2025
3,Amazon,Seattle,,7/17/2025,,Retail,https://www.reuters.com/business/retail-consum...,Post-IPO,$8100,United States,7/18/2025
4,Amicole,New York City,,7/17/2025,100%,Retail,https://techcrunch.com/2025/07/17/after-raisin...,Seed,$5,United States,7/18/2025


In [3]:
# models/pipeline.py

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# ------------------------------
# Custom Transformer for funds_raised
# ------------------------------

class CleanFundsTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer to clean funds_raised column:
    "$234" --> 234.0
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        return X.apply(self._clean_value)

    def _clean_value(self, val):
        if pd.isna(val):
            return np.nan
        return float(str(val).replace("$", "").replace(",", ""))

# ------------------------------
# Function to build the pipeline
# ------------------------------

def build_preprocessing_pipeline():

    # -------------------------
    # Define columns
    # -------------------------

    numeric_features = [
        "total_laid_off",
        "perc_laid_off",
        "funds_raised"
    ]

    categorical_features = [
        "industry",
        "country",
        "stage"
    ]

    # -------------------------
    # Define numeric pipeline
    # -------------------------

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    # -------------------------
    # Define categorical pipeline
    # -------------------------

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False))
    ])

    # -------------------------
    # Define transformers for specific columns
    # -------------------------

    # For funds_raised → custom cleaning + numeric pipeline
    funds_pipeline = Pipeline([
        ("cleaner", CleanFundsTransformer()),
        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
        ("scaler", StandardScaler())
    ])

    # -------------------------
    # Combine everything
    # -------------------------

    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_pipeline, ["total_laid_off", "perc_laid_off"]),
        ("funds", funds_pipeline, ["funds_raised"]),
        ("cat", categorical_pipeline, categorical_features)
    ])

    return preprocessor


In [4]:
# models/pipeline.py

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# ------------------------------
# Custom Transformer for percentage_laid_off
# ------------------------------

class CleanPercentageTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer to clean percentage_laid_off column:
    "25%" --> 0.25
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        return X.applymap(self._clean_value)

    def _clean_value(self, val):
        if pd.isna(val):
            return np.nan
        try:
            return float(str(val).replace("%", "")) / 100
        except ValueError:
            return np.nan


# ------------------------------
# Custom Transformer for funds_raised
# ------------------------------

class CleanFundsTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer to clean funds_raised column:
    "$234" --> 234.0
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        return X.applymap(self._clean_value)

    def _clean_value(self, val):
        if pd.isna(val):
            return np.nan
        return float(str(val).replace("$", "").replace(",", ""))

# ------------------------------
# Function to build the pipeline
# ------------------------------

def build_preprocessing_pipeline():

    # -------------------------
    # Define columns
    # -------------------------

    numeric_features = [
        "total_laid_off",
        "percentage_laid_off",
        "funds_raised"
    ]

    categorical_features = [
        "industry",
        "country",
        "stage"
    ]

    # -------------------------
    # Define numeric pipeline
    # -------------------------

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    # -------------------------
    # Define categorical pipeline
    # -------------------------

    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    # -------------------------
    # Define transformers for specific columns
    # -------------------------

    # For percentage_laid_off → custom cleaning + numeric pipeline
    percentage_pipeline = Pipeline([
        ("cleaner", CleanPercentageTransformer()),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    # For funds_raised → custom cleaning + numeric pipeline
    funds_pipeline = Pipeline([
        ("cleaner", CleanFundsTransformer()),
        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
        ("scaler", StandardScaler())
    ])

    # -------------------------
    # Combine everything
    # -------------------------

    preprocessor = ColumnTransformer(transformers=[
        ("total_laid_off", numeric_pipeline, ["total_laid_off"]),
        ("percentage_laid_off", percentage_pipeline, ["percentage_laid_off"]),
        ("funds", funds_pipeline, ["funds_raised"]),
        ("cat", categorical_pipeline, categorical_features)
    ])

    return preprocessor

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Create layoff_severity column
median_laid_off = df['total_laid_off'].median()
df['layoff_severity'] = (df['total_laid_off'] > median_laid_off).astype(int)

# Assuming you already added the layoff_severity column from earlier steps
y = df["layoff_severity"]
X = df[[
    "total_laid_off",
    "percentage_laid_off",
    "funds_raised",
    "industry",
    "country",
    "stage"
]]

# Build preprocessing
preprocessor = build_preprocessing_pipeline()

# Define full pipeline
clf_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", RandomForestClassifier())
])

# Train
clf_pipeline.fit(X, y)

  return X.applymap(self._clean_value)
  return X.applymap(self._clean_value)


In [8]:
import joblib
joblib.dump(clf_pipeline, "../api/trained_model/layoff_pipeline.joblib")

['../api/trained_model/layoff_pipeline.joblib']