In [None]:
!pip install kaggle

import kagglehub

# Download latest version
path = kagglehub.dataset_download("swaptr/layoffs-2022")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/layoffs-2022


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import os

file_name = 'layoffs.csv'
file_path = os.path.join(path, file_name)

# Fail loudly when the CSV is missing: merely printing a message would let the
# cell continue and crash later with a NameError on the undefined `df`.
if not os.path.exists(file_path):
    available_files = sorted(os.listdir(path)) if os.path.isdir(path) else []
    raise FileNotFoundError(
        f"File not found at: {file_path}. "
        f"Contents of the downloaded directory: {available_files}"
    )

df = pd.read_csv(file_path)
print("DataFrame created successfully:")

# Show shape and first rows (the bare last expression gives the rich table display)
print("Shape:", df.shape)
df.head()

DataFrame created successfully:
              company            location  total_laid_off       date  \
0  Indeed + Glassdoor              Austin          1300.0  7/10/2025   
1          Eigen Labs             Seattle            29.0   7/8/2025   
2            Subtl AI  Hyderabad,Non-U.S.             NaN   7/3/2025   
3           Microsoft             Seattle          9000.0   7/2/2025   
4                Okra      Lagos,Non-U.S.             NaN   7/2/2025   

  percentage_laid_off industry  \
0                 NaN       HR   
1                 25%   Crypto   
2                100%       AI   
3                  4%    Other   
4                100%  Finance   

                                              source     stage funds_raised  \
0  https://www.bloomberg.com/news/articles/2025-0...  Acquired           $5   
1  https://blockworks.co/news/eigen-layoffs-25-ei...   Unknown         $234   
2  https://inc42.com/buzz/genai-startup-subtl-ai-...      Seed          NaN   
3  https://www

Unnamed: 0,company,location,total_laid_off,date,percentage_laid_off,industry,source,stage,funds_raised,country,date_added
0,Indeed + Glassdoor,Austin,1300.0,7/10/2025,,HR,https://www.bloomberg.com/news/articles/2025-0...,Acquired,$5,United States,7/10/2025
1,Eigen Labs,Seattle,29.0,7/8/2025,25%,Crypto,https://blockworks.co/news/eigen-layoffs-25-ei...,Unknown,$234,United States,7/11/2025
2,Subtl AI,"Hyderabad,Non-U.S.",,7/3/2025,100%,AI,https://inc42.com/buzz/genai-startup-subtl-ai-...,Seed,,India,7/7/2025
3,Microsoft,Seattle,9000.0,7/2/2025,4%,Other,https://www.cnbc.com/2025/07/02/microsoft-layi...,Post-IPO,$1,United States,7/2/2025
4,Okra,"Lagos,Non-U.S.",,7/2/2025,100%,Finance,https://techpoint.africa/news/okra-cofounder-f...,Series A,$12,Nigeria,7/7/2025


In [None]:
# models/pipeline.py

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# ------------------------------
# Custom Transformer for funds_raised
# ------------------------------

class CleanFundsTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer to clean funds_raised column:
    "$234" --> 234.0

    Every cell has "$" and "," removed and is converted to ``float``;
    missing cells stay ``NaN``. The transformer is stateless.
    """
    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the object can be used in a Pipeline."""
        return self

    def transform(self, X):
        """
        Return a cleaned copy of ``X`` without modifying the input.

        ``X`` may be a Series (a Series is returned), or a DataFrame / array
        (a DataFrame is returned).

        Raises
        ------
        ValueError
            If a non-missing cell is not a valid number once "$" and ","
            have been removed.
        """
        if isinstance(X, pd.Series):
            return X.map(self._clean_value)
        # DataFrame.apply would hand _clean_value a whole column, and
        # `if pd.isna(column)` raises; map each column cell by cell instead.
        return pd.DataFrame(X).apply(lambda column: column.map(self._clean_value))

    def _clean_value(self, val):
        """Convert one cell to ``float``; missing values become ``NaN``."""
        if pd.isna(val):
            return np.nan
        return float(str(val).replace("$", "").replace(",", ""))

# ------------------------------
# Function to build the pipeline
# ------------------------------

def _percent_to_fraction(X):
    """Turn percent strings ("25%") into fractions (0.25); unparseable or missing cells become NaN."""
    frame = pd.DataFrame(X)
    return frame.apply(
        lambda column: pd.to_numeric(
            column.astype(str).str.replace("%", "", regex=False),
            errors="coerce",
        ) / 100
    )


def build_preprocessing_pipeline():
    """
    Build the unfitted ``ColumnTransformer`` used to preprocess the layoffs data.

    Expected input columns: ``total_laid_off``, ``percentage_laid_off``,
    ``funds_raised``, ``industry``, ``country`` and ``stage``.

    Output column order: scaled ``total_laid_off``, scaled
    ``percentage_laid_off`` (as a fraction), scaled ``funds_raised``, then the
    one-hot encoded categorical columns. The result is always a dense array.

    Returns
    -------
    ColumnTransformer
        The unfitted preprocessor.
    """
    # Local import: FunctionTransformer is not part of this cell's import list.
    from sklearn.preprocessing import FunctionTransformer

    # -------------------------
    # Define columns
    # -------------------------

    categorical_features = [
        "industry",
        "country",
        "stage"
    ]

    # -------------------------
    # Define numeric pipeline
    # -------------------------

    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    # percentage_laid_off holds strings such as "25%", so it must be parsed
    # to numbers before a median can be computed.
    percentage_pipeline = Pipeline([
        ("cleaner", FunctionTransformer(_percent_to_fraction)),
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    # -------------------------
    # Define categorical pipeline
    # -------------------------

    # The encoder keeps its default output format; dense output is enforced
    # on the ColumnTransformer below, which avoids the `sparse=` keyword that
    # newer scikit-learn releases no longer accept.
    categorical_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    # -------------------------
    # Define transformers for specific columns
    # -------------------------

    # For funds_raised → custom cleaning + numeric pipeline
    funds_pipeline = Pipeline([
        ("cleaner", CleanFundsTransformer()),
        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
        ("scaler", StandardScaler())
    ])

    # -------------------------
    # Combine everything
    # -------------------------

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, ["total_laid_off"]),
            # The dataset column is "percentage_laid_off"; "perc_laid_off" does not exist.
            ("perc", percentage_pipeline, ["percentage_laid_off"]),
            ("funds", funds_pipeline, ["funds_raised"]),
            ("cat", categorical_pipeline, categorical_features)
        ],
        sparse_threshold=0,  # always return a dense array
    )

    return preprocessor


In [None]:
# models/pipeline.py

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# ------------------------------
# Custom Transformer for percentage_laid_off
# ------------------------------

class CleanPercentageTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer to clean percentage_laid_off column:
    "25%" --> 0.25

    Every cell has "%" removed, is converted to ``float`` and divided by 100.
    Missing or unparseable cells become ``NaN``. The transformer is stateless.
    """
    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the object can be used in a Pipeline."""
        return self

    def transform(self, X):
        """Return a new DataFrame with every cell of ``X`` converted to a fraction."""
        # Map column by column instead of calling the deprecated
        # DataFrame.applymap; wrapping in DataFrame also accepts array input.
        return pd.DataFrame(X).apply(lambda column: column.map(self._clean_value))

    def _clean_value(self, val):
        """Convert one cell to a fraction; missing or malformed values become ``NaN``."""
        if pd.isna(val):
            return np.nan
        try:
            return float(str(val).replace("%", "")) / 100
        except ValueError:
            return np.nan


# ------------------------------
# Custom Transformer for funds_raised
# ------------------------------

class CleanFundsTransformer(BaseEstimator, TransformerMixin):
    """
    Transformer to clean funds_raised column:
    "$234" --> 234.0

    Every cell has "$" and "," removed and is converted to ``float``;
    missing cells stay ``NaN``. The transformer is stateless.
    """
    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the object can be used in a Pipeline."""
        return self

    def transform(self, X):
        """
        Return a new DataFrame with every cell of ``X`` converted to float.

        Raises
        ------
        ValueError
            If a non-missing cell is not a valid number once "$" and ","
            have been removed.
        """
        # Map column by column instead of calling the deprecated
        # DataFrame.applymap; wrapping in DataFrame also accepts array input.
        return pd.DataFrame(X).apply(lambda column: column.map(self._clean_value))

    def _clean_value(self, val):
        """Convert one cell to ``float``; missing values become ``NaN``."""
        if pd.isna(val):
            return np.nan
        return float(str(val).replace("$", "").replace(",", ""))

# ------------------------------
# Function to build the pipeline
# ------------------------------

def build_preprocessing_pipeline():
    """
    Assemble the unfitted ``ColumnTransformer`` for the layoffs features.

    Four branches are applied, in this order:

    * ``total_laid_off``      - median imputation, then standard scaling
    * ``percentage_laid_off`` - ``CleanPercentageTransformer``, median
      imputation, standard scaling
    * ``funds_raised``        - ``CleanFundsTransformer``, constant (0.0)
      imputation, standard scaling
    * ``industry`` / ``country`` / ``stage`` - constant ("Unknown")
      imputation, then one-hot encoding that ignores unseen categories

    Returns
    -------
    ColumnTransformer
        The unfitted preprocessor.
    """

    def scaled(*leading_steps):
        # Every numeric branch ends with the same standard-scaling step.
        return Pipeline([*leading_steps, ("scaler", StandardScaler())])

    # Categorical branch: fill gaps with a sentinel label, then one-hot encode.
    cat_columns = ["industry", "country", "stage"]
    cat_branch = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    # Plain numeric branch (column is already numeric).
    count_branch = scaled(
        ("imputer", SimpleImputer(strategy="median")),
    )

    # "25%"-style strings are parsed before imputing and scaling.
    pct_branch = scaled(
        ("cleaner", CleanPercentageTransformer()),
        ("imputer", SimpleImputer(strategy="median")),
    )

    # "$234"-style strings are parsed; missing funding is filled with 0.0.
    funds_branch = scaled(
        ("cleaner", CleanFundsTransformer()),
        ("imputer", SimpleImputer(strategy="constant", fill_value=0.0)),
    )

    branches = [
        ("total_laid_off", count_branch, ["total_laid_off"]),
        ("percentage_laid_off", pct_branch, ["percentage_laid_off"]),
        ("funds", funds_branch, ["funds_raised"]),
        ("cat", cat_branch, cat_columns),
    ]
    return ColumnTransformer(transformers=branches)

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Create layoff_severity column: 1 when a row's layoff count is above the
# dataset median, 0 otherwise.
median_laid_off = df['total_laid_off'].median()
df['layoff_severity'] = (df['total_laid_off'] > median_laid_off).astype(int)

# `NaN > median` evaluates to False, so rows with no reported layoff count get
# label 0 above even though their severity is unknown. Train only on rows
# whose label is actually known.
labeled_df = df[df['total_laid_off'].notna()]

# NOTE(review): the label is derived from total_laid_off, and total_laid_off
# is also passed as a feature below, so the classifier can recover the label
# from that single column (target leakage). Decide whether it should stay in X
# before trusting any score from this model.
y = labeled_df["layoff_severity"]
X = labeled_df[[
    "total_laid_off",
    "percentage_laid_off",
    "funds_raised",
    "industry",
    "country",
    "stage"
]]

# Build preprocessing
preprocessor = build_preprocessing_pipeline()

# Define full pipeline; a fixed random_state makes re-runs reproducible
clf_pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Train
clf_pipeline.fit(X, y)

  return X.applymap(self._clean_value)
  return X.applymap(self._clean_value)


In [None]:
import os

import joblib

# The target directory is not created automatically; on a fresh checkout the
# dump would fail with FileNotFoundError without this.
os.makedirs("models", exist_ok=True)
joblib.dump(clf_pipeline, "models/layoff_pipeline.joblib")