In [1]:
import pandas as pd
# Load the raw layoffs export into `df` and preview its first rows.
raw_csv_path = "Layoffs_2022-2026.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Company,Industry,Laid_Off_Count,Date,Country
0,Hailo,Semiconductor Manufacturing,30.0,2026-01-07,"Tel Aviv-Yafo, IL"
1,TailwindCSS,Software Development,3.0,2026-01-07,"Oklahoma City, US"
2,Ubisoft,Computer Games,,2026-01-06,France
3,Cloudhead Games,Computer games,,2026-01-06,"Qualicum Beach, CA"
4,Sapiens,Finance,700.0,2025-12-28,Israel


In [2]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Laid_Off_Count
count,2779.0
mean,288.526808
std,1002.921771
min,3.0
25%,40.0
50%,87.0
75%,200.0
max,22000.0


In [3]:
# Column dtypes and non-null counts; used here to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4251 entries, 0 to 4250
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Company         4251 non-null   object 
 1   Industry        4249 non-null   object 
 2   Laid_Off_Count  2779 non-null   float64
 3   Date            4251 non-null   object 
 4   Country         4249 non-null   object 
dtypes: float64(1), object(4)
memory usage: 166.2+ KB


In [4]:
# Boolean mask flagging rows that repeat an earlier row across all columns.
# NOTE(review): the truncated mask display cannot show whether any duplicate
# exists; `df.duplicated().sum()` would report the count directly - consider switching.
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
4246    False
4247    False
4248    False
4249    False
4250    False
Length: 4251, dtype: bool

In [5]:
# Number of distinct values in each column.
df.nunique()

Company           2860
Industry            34
Laid_Off_Count     370
Date              1149
Country             69
dtype: int64

In [6]:
# (row count, column count) of the loaded frame.
df.shape

(4251, 5)

In [7]:
# Count the missing entries in every column before any cleaning.
missing_per_column = df.isna().sum()
print(missing_per_column)

Company              0
Industry             2
Laid_Off_Count    1472
Date                 0
Country              2
dtype: int64


# Handling Null Values

In [8]:
# Rows with no recorded country or industry are bucketed under 'Other'.
for column_name in ('Country', 'Industry'):
    df[column_name] = df[column_name].fillna('Other')


In [9]:
# Re-count missing entries per column now that Country and Industry are filled.
remaining_missing = df.isna().sum()
print(remaining_missing)

Company              0
Industry             0
Laid_Off_Count    1472
Date                 0
Country              0
dtype: int64


In [10]:
# Missing layoff counts become 0 so the column can be stored as integers.
# NOTE(review): after this step an undisclosed count is indistinguishable from
# a reported zero - confirm downstream analysis is fine with that.

laid_off_filled = df['Laid_Off_Count'].fillna(0)
df['Laid_Off_Count'] = laid_off_filled.astype(int)

# Checking industry and country names for inconsistent or near-duplicate values

In [11]:
# List the distinct industry labels to look for spelling/case variants.
industry_labels = df['Industry'].unique()
print(industry_labels)

['Semiconductor Manufacturing' 'Software Development' 'Computer Games'
 'Computer games' 'Finance' 'Support' 'Marketing' 'Retail' 'Education'
 'Manufacturing' 'Food' 'Other' 'Sales' 'Consumer' 'Hardware'
 'Transportation' 'Energy' 'Security' 'Healthcare' 'HR' 'Travel'
 'Logistics' 'Legal' 'Recruiting' 'AI' 'Data' 'Infrastructure' 'Media'
 'Fitness' 'Crypto' 'Real Estate' 'Product' 'Aerospace' 'Construction']


In [12]:
# Normalise industry labels: force to string, trim whitespace, then Title Case.
# Title-casing also turns acronyms into 'Ai' / 'Hr'; those labels are handled
# by the alignment mapping later in the notebook.
industry_as_text = df['Industry'].astype(str)
industry_trimmed = industry_as_text.str.strip()
df['Industry'] = industry_trimmed.str.title()

In [13]:
# Show the distinct industry labels again to check the case normalisation.
normalised_labels = df['Industry'].unique()
print(normalised_labels)

['Semiconductor Manufacturing' 'Software Development' 'Computer Games'
 'Finance' 'Support' 'Marketing' 'Retail' 'Education' 'Manufacturing'
 'Food' 'Other' 'Sales' 'Consumer' 'Hardware' 'Transportation' 'Energy'
 'Security' 'Healthcare' 'Hr' 'Travel' 'Logistics' 'Legal' 'Recruiting'
 'Ai' 'Data' 'Infrastructure' 'Media' 'Fitness' 'Crypto' 'Real Estate'
 'Product' 'Aerospace' 'Construction']


In [15]:
# Broader reporting categories and the title-cased industry labels folded
# into each one. Labels that appear in no list are left untouched when the
# mapping is applied.
_industry_groups = {
    'AI': ['Ai'],
    'HR & Talent Acquisition': ['Hr', 'Recruiting'],
    'Finance': ['Fintech'],
    'Gaming': ['Computer Games'],
    'IT Infrastructure': ['Software Development', 'Infrastructure', 'Support'],
    'Manufacturing': ['Semiconductor Manufacturing'],
    'Marketing and Sales': ['Sales', 'Marketing'],
    'Real Estate & Property tech': ['Real Estate', 'Construction'],
    'Logistics & Transportation': ['Logistics', 'Transportation'],
}

# Flatten the groups into the {raw label: category} lookup used for replacement.
categorical_alignment = {
    raw_label: category
    for category, raw_labels in _industry_groups.items()
    for raw_label in raw_labels
}

In [16]:

# Collapse each industry label into its aligned category; labels without a
# mapping entry are kept as they are.
aligned_industry = df['Industry'].replace(categorical_alignment)
df['Industry'] = aligned_industry

In [17]:
# Coerce country values to strings and trim surrounding whitespace.
country_as_text = df['Country'].astype(str)
df['Country'] = country_as_text.str.strip()

In [20]:
# Rewrites "City, country-code" entries as plain country names.
# NOTE(review): only three values are covered here while Country has 69
# distinct values - verify that no other "City, XX" entries remain.
fix_country_names = dict([
    ('Tel Aviv-Yafo, IL', 'Israel'),
    ('Oklahoma City, US', 'United States'),
    ('Qualicum Beach, CA', 'Canada'),
])

# Apply the country-name corrections; values not in the mapping are unchanged.
corrected_country = df['Country'].replace(fix_country_names)
df['Country'] = corrected_country

In [22]:
# Preview the first rows after the industry and country clean-up.
df.head()

Unnamed: 0,Company,Industry,Laid_Off_Count,Date,Country
0,Hailo,Manufacturing,30,2026-01-07,Israel
1,TailwindCSS,IT Infrastructure,3,2026-01-07,United States
2,Ubisoft,Gaming,0,2026-01-06,France
3,Cloudhead Games,Gaming,0,2026-01-06,Canada
4,Sapiens,Finance,700,2025-12-28,Israel


# Feature Engineering

In [23]:
# 1. Calculate Total Layoffs per Industry
total_layoffs = df.groupby('Industry')['Laid_Off_Count'].sum().reset_index()

# 2. Define Thresholds (33rd and 66th percentiles of the per-industry totals)
low_limit = total_layoffs['Laid_Off_Count'].quantile(0.33)
high_limit = total_layoffs['Laid_Off_Count'].quantile(0.66)

# 3. Apply Risk Logic
def get_risk(volume):
    """Return the risk band for an industry's total layoff volume.

    Reads the notebook-level thresholds ``low_limit`` and ``high_limit``.

    Parameters
    ----------
    volume : number
        Total laid-off count for one industry.

    Returns
    -------
    str
        'Low' when ``volume <= low_limit``, 'Medium' when
        ``volume <= high_limit``, otherwise 'High'.
    """
    if volume <= low_limit:
        return 'Low'
    if volume <= high_limit:
        return 'Medium'
    return 'High'

total_layoffs['Risk_Level'] = total_layoffs['Laid_Off_Count'].apply(get_risk)

# 4. Map back to main data
# Assign through a per-industry lookup instead of merging: a merge run a second
# time would find Risk_Level already present and emit Risk_Level_x /
# Risk_Level_y, whereas this assignment simply overwrites the column, so the
# cell can be re-run safely. Industry is unique in total_layoffs (it is the
# groupby key), so the lookup is unambiguous.
risk_by_industry = total_layoffs.set_index('Industry')['Risk_Level']
df['Risk_Level'] = df['Industry'].map(risk_by_industry)

In [24]:
# Preview the frame with the newly added Risk_Level column.
df.head()

Unnamed: 0,Company,Industry,Laid_Off_Count,Date,Country,Risk_Level
0,Hailo,Manufacturing,30,2026-01-07,Israel,Low
1,TailwindCSS,IT Infrastructure,3,2026-01-07,United States,Medium
2,Ubisoft,Gaming,0,2026-01-06,France,Low
3,Cloudhead Games,Gaming,0,2026-01-06,Canada,Low
4,Sapiens,Finance,700,2025-12-28,Israel,High


In [25]:
# Persist the cleaned, risk-labelled frame; the row index is not written out.
transformed_csv_path = "Layoffs_Transformed_data.csv"
df.to_csv(transformed_csv_path, index=False)