# Load dataset

In [3]:
import kagglehub
import pandas as pd

# Fetch the latest published version of the dataset; the call returns the
# local directory that holds the downloaded files.
DATASET_HANDLE = "dagloxkankwanda/startup-failures"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /Users/ahmeddriouech/.cache/kagglehub/datasets/dagloxkankwanda/startup-failures/versions/14


In [10]:
def _load_sector_csv(filename):
    """Read one per-sector CSV from the downloaded dataset directory.

    Rows that pandas cannot parse are skipped silently
    (``on_bad_lines='skip'``), so the returned frame may contain fewer rows
    than the file has data lines.
    """
    return pd.read_csv(f'{path}/{filename}', on_bad_lines='skip')


df_finance = _load_sector_csv('Startup Failure (Finance and Insurance).csv')
df_health = _load_sector_csv('Startup Failure (Health Care).csv')
df_production = _load_sector_csv('Startup Failure (Manufactures).csv')
df_commerce = _load_sector_csv('Startup Failure (Retail Trade).csv')
# Note: this file name uses the plural "Failures", unlike the other four.
df_media = _load_sector_csv('Startup Failures (Information Sector).csv')

datasets = {
    "Finance And Insurance": df_finance,
    "HealthCare": df_health,
    "Manufacturers": df_production,
    "Retail Trade": df_commerce,
    "Information Sector": df_media
}

# Compare column *sets* against the first frame: pd.concat aligns on column
# names, so only membership matters here, not column order.
reference_columns = set(df_finance.columns)
mismatched = {
    sector: sorted(set(frame.columns) ^ reference_columns)
    for sector, frame in datasets.items()
    if set(frame.columns) != reference_columns
}

# Fail loudly instead of printing and carrying on: without a merged `df`
# every later cell would either hit a NameError or silently reuse a stale
# `df` left over from an earlier run of the kernel.
if mismatched:
    raise ValueError(f"Datasets do not have the same columns: {mismatched}")

print("All datasets have the same columns")
df = pd.concat(list(datasets.values()), ignore_index=True)
print("Merged dataset shape:", df.shape)

All datasets have the same columns
Merged dataset shape: (383, 20)


# Preprocess Data

In [11]:
# Combined file used as a lookup table to overwrite Name/Sector in the
# merged per-sector frame.
df_startup = pd.read_csv(f'{path}/Startup Failures.csv')

# NOTE(review): the lookup matches the combined file's 'Years of Operation'
# column against this frame's 'Name' or 'Sector' values. That looks like it
# compensates for shifted columns in one of the files -- confirm against the
# raw CSVs before relying on it.
lookup_key = df_startup['Years of Operation']

for row_label, record in df.iterrows():
    hit_mask = (lookup_key == record['Name']) | (lookup_key == record['Sector'])
    matching_rows = df_startup[hit_mask]

    # Rows without any match keep their existing Name and Sector.
    if matching_rows.empty:
        continue

    # When several rows match, only the first one is used.
    df.at[row_label, 'Name'] = matching_rows['Name'].values[0]
    df.at[row_label, 'Sector'] = matching_rows['Sector'].values[0]

In [12]:
# Count the missing entries in each column of the merged frame and print them.
missing_per_column = df.isna().sum()
print("NAN Values in Dataset: \n", missing_per_column)

NAN Values in Dataset: 
 Name                      0
Sector                    0
Years of Operation        0
What They Did             0
How Much They Raised      0
Why They Failed           0
Takeaway                  0
Giants                    0
No Budget                 0
Competition               0
Poor Market Fit           0
Acquisition Stagnation    0
Platform Dependency       0
Monetization Failure      0
Niche Limits              0
Execution Flaws           0
Trend Shifts              0
Toxicity/Trust Issues     0
Regulatory Pressure       0
Overhype                  1
dtype: int64


In [19]:
# Impute missing values column by column: numeric columns get their mean,
# everything else gets the placeholder 'Unknown'.
#
# The branch tests "is numeric" rather than "dtype == 'object'" so that text
# columns stored with a non-object dtype (e.g. pandas' string dtype) are
# still treated as text instead of being sent to .mean(), which would raise.
#
# NOTE(review): for 0/1 indicator columns the mean is a fraction, so an
# imputed cell is no longer a valid flag -- confirm that is acceptable for
# the columns that actually contain NaN.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].mean())
    else:
        df[col] = df[col].fillna('Unknown')

In [25]:
# Remove the 'Name' column from the merged frame.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone, and a strict drop would raise KeyError.
df = df.drop(columns=['Name'], errors='ignore')

In [30]:
# Columns to one-hot encode: each distinct value in these columns becomes
# its own indicator column in the training frame.
categorical_features = [
    'Sector',
    'Years of Operation',
    'What They Did',
    'How Much They Raised',
    'Why They Failed',
    'Takeaway',
]

# Encode a copy so `df` itself keeps the original, un-encoded columns.
df_train = pd.get_dummies(df.copy(), columns=categorical_features)

In [None]:
# Remove the 'Execution Flaws' column from the training frame.
# errors='ignore' keeps this cell re-runnable: once the column has been
# dropped, a strict drop would raise KeyError on the next execution.
df_train = df_train.drop(columns=['Execution Flaws'], errors='ignore')

In [31]:
import pickle

# Serialize the training frame to 'df_train.pkl' in the working directory.
# Pickle files can execute code when loaded, so only load this file back
# from a trusted location.
with open('df_train.pkl', mode='wb') as pickle_file:
    pickle.dump(df_train, pickle_file)

In [None]:
# Target: the 'Overhype' column taken from `df`. Any value that is not
# exactly 0 or 1 is replaced by 0; this includes fractional values left by
# mean imputation.
Y = df['Overhype'].apply(lambda label: 0 if label not in (0, 1) else label)

# Features: every column of the encoded training frame except the target.
X = df_train.drop(columns=['Overhype'])