In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_excel('/content/startup_success_dataset_template (1).xlsx')

In [3]:
data.head()

Unnamed: 0,startup_name,founded_year,city,industry,has_online_presence,num_founders,num_employees,initial_funding,got_external_funding,startup_stage,...,founder_education,previous_startup_experience,founder_experience_years,market_competition_level,incubator_support,num_direct_competitors,market_need_level,is_successful,years_to_failure,made_profit_in_3y
0,MNT-Halan,2017,Cairo,Fintech,1,3,500+,30e3 USD,1,Market,...,—,—,—,High,1.0,Many,Strong,1,—,1
1,Paymob,2015,Cairo,Fintech,1,3,200+,—,1,Market,...,—,—,—,High,1.0,Many,Strong,1,—,1
2,Bosta,2017?,Cairo,Logistics,1,2,300+,—,1,Market,...,—,—,—,High,1.0,Many,Strong,1,—,1
3,Swvl,2017,Cairo,Transportation,1,3,600,30e3 USD,1,Market,...,—,—,—,High,1.0,Many,Strong,1,—,1
4,Breadfast,2017,Cairo,F&B/Ecom,1,3,150+,—,1,Market,...,—,—,—,Medium,1.0,Many,Strong,1,—,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   startup_name                 100 non-null    object 
 1   founded_year                 100 non-null    object 
 2   city                         100 non-null    object 
 3   industry                     100 non-null    object 
 4   has_online_presence          100 non-null    int64  
 5   num_founders                 100 non-null    object 
 6   num_employees                100 non-null    object 
 7   initial_funding              100 non-null    object 
 8   got_external_funding         100 non-null    object 
 9   startup_stage                100 non-null    object 
 10  founder_age                  100 non-null    object 
 11  founder_education            100 non-null    object 
 12  previous_startup_experience  80 non-null     object 
 13  founder_experience_ye

In [5]:
data.shape

(100, 21)

In [6]:
# prompt: write a python code that make augmentation for the 100 row of this dataset convert it to 500 rows

# Assuming 'data' is your pandas DataFrame loaded from the Excel file
# Let's focus on augmenting the first 100 rows to generate 400 new rows (100 + 400 = 500 total)

# Select the first 100 rows
rows_to_augment = data.head(100).copy()

# Function to perform simple augmentation
# This is a placeholder function. You should customize this based on your data and the type of augmentation needed.
# For numerical columns, you could add small random noise.
# For categorical columns, you could randomly swap values or duplicate rows.
# This example just duplicates rows with some random value addition for simplicity on numerical columns.
def augment_row(row):
  new_row = row.copy()
  # Example: Add small random noise to numerical columns
  for col in new_row.index:
    # Check if the value is numeric before attempting augmentation
    if pd.api.types.is_numeric_dtype(new_row[col]):
        try:
            # Convert to numeric, coercing errors to NaN
            numeric_value = pd.to_numeric(new_row[col], errors='coerce')
            if not pd.isna(numeric_value):
                new_row[col] = numeric_value + np.random.normal(0, abs(numeric_value)*0.05) # Add up to 5% random noise
        except:
            # If conversion fails or any other error occurs, skip augmentation for this value
            pass
  return new_row

augmented_rows_list = []

# Augment each of the first 100 rows multiple times to reach the target of 500 rows.
# We need 400 additional rows. We'll augment each of the initial 100 rows 4 times.
num_augmentations_per_row = 4

for index, row in rows_to_augment.iterrows():
  for _ in range(num_augmentations_per_row):
    augmented_row_data = augment_row(row)
    augmented_rows_list.append(augmented_row_data)

# Create a DataFrame from the augmented rows
augmented_data_df = pd.DataFrame(augmented_rows_list)

# Combine the original data with the augmented data
# We only need the original rows that were not augmented (from row 101 onwards)
remaining_original_data = data.iloc[100:].copy()

# Concatenate the augmented data with the remaining original data
# Reset the index of the augmented data to avoid duplicate index values
augmented_data_df = augmented_data_df.reset_index(drop=True)

# Combine the original 100 rows, the remaining original rows, and the augmented rows
# This approach keeps the first 100 original rows and adds the 400 augmented rows,
# and also keeps any original rows beyond the first 100.
# To get exactly 500 rows (100 original + 400 augmented), we should just combine
# the original first 100 rows with the augmented data.

combined_data = pd.concat([rows_to_augment, augmented_data_df], ignore_index=True)

print("Original number of rows:", data.shape[0])
print("Number of rows after augmentation:", combined_data.shape[0])
print(combined_data.head())
print(combined_data.tail())

Original number of rows: 100
Number of rows after augmentation: 500
  startup_name founded_year   city        industry  has_online_presence  \
0    MNT-Halan         2017  Cairo         Fintech                    1   
1       Paymob         2015  Cairo         Fintech                    1   
2        Bosta        2017?  Cairo       Logistics                    1   
3         Swvl         2017  Cairo  Transportation                    1   
4    Breadfast         2017  Cairo        F&B/Ecom                    1   

  num_founders num_employees initial_funding got_external_funding  \
0            3          500+        30e3 USD                    1   
1            3          200+               —                    1   
2            2          300+               —                    1   
3            3           600        30e3 USD                    1   
4            3          150+               —                    1   

  startup_stage  ... founder_education previous_startup_experience

In [7]:
# prompt: download the augmented dataset genetared before in csv file called 'augmented_dataset'

from google.colab import files

# Save the augmented dataframe to a CSV file
combined_data.to_csv('augmented_dataset.csv', index=False)

# Download the CSV file
files.download('augmented_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

***Data Preprocessing***

In [52]:
df = pd.read_csv('/content/augmented_dataset.csv')

In [53]:
df.head()

Unnamed: 0,startup_name,founded_year,city,industry,has_online_presence,num_founders,num_employees,initial_funding,got_external_funding,startup_stage,...,founder_education,previous_startup_experience,founder_experience_years,market_competition_level,incubator_support,num_direct_competitors,market_need_level,is_successful,years_to_failure,made_profit_in_3y
0,MNT-Halan,2017,Cairo,Fintech,1,3,500+,30e3 USD,1,Market,...,—,—,—,High,1.0,Many,Strong,1,—,1
1,Paymob,2015,Cairo,Fintech,1,3,200+,—,1,Market,...,—,—,—,High,1.0,Many,Strong,1,—,1
2,Bosta,2017?,Cairo,Logistics,1,2,300+,—,1,Market,...,—,—,—,High,1.0,Many,Strong,1,—,1
3,Swvl,2017,Cairo,Transportation,1,3,600,30e3 USD,1,Market,...,—,—,—,High,1.0,Many,Strong,1,—,1
4,Breadfast,2017,Cairo,F&B/Ecom,1,3,150+,—,1,Market,...,—,—,—,Medium,1.0,Many,Strong,1,—,1


In [54]:
for col in df.columns:
  print (col)
  print(df[col].unique())
  print (10*'-')

startup_name
['MNT-Halan' 'Paymob' 'Bosta' 'Swvl' 'Breadfast' 'Paysky' 'Yaoota'
 'CardoO' 'Visit Ventures' 'Widebot' 'Jarayed' 'Pluto' 'Brimore'
 'La Reina' 'Milezmore' 'Capiter' 'Bkam' 'Sarcastech' 'Sico (Nile X)'
 'Dawayer' 'Petroleum Air Services (P.A.S.)' 'Cook Door (branch)'
 'Suez Canal Container Terminal'
 'Port Said Chamber of Shipping initiative' 'Le Planneur'
 '(Hypothetical) Café X' '(Hypothetical) Delivery Y'
 '(Hypothetical) Co‑working Z' 'CanalEats' 'SinaiRoast' 'CanalCraft'
 'AgroIsmailia' 'TechValley' 'ValleyTech' 'CraftSar' 'FoodHub'
 'CanalTours' 'AgroFail' 'CanalShop' 'SuezTaste' 'MaritimeLog' 'SuezServ'
 'PortTech' 'FoodCorner' 'SuezCoWork' 'CanalClean' 'Portaly' 'AgroSuez'
 'DeltaFishTech' 'NileWoodCraft' 'DamiettaCakes' 'CanalFurniture'
 'SeaShellGallery' 'DamiettaCafeX' 'FishDeliver' 'CraftFailed'
 'DeltaAgroTech' 'NileFurnish' 'AsuitAgroTech' 'SohagSolar' 'LuxorCrafts'
 'QenaFoodDelivery' 'AswanTours' 'MinyaTextiles' 'NileStone' 'AsyutCare'
 'SohagEducate' 'Luxo

In [55]:
df = df.drop(['startup_name','founder_age'], axis=1)

In [56]:
df = df.drop(['founder_experience_years','previous_startup_experience'], axis=1)

In [57]:
df.shape

(500, 17)

In [58]:
#percentage of each unique val in founder_education

df['founder_education'].value_counts(normalize=True) * 100

Unnamed: 0_level_0,proportion
founder_education,Unnamed: 1_level_1
—,80.0
0,13.0
1,7.0


In [59]:
df = df.drop(['founder_education'], axis=1)

In [60]:
#remove question marks from the founded year column

df['founded_year'] = df['founded_year'].astype(str).str.replace('?', '', regex=False)
df['founded_year'] = pd.to_numeric(df['founded_year'], errors='coerce')
print(df['founded_year'].unique())

[2017 2015 2014 2018 2019 2016 2011 2021 2020 2012 2024 1982 1988 2000
 2009 1997]


***Clean City column***

In [61]:
print (df['city'].unique())

['Cairo' 'Alexandria' 'Cairo/UK' 'Assiut' 'Port Said' 'Port Said?'
 'Ismailia' 'Suez' 'Damietta' 'Asyut' 'Sohag' 'Luxor' 'Qena' 'Aswan'
 'Minya' 'Aswin' 'online']


In [62]:
# convert Cairo/UK to Cairo and convert Asyut to Assiut and convert Aswin to Aswan

df['city'] = df['city'].str.replace('/UK', '', regex=False)
df['city'] = df['city'].str.replace(r'Port Said?', 'Portsaid', regex=False)
df['city'] = df['city'].str.replace(r'Port Said', 'Portsaid', regex=False)
df['city'] = df['city'].str.replace('Asyut', 'Assiut', regex=False)
df['city'] = df['city'].str.replace('Aswin', 'Aswan', regex=False)
print (df['city'].unique())

['Cairo' 'Alexandria' 'Assiut' 'Portsaid' 'Ismailia' 'Suez' 'Damietta'
 'Sohag' 'Luxor' 'Qena' 'Aswan' 'Minya' 'online']


Continue...

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   founded_year              500 non-null    int64  
 1   city                      500 non-null    object 
 2   industry                  500 non-null    object 
 3   has_online_presence       500 non-null    int64  
 4   num_founders              500 non-null    object 
 5   num_employees             500 non-null    object 
 6   initial_funding           500 non-null    object 
 7   got_external_funding      500 non-null    object 
 8   startup_stage             500 non-null    object 
 9   market_competition_level  400 non-null    object 
 10  incubator_support         400 non-null    float64
 11  num_direct_competitors    400 non-null    object 
 12  market_need_level         400 non-null    object 
 13  is_successful             400 non-null    object 
 14  years_to_f

***initial_funding column***

In [64]:
#count of unique values in initial_funding column

print(df['initial_funding'].value_counts())

initial_funding
—                440
30e3 USD          10
2.7 M USD          5
660k USD           5
3 M USD            5
>$30M              5
>$1M               5
$5M                5
$33M               5
$0.5M              5
EGP 37M infra      5
~$500k             5
Name: count, dtype: int64


In [65]:
df = df.drop(['initial_funding'], axis=1)

In [66]:
df.shape

(500, 15)

***number of employees column***

In [67]:
# prompt: show the count of unique vals in num_employees column

print(df['num_employees'].value_counts())

num_employees
~10               55
~5                40
~8                35
~4                30
~15               30
~12               25
~~                25
~3                25
~20               25
~50               25
~30               20
~6                20
—                 15
200+              10
~7                10
~25               10
~100              10
~45               10
500+               5
~20 per branch     5
~44                5
~2000              5
40+                5
150+               5
600                5
44                 5
300+               5
37+                5
~35                5
~120               5
~80                5
~60                5
~150               5
~40                5
Name: count, dtype: int64


In [68]:
# prompt: remove '~' and '+' chars in num_employees column

df['num_employees'] = df['num_employees'].astype(str).str.replace('~', '', regex=False)
df['num_employees'] = df['num_employees'].str.replace('+', '', regex=False)
print(df['num_employees'].value_counts())

num_employees
10               55
5                40
8                35
4                30
15               30
50               25
20               25
                 25
3                25
12               25
30               20
6                20
—                15
200              10
150              10
7                10
45               10
44               10
25               10
40               10
100              10
300               5
600               5
2000              5
500               5
20 per branch     5
37                5
35                5
120               5
80                5
60                5
Name: count, dtype: int64


In [69]:
#convert 20 per branch to 20

df['num_employees'] = df['num_employees'].str.replace(' per branch', '', regex=False)
print(df['num_employees'].value_counts())

num_employees
10      55
5       40
8       35
20      30
4       30
15      30
3       25
50      25
        25
12      25
6       20
30      20
—       15
200     10
7       10
25      10
45      10
40      10
150     10
100     10
44      10
300      5
2000     5
500      5
600      5
37       5
35       5
120      5
80       5
60       5
Name: count, dtype: int64


In [70]:
# prompt: fill null values in num_employees column

#fill null values using median

df['num_employees'] = pd.to_numeric(df['num_employees'], errors='coerce')
df['num_employees'] = df['num_employees'].fillna(df['num_employees'].median())
print(df['num_employees'].unique())
df.info()
print(df['num_employees'].value_counts())

[ 500.  200.  300.  600.  150.   40.   12.   44.    5.   10.  100.   50.
 2000.   20.    3.   37.    4.    8.   15.   30.    6.   35.   25.    7.
  120.   80.   60.   45.]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   founded_year              500 non-null    int64  
 1   city                      500 non-null    object 
 2   industry                  500 non-null    object 
 3   has_online_presence       500 non-null    int64  
 4   num_founders              500 non-null    object 
 5   num_employees             500 non-null    float64
 6   got_external_funding      500 non-null    object 
 7   startup_stage             500 non-null    object 
 8   market_competition_level  400 non-null    object 
 9   incubator_support         400 non-null    float64
 10  num_direct_competitors    400 non-null    object 
 11  mar

In [71]:
print(df['num_employees'].value_counts())

num_employees
12.0      65
10.0      55
5.0       40
8.0       35
15.0      30
4.0       30
20.0      30
3.0       25
50.0      25
30.0      20
6.0       20
44.0      10
7.0       10
45.0      10
40.0      10
100.0     10
200.0     10
150.0     10
25.0      10
600.0      5
300.0      5
500.0      5
2000.0     5
37.0       5
35.0       5
120.0      5
80.0       5
60.0       5
Name: count, dtype: int64


In [72]:
# prompt: convert float vals to integer vals

df['num_employees'] = df['num_employees'].astype(int)
print(df['num_employees'].unique())
print(df['num_employees'].value_counts())
df.info()

[ 500  200  300  600  150   40   12   44    5   10  100   50 2000   20
    3   37    4    8   15   30    6   35   25    7  120   80   60   45]
num_employees
12      65
10      55
5       40
8       35
15      30
4       30
20      30
3       25
50      25
30      20
6       20
44      10
7       10
45      10
40      10
100     10
200     10
150     10
25      10
600      5
300      5
500      5
2000     5
37       5
35       5
120      5
80       5
60       5
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   founded_year              500 non-null    int64  
 1   city                      500 non-null    object 
 2   industry                  500 non-null    object 
 3   has_online_presence       500 non-null    int64  
 4   num_founders              500 non-null    object 
 5   num_employees 

***number of founder column***

In [73]:
# show the number of each unique value in the column num_founder

print(df['num_founders'].value_counts())

num_founders
2    210
1    135
3    100
—     40
4     15
Name: count, dtype: int64


In [74]:
#fill the null values in the num_founders column

# Replace '—' with NaN for proper numerical handling
df['num_founders'] = df['num_founders'].replace('—', np.nan)

# Convert the column to numeric, coercing errors to NaN
df['num_founders'] = pd.to_numeric(df['num_founders'], errors='coerce')

# Assuming the missing values are represented as NaN
# We can fill NaNs with a median, mean, or a specific value (like 1 for founder count).
# Using median is generally safer if there are outliers.
# Let's check the distribution first to decide.
print(df['num_founders'].describe())
print(df['num_founders'].median())

# Option 1: Fill NaNs with the median of the column
df['num_founders'].fillna(df['num_founders'].median(), inplace=True)

# Option 2: Fill NaNs with a specific value (e.g., 1, assuming a single founder is a common scenario)
# df['num_founders'].fillna(1, inplace=True)

# Verify that NaNs have been filled
print("\nAfter filling NaNs:")
print(df['num_founders'].value_counts())
print(df['num_founders'].isnull().sum()) # Should be 0

count    460.000000
mean       1.989130
std        0.801613
min        1.000000
25%        1.000000
50%        2.000000
75%        2.250000
max        4.000000
Name: num_founders, dtype: float64
2.0

After filling NaNs:
num_founders
2.0    250
1.0    135
3.0    100
4.0     15
Name: count, dtype: int64
0


***Online presence column***

In [75]:
#percentage of unique vals in the has_online_presence column
df['has_online_presence'].value_counts(normalize=True) * 100

Unnamed: 0_level_0,proportion
has_online_presence,Unnamed: 1_level_1
1,72.0
0,28.0


***Continue ...***

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   founded_year              500 non-null    int64  
 1   city                      500 non-null    object 
 2   industry                  500 non-null    object 
 3   has_online_presence       500 non-null    int64  
 4   num_founders              500 non-null    float64
 5   num_employees             500 non-null    int64  
 6   got_external_funding      500 non-null    object 
 7   startup_stage             500 non-null    object 
 8   market_competition_level  400 non-null    object 
 9   incubator_support         400 non-null    float64
 10  num_direct_competitors    400 non-null    object 
 11  market_need_level         400 non-null    object 
 12  is_successful             400 non-null    object 
 13  years_to_failure          400 non-null    object 
 14  made_profi

***Market competition level***

In [77]:
# prompt: show the counts of market_competition_level column including null vals

print(df['market_competition_level'].value_counts(dropna=False))

market_competition_level
Medium        210
High          160
NaN           100
Low            20
Low–Medium     10
Name: count, dtype: int64


In [78]:
# prompt: fill null vals in this column

# Check the percentage of null values in the 'market_competition_level' column
print("Percentage of null values in 'market_competition_level':")
print(df['market_competition_level'].isnull().sum() / len(df) * 100)

# Based on the value counts and info() call, the null values are represented as NaN (checked by dropna=False in value_counts)
# Fill null values with the mode (most frequent value) as it's a categorical column
# Calculate the mode
mode_competition = df['market_competition_level'].mode()[0]

# Fill the null values
df['market_competition_level'].fillna(mode_competition, inplace=True)

# Verify that null values have been filled
print("\nAfter filling null values in 'market_competition_level':")
print(df['market_competition_level'].value_counts(dropna=False))
print(df['market_competition_level'].isnull().sum()) # Should be 0

Percentage of null values in 'market_competition_level':
20.0

After filling null values in 'market_competition_level':
market_competition_level
Medium        310
High          160
Low            20
Low–Medium     10
Name: count, dtype: int64
0


In [79]:
# prompt: convert Low–Medium to Low

df['market_competition_level'] = df['market_competition_level'].replace('Low–Medium', 'Low')
print(df['market_competition_level'].value_counts(dropna=False))


market_competition_level
Medium    310
High      160
Low        30
Name: count, dtype: int64


***Startup Stage Column***

In [80]:
print(df['startup_stage'].value_counts())

startup_stage
Market           150
Traction         145
Growth            80
Concept           50
Launch            45
Market entry      10
Market?            5
Scale              5
Market entry?      5
Initiative         5
Name: count, dtype: int64


In [81]:
# prompt: convert Concept to Market entry and Launch to Market entry and Initiative to Market entry and Market entry? to Market entry

df['startup_stage'] = df['startup_stage'].replace(['Concept', 'Launch', 'Initiative', 'Market entry?'], 'Market entry')

print(df['startup_stage'].value_counts())

startup_stage
Market          150
Traction        145
Market entry    115
Growth           80
Market?           5
Scale             5
Name: count, dtype: int64


In [82]:
# prompt: convert Market? to Market

df['startup_stage'] = df['startup_stage'].str.replace('?', '', regex=False)
print(df['startup_stage'].value_counts())

startup_stage
Market          155
Traction        145
Market entry    115
Growth           80
Scale             5
Name: count, dtype: int64


In [83]:
# prompt: convert Traction to Growth

df['startup_stage'] = df['startup_stage'].replace('Traction', 'Growth')
print(df['startup_stage'].value_counts())

startup_stage
Growth          225
Market          155
Market entry    115
Scale             5
Name: count, dtype: int64


***external funding column***

In [84]:
# prompt: print the count of unique values of got_external_funding column

print(df['got_external_funding'].value_counts())

got_external_funding
0     320
1     170
0?      5
1?      5
Name: count, dtype: int64


In [85]:
# prompt: remove all '?' marks from got_external_funding column

df['got_external_funding'] = df['got_external_funding'].astype(str).str.replace('?', '', regex=False)
print(df['got_external_funding'].value_counts())

got_external_funding
0    325
1    175
Name: count, dtype: int64


***Continue ...***

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   founded_year              500 non-null    int64  
 1   city                      500 non-null    object 
 2   industry                  500 non-null    object 
 3   has_online_presence       500 non-null    int64  
 4   num_founders              500 non-null    float64
 5   num_employees             500 non-null    int64  
 6   got_external_funding      500 non-null    object 
 7   startup_stage             500 non-null    object 
 8   market_competition_level  500 non-null    object 
 9   incubator_support         400 non-null    float64
 10  num_direct_competitors    400 non-null    object 
 11  market_need_level         400 non-null    object 
 12  is_successful             400 non-null    object 
 13  years_to_failure          400 non-null    object 
 14  made_profi

***is_successful***

In [97]:
print(df['is_successful'].value_counts())

is_successful
0     220
1     165
0?      5
1*      5
0*      5
Name: count, dtype: int64


In [98]:
# prompt: remove '*' and '?' chars from the column

df['is_successful'] = df['is_successful'].astype(str).str.replace('*', '', regex=False)
df['is_successful'] = df['is_successful'].str.replace('?', '', regex=False)
print(df['is_successful'].value_counts())

is_successful
0      230
1      170
nan    100
Name: count, dtype: int64


In [103]:
# prompt: fill the missing val in is_successful with mode

# Assuming the missing values are represented as NaN after the previous cleaning steps
# Calculate the mode of the 'is_successful' column
mode_successful = df['is_successful'].mode()[0]

# Fill null values with the mode
df['is_successful'].fillna(mode_successful, inplace=True)

# Verify that null values have been filled
print("\nAfter filling null values in 'is_successful':")
print(df['is_successful'].value_counts(dropna=False))
print(df['is_successful'].isnull().sum()) # Should be 0


After filling null values in 'is_successful':
is_successful
0      230
1      170
nan    100
Name: count, dtype: int64
0


In [102]:
print(df['is_successful'].value_counts())

is_successful
0      230
1      170
nan    100
Name: count, dtype: int64


In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   founded_year              500 non-null    int64  
 1   city                      500 non-null    object 
 2   industry                  500 non-null    object 
 3   has_online_presence       500 non-null    int64  
 4   num_founders              500 non-null    float64
 5   num_employees             500 non-null    int64  
 6   got_external_funding      500 non-null    object 
 7   startup_stage             500 non-null    object 
 8   market_competition_level  500 non-null    object 
 9   incubator_support         500 non-null    int64  
 10  num_direct_competitors    500 non-null    object 
 11  market_need_level         500 non-null    object 
 12  is_successful             500 non-null    object 
 13  years_to_failure          400 non-null    object 
 14  made_profi

***made_profit_in_3y***

In [109]:
print(df['made_profit_in_3y'].value_counts())

made_profit_in_3y
0    255
1    125
?     20
Name: count, dtype: int64


In [110]:
df=df.drop(['made_profit_in_3y'], axis=1)

***years_to_failure***

In [105]:
print(df['years_to_failure'].value_counts())

years_to_failure
—                      195
1                       55
2                       35
3                       25
0.5                     25
0.8                     20
6                       10
2025-01-05 00:00:00     10
2025-01-02 00:00:00     10
4                        5
2025-02-05 00:00:00      5
0.6                      5
Name: count, dtype: int64


In [106]:
df = df.drop(['years_to_failure'], axis=1)

In [108]:
df.shape

(500, 14)

***market_need_level***

In [95]:
print(df['market_need_level'].value_counts())

market_need_level
Medium    220
Strong    145
Low        20
Weak       15
Name: count, dtype: int64


In [96]:
# prompt: fill nan vals with mode in market_need_level column

print(df['market_need_level'].value_counts(dropna=False))
print("Percentage of null values in 'market_need_level':")
print(df['market_need_level'].isnull().sum() / len(df) * 100)

# Calculate the mode
mode_market_need = df['market_need_level'].mode()[0]

# Fill the null values with the mode
df['market_need_level'].fillna(mode_market_need, inplace=True)

# Verify that null values have been filled
print("\nAfter filling null values in 'market_need_level':")
print(df['market_need_level'].value_counts(dropna=False))
print(df['market_need_level'].isnull().sum()) # Should be 0

market_need_level
Medium    220
Strong    145
NaN       100
Low        20
Weak       15
Name: count, dtype: int64
Percentage of null values in 'market_need_level':
20.0

After filling null values in 'market_need_level':
market_need_level
Medium    320
Strong    145
Low        20
Weak       15
Name: count, dtype: int64
0


***num_direct_competitors column***

In [88]:
# prompt: show counts of unique vals in  num_direct_competitors  column

print(df['num_direct_competitors'].value_counts(dropna=False))

num_direct_competitors
Many    185
Some    145
NaN     100
Few      70
Name: count, dtype: int64


In [94]:
# prompt: fill null values with mode in num_direct_competitors column

# Check the percentage of null values in the 'num_direct_competitors' column
print("Percentage of null values in 'num_direct_competitors':")
print(df['num_direct_competitors'].isnull().sum() / len(df) * 100)

# Based on the value counts and info() call, the null values are represented as NaN (checked by dropna=False in value_counts)
# Fill null values with the mode (most frequent value) as it's likely a discrete numerical column
# Calculate the mode
mode_competitors = df['num_direct_competitors'].mode()[0]

# Fill the null values
df['num_direct_competitors'].fillna(mode_competitors, inplace=True)

# Verify that null values have been filled
print("\nAfter filling null values in 'num_direct_competitors':")
print(df['num_direct_competitors'].value_counts(dropna=False))
print(df['num_direct_competitors'].isnull().sum()) # Should be 0

Percentage of null values in 'num_direct_competitors':
0.0

After filling null values in 'num_direct_competitors':
num_direct_competitors
Many    285
Some    145
Few      70
Name: count, dtype: int64
0


***Incubator Support Column***

In [89]:
print(df['incubator_support'].value_counts())

incubator_support
0.0    335
1.0     65
Name: count, dtype: int64


In [90]:
print(df['incubator_support'].unique())

[ 1.  0. nan]


In [91]:
# prompt: fill nan values with mode

# Based on the value counts and info() call, the null values are represented as NaN (checked by dropna=False in value_counts)
# Fill null values with the mode (most frequent value) as it's a categorical column
# Calculate the mode
mode_incubator = df['incubator_support'].mode()[0]

# Fill the null values
df['incubator_support'].fillna(mode_incubator, inplace=True)

# Verify that null values have been filled
print("\nAfter filling null values in 'incubator_support':")
print(df['incubator_support'].value_counts(dropna=False))
print(df['incubator_support'].isnull().sum()) # Should be 0


After filling null values in 'incubator_support':
incubator_support
0.0    435
1.0     65
Name: count, dtype: int64
0


In [92]:
# prompt: convert float vals to int vals in incubator_support column

df['incubator_support'] = df['incubator_support'].astype(int)
print(df['incubator_support'].value_counts())
print(df['incubator_support'].unique())

incubator_support
0    435
1     65
Name: count, dtype: int64
[1 0]


***check data after cleaning***

In [111]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   founded_year              500 non-null    int64  
 1   city                      500 non-null    object 
 2   industry                  500 non-null    object 
 3   has_online_presence       500 non-null    int64  
 4   num_founders              500 non-null    float64
 5   num_employees             500 non-null    int64  
 6   got_external_funding      500 non-null    object 
 7   startup_stage             500 non-null    object 
 8   market_competition_level  500 non-null    object 
 9   incubator_support         500 non-null    int64  
 10  num_direct_competitors    500 non-null    object 
 11  market_need_level         500 non-null    object 
 12  is_successful             500 non-null    object 
dtypes: float64(1), int64(4), object(8)
memory usage: 50.9+ KB


In [112]:
# prompt: print unique values in each column

for col in df.columns:
  print(f"Unique values in column '{col}':")
  print(df[col].unique())
  print("-" * 20)


Unique values in column 'founded_year':
[2017 2015 2014 2018 2019 2016 2011 2021 2020 2012 2024 1982 1988 2000
 2009 1997]
--------------------
Unique values in column 'city':
['Cairo' 'Alexandria' 'Assiut' 'Portsaid' 'Ismailia' 'Suez' 'Damietta'
 'Sohag' 'Luxor' 'Qena' 'Aswan' 'Minya' 'online']
--------------------
Unique values in column 'industry':
['Fintech' 'Logistics' 'Transportation' 'F&B/Ecom' 'E‑commerce' 'IoT'
 'Travel Tech' 'AI/NLP' 'Delivery' 'Co‑working' 'Social commerce'
 'Logistics/Cloud' 'B2B e‑commerce' 'Price comparison'
 'Entertainment Portal' 'Hardware/mobile' 'Arts & Culture' 'Aviation'
 'Fast Food' 'Logistics/Port' 'Maritime services' 'Delivery (Print)'
 'Events Management' 'Café / F&B' 'Food Delivery' 'Co‑working Space'
 'Restaurant' 'Art & Crafts' 'Agri-business' 'Industrial Tech'
 'Tech Valley' 'Handicrafts shop' 'Restaurant chain' 'Tourism services'
 'E-commerce' 'Food & Beverage' 'Home Services' 'Co-working'
 'Cleaning Service' 'Port Analytics' 'Aquaculture T

In [113]:
# prompt: save the df in csv file called 'cleaned_augmented_success_prediction_dataset' and download it

# Save the cleaned DataFrame to a CSV file
df.to_csv('cleaned_augmented_success_prediction_dataset.csv', index=False)

# Download the CSV file
files.download('cleaned_augmented_success_prediction_dataset.csv')