In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV read in the next cell is reachable.
drive.mount('/content/drive')

Mounted at /content/drive


In our previous data processing, we organized the data as follows: For each startup with a calculable company value, we identified the corresponding founder and arranged the founder's work experience chronologically by a defined index. The last index for each founding experience represents the startup we are investigating, along with its corresponding company value. The details can be found in the 'Define company value and Data filtering.ipynb'.

We also utilize the verticals that were predicted by the OneVsRest classifier, which are stored in the 'class_name' column of the following dataframe.

In [None]:
import pandas as pd
# Load the career table: one row per job entry of a founder.
# NOTE(review): the path is relative while Drive is mounted at '/content/drive', so this
# presumably relies on the kernel's working directory being /content — confirm.
career = pd.read_csv('./drive/MyDrive/Capstone/7.11_data.csv')

In [None]:
career

Unnamed: 0.1,Unnamed: 0,CareerID,FounderID,CompanyID,JobTitle,DateRange,Start Date,End Date,Duration (years),Location,Description,Created Date,Relevant,founded_company_value,Headquarters Location,Number Of Employees,index,class_name
0,0,8,1,53036,Vise President,2005–2009,2005-03-01,2009-09-01,4.5,,,2021-02-19,False,,,,0,
1,1,7,1,53036,Executive Director,2007–2009,2007-03-01,2009-09-01,2.5,"Moscow, Russian Federation",,2021-02-19,False,,,,1,
2,2,216950,1,19921,Founder,01/2016-Present,2016-01-01,,6.4,Cyprus,We are the largest global \ntravel mobility ma...,2022-05-05,True,22450000.0,European Union (EU),,2,"Carsharing,Micro-mobility,Mobility tech,Ridesh..."
3,3,80467,3,27310,Lead Accountant,Nov 2001–Apr 2006,2001-11-01,2006-04-01,4.4,,,2021-03-10,False,,,,0,
4,4,80414,3,39824,"Manager of Finance and Accounting, CIS Region",May 2006–Oct 2008,2006-05-01,2008-10-01,2.4,,,2021-03-10,False,,,,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11335,11335,238807,25746,127266,Chief Technical Officer | co-founder,10/2015-12/2017,2015-10-01,2017-12-01,2.2,"Toronto, Canada Area",,2022-06-17,True,12500000.0,"Toronto, Ontario",11-50,4,HRtech
11336,11336,312809,25746,152402,Product Strategist,1/2017-9/2018,2017-01-01,2018-09-01,1.7,,,2023-03-29,False,,,,5,
11337,11337,238804,25746,127266,Chief Product Officer | co-founder,12/2017-01/2019,2017-12-01,2019-01-01,1.1,"Toronto, Canada Area",,2022-06-17,True,12500000.0,"Toronto, Ontario",11-50,6,HRtech
11338,11338,312807,25746,152400,Chief Executive Officer | co-founder,12/2018-3/2021,2018-12-01,2021-03-01,2.2,,Inkblot was acquired by Green Shield Holdings ...,2023-03-29,True,3808640.0,,,7,


## Basic variables calculation


Here we use a function to calculate basic variables, which are 'Number of professional experience','Number of companies that worked for', 'Total Work Years','Industry Experience', 'Management experience','Entrepreneurial Experience', 'Technical experience', 'Location' and 'Vertical' for each founding experience.

In [None]:
# Create a new column 'group' where each group starts with index = 0 (created in the last step).
# index = 0 marks the start of one specific founding experience.
# The 'index' column is selected by name (as the later cells do) instead of by position,
# so adding or reordering CSV columns cannot silently change the grouping.
career['group'] = (career['index'] == 0).cumsum()

# Group the dataframe by the new 'group' column
grouped = career.groupby('group')

# The keyword lists do not depend on the group, so they are defined once, outside the loop.
# Keywords for management positions
management_keywords = [
  "aufsichtsrat", "board", "cco", "ceo", "cfo", "chairman", "chairwoman", "chief", "cmo", "cpo",
  "cro", "cto", "director", "executive", "general director", "general manager", "geschaeftsfuehrer",
  "head", "lead", "partner", "projektleiter", "president", "verwaltungsrat", "vorstand", "vr"]

# Keywords for entrepreneurial positions
entrepreneurial_keywords = [
    "angel", "cofounder", "founder", "partner", "gruender", "inhaber", "initiator",
    "investor", "mitgruender", "owner",  "co-founder", 'entrepreneur']

# Keywords for technical positions
technical_keywords = [
    "architect", "intelligence", "computer", "cto", "data", "developer", "development" ,'scientist', 'architect',
    "engineer", "engineering", "programmer", "system", "technical", "technology", 'economist', 'researcher'
]

# Regex alternations; a title matches when ANY keyword occurs anywhere inside it
# (plain substring semantics, e.g. "vr" also matches inside longer words).
management_pattern = '|'.join(management_keywords)
entrepreneurial_pattern = '|'.join(entrepreneurial_keywords)
technical_pattern = '|'.join(technical_keywords)


def _years_with_title(jobs, pattern):
    """Sum 'Duration (years)' over the rows of `jobs` whose JobTitle matches `pattern`.

    The match is a case-insensitive regex search; rows with a missing title never match.
    """
    matches = jobs['JobTitle'].str.contains(pattern, case=False, na=False)
    return jobs.loc[matches, 'Duration (years)'].sum()


# One plain dict per founding experience; the dataframe is built once at the end
# instead of creating a single-row DataFrame for every group.
result_records = []

for _, group in grouped:
    # The last row in the group is assumed to be the startup experience
    startup_row = group.iloc[-1]
    # All rows except the last one are assumed to be work experience
    work_experience = group.iloc[:-1]

    # Rows of prior experience that carry the same vertical label as the startup.
    # A missing class_name never compares equal, so unlabeled jobs contribute nothing.
    same_vertical = work_experience['class_name'] == startup_row['class_name']

    result_records.append({
        'FounderID': startup_row['FounderID'],
        'CompanyID': startup_row['CompanyID'],
        'CareerID': startup_row['CareerID'],
        # Number of prior job entries
        'Number of professional experience': len(work_experience),
        # Number of distinct employers among the prior job entries
        'Number of companies that worked for': work_experience['CompanyID'].nunique(),
        # Durations are summed as-is; overlapping jobs are counted once per job entry
        'Total Work Years': work_experience['Duration (years)'].sum(),
        'Industry Experience': work_experience.loc[same_vertical, 'Duration (years)'].sum(),
        'Management experience': _years_with_title(work_experience, management_pattern),
        'Entrepreneurial Experience': _years_with_title(work_experience, entrepreneurial_pattern),
        'Technical experience': _years_with_title(work_experience, technical_pattern),
        'Founded Company Value': startup_row['founded_company_value'],
        'Location': startup_row['Location'],
        'Vertical': startup_row['class_name'],
    })

# Build the final dataframe (one row per founding experience)
final_result_df = pd.DataFrame(result_records)

final_result_df.head(20)

Unnamed: 0,FounderID,CompanyID,CareerID,Number of professional experience,Number of companies that worked for,Total Work Years,Industry Experience,Management experience,Entrepreneurial Experience,Technical experience,Founded Company Value,Location,Vertical
0,1,19921,216950,2,1,7.0,0.0,7.0,0.0,2.5,22450000.0,Cyprus,"Carsharing,Micro-mobility,Mobility tech,Ridesh..."
1,3,17160,79402,8,8,31.1,0.0,19.6,9.9,0.0,11852460.0,,Cryptocurrency and blockchain
2,6,19921,39,7,6,18.9,0.0,10.3,0.0,6.7,22450000.0,World,"Carsharing,Micro-mobility,Mobility tech,Ridesh..."
3,13,20739,107,3,3,7.1,0.0,0.0,0.0,4.1,350410.0,"London, United Kingdom",Foodtech
4,28,50729,223,7,7,7.2,0.0,5.5,0.0,0.0,30000000.0,,Cryptocurrency and blockchain
5,48,33123,343,0,0,0.0,0.0,0.0,0.0,0.0,6500000.0,,Foodtech
6,58,33123,391,0,0,0.0,0.0,0.0,0.0,0.0,6500000.0,Moscow,Foodtech
7,63,16427,429,7,6,6.0,0.0,2.3,0.0,0.2,1825000.0,"Istanbul, Turkey","Carsharing,Micro-mobility,Mobility tech,Ridesh..."
8,81,53981,87155,10,9,15.6,0.0,2.3,0.0,1.5,730580.0,"Istanbul, Turkey","Carsharing,Micro-mobility,Mobility tech,Ridesh..."
9,83,40324,566,12,7,28.9,0.0,13.3,0.0,13.2,3637125.0,"Istanbul, Turkey",HRtech


In [None]:
final_result_df.shape

(1801, 15)

## Location Cleaning

The text in the location variable contains numerous issues, such as various symbols, inconsistent capitalization, and many non-English values. Therefore, we need to preprocess, translate, and clean this variable. The specific steps are as follows:

1. preprocess_text: First, convert all text to lowercase and remove content within square brackets, links, HTML tags, email addresses, punctuation, and newline characters. This step ensures consistency and standardization of the text.

2. translation: Since many values are not in English, we use the googletrans library to translate the text into English, ensuring all text is in English for subsequent processing and analysis.

3. text_clean: After preprocessing and translation, we further clean the text to ensure the final text is properly formatted and free of extraneous special characters.

**Data Translating**

In [None]:
import re
import string

def preprocess_text(text):
    """Normalize a raw text value before translation.

    Lowercases the text and removes, in this order: content in square brackets,
    links, HTML-style tags, email addresses, punctuation and newline characters.

    Missing values (None / NaN) yield an empty string. Without this guard,
    str(NaN) would turn a missing value into the literal word 'nan', which would
    then be sent to the translator as if it were real text.

    Args:
        text: Any value; non-string values are converted with str().

    Returns:
        The cleaned, lowercase string ('' for missing input).
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # Emails must be removed while the '@' is still present, i.e. before punctuation is stripped
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Newlines are deleted (not replaced by a space), so adjacent lines are joined
    text = re.sub(r'\n', '', text)

    return text


def translate(text):
    """Translate `text` to English with googletrans.

    Blank strings are returned unchanged so that no request is sent for them.
    `Translator` must have been imported from googletrans before this is called.
    """
    if isinstance(text, str) and not text.strip():
        return text
    translator = Translator(service_urls=['translate.googleapis.com'])
    return translator.translate(text, dest='en').text


def text_clean(text):
    """Preprocess `text` and translate the result to English.

    Returns '' without calling the translator when nothing is left after preprocessing.
    """
    cleaned = preprocess_text(text)
    if cleaned == '':
        return cleaned
    return translate(cleaned)

In [None]:
!pip install googletrans==3.1.0a0

Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==3.1.0a0)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading hstspreload-2024.7.1-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->googletrans==3.1.0a0)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━

In [None]:
from googletrans import Translator
# Apply function to the location variable
# NOTE(review): text_clean itself calls preprocess_text and then translate, so after the
# two explicit steps below every non-empty value is preprocessed and translated a second
# time (two translation requests per row). Confirm whether the second pass is intended.
final_result_df["Location"] = final_result_df["Location"].map(preprocess_text)
final_result_df["Location"] = final_result_df["Location"].map(translate)
final_result_df["Location"] = final_result_df["Location"].map(text_clean)

In [None]:
# Frequency of every distinct location value, most frequent first
location_counts = final_result_df['Location'].value_counts()

# Show the 50 most frequent locations
print(location_counts.head(50))

Location
in                                              473
london united kingdom                           165
san francisco bay area                           78
london england united kingdom                    74
greater new york city area                       55
singapore                                        28
london                                           18
paris area france                                16
berlin germany                                   16
new york united states                           14
berlin area germany                              14
greater los angeles area                         13
barcelona area spain                             10
stockholm sweden                                 10
san francisco                                     9
toronto canada area                               9
new york new york united states                   8
los angeles california united states              8
new delhi area india                              8
the

**Define Location Name**

After standardizing the text and translating non-English text, we found that the 'location' data contains repeated locations, such as 'london united kingdom', 'london england united kingdom', and 'london'. To group these repeated locations together, we use the clean_location function.

In [None]:
import re

# Ordered (regex, canonical name) rules. Order matters: the first rule whose regex is
# found in the location wins.
_LOCATION_RULES = (
    (r'\blondon\b', 'london'),
    (r'\bnew york\b|\bnyc\b', 'new york'),
    (r'\bsan francisco\b', 'san francisco'),
    (r'\bsingapore\b', 'singapore'),
    (r'\bwashington\b', 'washington'),
    (r'\bbarcelona\b', 'barcelona'),
    (r'\bparis\b', 'paris'),
    (r'\blos angeles\b', 'los angeles'),
    (r'\bberlin\b', 'berlin'),
    (r'\bstockholm\b', 'stockholm'),
    (r'\bmilan\b', 'milan'),
    (r'\bboston\b', 'boston'),
    (r'\bmumbai\b', 'mumbai'),
    (r'\bmadrid\b', 'madrid'),
    (r'\bcopenhagen\b', 'copenhagen'),
    (r'\bamsterdam\b', 'amsterdam'),
    (r'\bnew delhi\b', 'new delhi'),
    (r'\bhong kong\b', 'hong kong'),
    (r'\bbrussels\b', 'brussels'),
    (r'\bmunich\b', 'munich'),
    (r'\bmexico\b', 'mexico'),
    (r'\bchicago\b', 'chicago'),
    (r'\boxford\b', 'oxford'),
    (r'\bistanbul\b', 'istanbul'),
    (r'\bSão Paulo\b|\bSao Paulo\b', 'sao paulo'),
    (r'\bhelsinki\b', 'helsinki'),
    (r'\bvienna\b', 'vienna'),
    (r'\bmoscow\b', 'moscow'),
    (r'\btoronto\b', 'toronto'),
    (r'\bsan diego\b', 'san diego'),
    (r'\bcambridge\b', 'cambridge'),
    (r'\bdubai\b', 'dubai'),
    (r'\boslo\b', 'oslo'),
    (r'\bzürich\b|\bzurich\b', 'zurich'),
    (r'\bnairobi\b', 'nairobi'),
    (r'\bfrankfurt\b', 'frankfurt'),
    (r'\blisbon\b', 'lisbon'),
    (r'\bbeijing\b', 'beijing'),
    (r'\bmontreal\b', 'montreal'),
    (r'\bshanghai\b', 'shanghai'),
    (r'\bKuala Lumpur\b', 'kuala lumpur'),
    (r'\bbengaluru\b', 'bengaluru'),
    (r'\bphiladelphia\b', 'philadelphia'),
    (r'\bgreece\b', 'greece'),
    (r'\bsydney\b', 'sydney'),
    (r'\bseattle\b', 'seattle'),
    (r'\bprague\b', 'prague'),
    (r'\bgurgaon\b', 'gurgaon'),
    (r'\blagos\b', 'lagos'),
)


def clean_location(location):
    """Collapse spelling variants of a location onto one canonical name.

    The location string is searched (case-insensitively, on word boundaries) for
    each known place in turn; the canonical name of the first hit is returned.
    A string that matches no rule is returned unchanged.
    """
    for pattern, canonical in _LOCATION_RULES:
        if re.search(pattern, location, re.IGNORECASE):
            return canonical
    return location

In [None]:
# Replace every location with its canonical name (element-wise)
final_result_df['Location'] = final_result_df['Location'].map(clean_location)

In [None]:
# Frequency of every distinct cleaned location, most frequent first
location_counts = final_result_df['Location'].value_counts()

# Show the 50 most frequent cleaned locations
print(location_counts.head(50))

Location
in                             473
london                         302
new york                       117
san francisco                  105
berlin                          55
paris                           33
singapore                       32
los angeles                     24
barcelona                       21
boston                          18
mexico                          18
toronto                         17
munich                          16
milan                           16
stockholm                       15
oslo                            15
chicago                         15
sao paulo                       15
mumbai                          14
copenhagen                      14
madrid                          13
amsterdam                       12
new delhi                       12
cambridge                       11
bengaluru                       10
kuala lumpur                     9
san diego                        8
dubai                            8
zurich     

In [None]:
# Define the list of cities with frequency larger than 5
# These are the location values kept as their own category; the next cell maps every
# other value to 'other'. Apart from 'united states', each entry is a canonical name
# returned by clean_location; 'united states' can only appear as an unmatched raw value.
cities = [
    'london', 'new york', 'san francisco', 'berlin', 'paris',
    'singapore', 'los angeles', 'barcelona', 'boston', 'mexico',
    'toronto', 'munich', 'milan', 'stockholm', 'oslo', 'chicago',
    'sao paulo', 'mumbai', 'copenhagen', 'madrid', 'amsterdam',
    'new delhi', 'cambridge', 'bengaluru', 'kuala lumpur', 'san diego',
    'dubai', 'zurich', 'hong kong', 'istanbul', 'washington', 'seattle',
    'vienna', 'nairobi', 'gurgaon', 'greece', 'beijing', 'oxford',
    'united states'
]

In [None]:
# Locations that are not in the `cities` list are classified as 'other'
location_col = final_result_df['Location']
final_result_df['Location'] = location_col.where(location_col.isin(cities), 'other')

In [None]:
final_result_df['Location'].nunique()

40

## Job Title and Speed of growth

In this section, we utilize the information in the job title and define the speed of growth.

### Job Title Cleaning

In [None]:
!pip install langdetect
!pip install googletrans==3.1.0a0
from googletrans import Translator
from langdetect import detect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/981.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m614.4/981.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=b2b2fb018e09ed

In [None]:
# Data Cleaning
# Drop rows without a job title and rebuild a contiguous 0..n-1 index.
# The later cells address rows with career.loc[i, ...] for i in range(len(career)),
# which is only valid when the row labels have no gaps.
career = career.dropna(subset=['JobTitle']).reset_index(drop=True)
career['JobTitle'] = career['JobTitle'].str.lower()

import re
def remove_punctuation_and_replace_with_space(text):
  """Replace punctuation with spaces and normalize whitespace.

  Every character that is not a word character, whitespace or a hyphen becomes
  a space; runs of whitespace are then collapsed to one space and the ends are
  trimmed.
  """
  without_punctuation = re.sub(r'[^\w\s-]', ' ', text)
  # split() with no argument drops leading/trailing whitespace and splits on whitespace runs
  return ' '.join(without_punctuation.split())


career['JobTitle'] = career['JobTitle'].apply(remove_punctuation_and_replace_with_space)


In [None]:
def translate(text):
  """Translate `text` to English with googletrans.

  Blank strings (which can remain after punctuation has been stripped from a
  title) are returned unchanged so that no request is sent for them.
  """
  if isinstance(text, str) and not text.strip():
    return text
  translator = Translator(service_urls=['translate.googleapis.com'])
  translated_text=translator.translate(text, dest='en').text
  return translated_text

# Many titles repeat across rows, so each distinct title is translated once
# and the result is mapped back onto every row that carries it.
translated_titles = {title: translate(title) for title in career['JobTitle'].unique()}
career['JobTitle'] = career['JobTitle'].map(translated_titles)

### Create Job Level Dictionary

In [None]:
# Drop rows whose title is missing after translation and keep the index contiguous
# (later cells address rows with career.loc[i, ...] for i in range(len(career))).
career = career.dropna(subset=['JobTitle']).reset_index(drop=True)
# Literal replacement of hyphens by spaces, e.g. 'co-founder' -> 'co founder'
career['JobTitle'] = career['JobTitle'].str.replace('-', ' ', regex=False)
career['JobTitle'] = career['JobTitle'].str.lower()

In [None]:
# Words Frequency
from collections import Counter

def count_word_frequencies(df, column_name):
    """Count how often each whitespace-separated word occurs in a text column.

    Args:
        df: DataFrame holding the text column.
        column_name: Name of the column; every value must be a string.

    Returns:
        A Counter mapping each word to its number of occurrences.
    """
    return Counter(
        word
        for title in df[column_name]
        for word in title.split()
    )

career_word_counts = count_word_frequencies(career, 'JobTitle')

career_word_counts.most_common()

[('founder', 2804),
 ('co', 1936),
 ('and', 1005),
 ('ceo', 977),
 ('director', 912),
 ('manager', 899),
 ('of', 805),
 ('analyst', 706),
 ('intern', 669),
 ('consultant', 591),
 ('business', 549),
 ('associate', 545),
 ('officer', 465),
 ('development', 448),
 ('chief', 448),
 ('board', 405),
 ('senior', 397),
 ('product', 373),
 ('member', 361),
 ('head', 354),
 ('executive', 347),
 ('president', 338),
 ('marketing', 323),
 ('research', 291),
 ('investment', 286),
 ('advisor', 271),
 ('managing', 265),
 ('strategy', 263),
 ('assistant', 260),
 ('partner', 256),
 ('summer', 253),
 ('project', 219),
 ('management', 216),
 ('sales', 214),
 ('vice', 209),
 ('coo', 196),
 ('the', 196),
 ('finance', 188),
 ('investor', 182),
 ('global', 174),
 ('banking', 156),
 ('operations', 153),
 ('corporate', 147),
 ('chairman', 144),
 ('lead', 134),
 ('in', 130),
 ('financial', 129),
 ('equity', 117),
 ('engineer', 105),
 ('fellow', 104),
 ('vp', 101),
 ('at', 98),
 ('a', 98),
 ('cfo', 93),
 ('intern

In [None]:
# Create a dictionary that maps each seniority level to the title keywords assigned to it.
# NOTE(review): 'officer' and 'dir' are listed under both 'level 3' and 'level 4'.
# NOTE(review): job titles have '-' replaced by ' ' before matching, so the hyphenated
# keys ('kitchen-hand', 'co-founder', 'co-owner') presumably never match as written — confirm.

structure = {
    'level 0': ['intern', 'trainee', 'graduate', 'volunteer', 'student', 'candidate'],
    'level 1': ['staff', 'assistant', 'analyst', 'junior', 'fellow', 'reseller','coordinator', 'member', 'sales', 'trader', 'investor', 'developer', 'editor', 'designer', 'producer', 'banker', 'journalist', 'reporter', 'writer', 'auditor', 'clerk', 'blogger', 'artist', 'employee', 'starter', 'bellman', 'waiter', 'kitchen-hand', 'waitress', 'telemarketer', 'filler', 'runner', 'handyman'],
    'level 2': ['senior', 'consultant', 'specialist', 'advisor', 'associate', 'lead', 'leader', 'mentor', 'coach', 'expert', 'economist', 'trainer', 'engineer', 'researcher', 'scientist', 'lecturer', 'tutor', 'teacher', 'lawyer', 'adviser', 'organizer', 'solicitor', 'curator', 'operator', 'architect', 'representative', 'accountant', 'attorney', 'host', 'speaker', 'counsel', 'recruiter', 'creator', 'translator', 'contracted', 'broker', 'analytics', 'publicist', 'presenter'],
    'level 3': ['manager', 'supervisor', 'ambassador', 'strategist', 'instructor', 'administrator', 'professor', 'minister', 'commander', 'officer', 'lieutenant', 'captain', 'jumpmaster', 'handler','manger', 'superintendent', 'store', 'dir','executive', 'anchor', 'shareholder'],
    'level 4': ['director', 'senior manager', 'head', 'vice president', 'executive director', 'controller', 'officer', 'dir'],
    'level 5': ['partner', 'owner', 'chief', 'ceo', 'coo', 'cfo', 'cto', 'president', 'principal', 'entrepreneur', 'co-founder', 'founder', 'board member', 'chairman', 'co-owner', 'chair', 'governor', 'chro', 'chef', 'mp', 'exec', 'cco']
}


# Inverted view: keyword -> level name. A keyword listed under several levels keeps the
# level that comes last in `structure`, because later dict entries overwrite earlier ones.
flat_structure = {title: level for level, titles in structure.items() for title in titles}
flat_structure

{'intern': 'level 0',
 'trainee': 'level 0',
 'graduate': 'level 0',
 'volunteer': 'level 0',
 'student': 'level 0',
 'candidate': 'level 0',
 'staff': 'level 1',
 'assistant': 'level 1',
 'analyst': 'level 1',
 'junior': 'level 1',
 'fellow': 'level 1',
 'reseller': 'level 1',
 'coordinator': 'level 1',
 'member': 'level 1',
 'sales': 'level 1',
 'trader': 'level 1',
 'investor': 'level 1',
 'developer': 'level 1',
 'editor': 'level 1',
 'designer': 'level 1',
 'producer': 'level 1',
 'banker': 'level 1',
 'journalist': 'level 1',
 'reporter': 'level 1',
 'writer': 'level 1',
 'auditor': 'level 1',
 'clerk': 'level 1',
 'blogger': 'level 1',
 'artist': 'level 1',
 'employee': 'level 1',
 'starter': 'level 1',
 'bellman': 'level 1',
 'waiter': 'level 1',
 'kitchen-hand': 'level 1',
 'waitress': 'level 1',
 'telemarketer': 'level 1',
 'filler': 'level 1',
 'runner': 'level 1',
 'handyman': 'level 1',
 'senior': 'level 2',
 'consultant': 'level 2',
 'specialist': 'level 2',
 'advisor': '

In [None]:
# Levels in the order in which they are tried: lowest first, so the first (lowest)
# level that has a keyword contained in the title decides the result.
# 'level 0' is not part of this list, so its keywords are never assigned here.
# NOTE(review): because matching is by substring and lower levels are tried first, some
# higher-level keys cannot be reached (e.g. 'senior manager' after 'senior',
# 'executive director' after 'executive'/'dir', 'board member' after 'member') — confirm
# whether this ordering is intended.
priority_structure = [
    ('level 1', structure['level 1']),
    ('level 2', structure['level 2']),
    ('level 3', structure['level 3']),
    ('level 4', structure['level 4']),
    ('level 5', structure['level 5'])
]

def get_level_priority(title):
    """Return the seniority level (1-5) of a job title, or NaN when none applies.

    A level matches when any of its keywords occurs as a substring of `title`.
    Non-string titles and titles without any keyword yield NaN.

    NaN is produced with float('nan') rather than np.nan: numpy is only imported
    in a later cell, after this function has already been applied to the titles.
    """
    if isinstance(title, str):
        for level, titles in priority_structure:
            if any(key in title for key in titles):
                # 'level 3' -> 3
                return int(level.split()[1])
    return float('nan')

In [None]:
# Allocate the level
career['Level'] = career['JobTitle'].apply(get_level_priority)

### Calculate Speed of Growth

In [None]:
import numpy as np

In [None]:
# Deal with null value
# The passes below work on positional arrays instead of career.loc[i, ...], so they do
# not depend on the row labels being exactly 0..n-1 (rows have been dropped earlier).
levels = career['Level'].to_numpy(dtype=float, copy=True)
founder_ids = career['FounderID'].to_numpy()
company_ids = career['CompanyID'].to_numpy()
n_rows = len(career)


def _same_founder_and_company(a, b):
    """True when rows a and b belong to the same founder at the same company."""
    return founder_ids[a] == founder_ids[b] and company_ids[a] == company_ids[b]


# Forward pass: a missing level is derived from the previous row (which may itself
# have just been filled): one level higher for a further job of the same founder at the
# same company, otherwise the previous row's level is copied.
# NOTE(review): the copy also happens when the previous row belongs to a different
# founder, and level + 1 is not capped at the highest defined level — confirm.
for i in range(1, n_rows):
    if np.isnan(levels[i]):
        if _same_founder_and_company(i, i - 1):
            levels[i] = levels[i - 1] + 1
        else:
            levels[i] = levels[i - 1]

# Backward pass: fills what the forward pass could not (missing values at the very
# start of the table), mirrored: one level lower than the next row for the same
# founder and company, otherwise the next row's level.
for i in range(n_rows - 2, -1, -1):
    if np.isnan(levels[i]):
        if _same_founder_and_company(i, i + 1):
            levels[i] = levels[i + 1] - 1
        else:
            levels[i] = levels[i + 1]

career['Level'] = levels
career


Unnamed: 0,CareerID,FounderID,CompanyID,JobTitle,DateRange,Start Date,End Date,Duration (years),Location,Description,Created Date,Relevant,founded_company_value,Headquarters Location,Number Of Employees,index,Level
0,8,1,53036,vice president,2005–2009,2005-03-01,2009-09-01,4.5,,,2021-02-19,False,,,,0,4.0
1,7,1,53036,executive director,2007–2009,2007-03-01,2009-09-01,2.5,"Moscow, Russian Federation",,2021-02-19,False,,,,1,3.0
2,216950,1,19921,founder,01/2016-Present,2016-01-01,,6.4,Cyprus,We are the largest global \ntravel mobility ma...,2022-05-05,True,22450000.0,European Union (EU),,2,5.0
3,80467,3,27310,lead accountant,Nov 2001–Apr 2006,2001-11-01,2006-04-01,4.4,,,2021-03-10,False,,,,0,2.0
4,80414,3,39824,manager of finance and accounting cis region,May 2006–Oct 2008,2006-05-01,2008-10-01,2.4,,,2021-03-10,False,,,,1,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11331,238807,25746,127266,chief technical officer co founder,10/2015-12/2017,2015-10-01,2017-12-01,2.2,"Toronto, Canada Area",,2022-06-17,True,12500000.0,"Toronto, Ontario",11-50,4,3.0
11332,312809,25746,152402,product strategist,1/2017-9/2018,2017-01-01,2018-09-01,1.7,,,2023-03-29,False,,,,5,3.0
11333,238804,25746,127266,chief product officer co founder,12/2017-01/2019,2017-12-01,2019-01-01,1.1,"Toronto, Canada Area",,2022-06-17,True,12500000.0,"Toronto, Ontario",11-50,6,3.0
11334,312807,25746,152400,chief executive officer co founder,12/2018-3/2021,2018-12-01,2021-03-01,2.2,,Inkblot was acquired by Green Shield Holdings ...,2023-03-29,True,3808640.0,,,7,3.0


In [None]:
# Calculate difference
# Level change of each row relative to the row directly above it (NaN for the first row).
level_change = career['Level'].diff().to_numpy(dtype=float)
# True where the row has the same CompanyID as the row directly above it
same_company = (career['CompanyID'] == career['CompanyID'].shift()).to_numpy()

# A negative change is never kept as such: within the same company it is counted as one
# step up, across companies as no change. Non-negative (and NaN) changes are kept as-is.
career['diff'] = np.where(
    level_change < 0,
    np.where(same_company, 1.0, 0.0),
    level_change,
)

career

Unnamed: 0,CareerID,FounderID,CompanyID,JobTitle,DateRange,Start Date,End Date,Duration (years),Location,Description,Created Date,Relevant,founded_company_value,Headquarters Location,Number Of Employees,index,Level,diff
0,8,1,53036,vice president,2005–2009,2005-03-01,2009-09-01,4.5,,,2021-02-19,False,,,,0,4.0,
1,7,1,53036,executive director,2007–2009,2007-03-01,2009-09-01,2.5,"Moscow, Russian Federation",,2021-02-19,False,,,,1,3.0,1.0
2,216950,1,19921,founder,01/2016-Present,2016-01-01,,6.4,Cyprus,We are the largest global \ntravel mobility ma...,2022-05-05,True,22450000.0,European Union (EU),,2,5.0,2.0
3,80467,3,27310,lead accountant,Nov 2001–Apr 2006,2001-11-01,2006-04-01,4.4,,,2021-03-10,False,,,,0,2.0,0.0
4,80414,3,39824,manager of finance and accounting cis region,May 2006–Oct 2008,2006-05-01,2008-10-01,2.4,,,2021-03-10,False,,,,1,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11331,238807,25746,127266,chief technical officer co founder,10/2015-12/2017,2015-10-01,2017-12-01,2.2,"Toronto, Canada Area",,2022-06-17,True,12500000.0,"Toronto, Ontario",11-50,4,3.0,0.0
11332,312809,25746,152402,product strategist,1/2017-9/2018,2017-01-01,2018-09-01,1.7,,,2023-03-29,False,,,,5,3.0,0.0
11333,238804,25746,127266,chief product officer co founder,12/2017-01/2019,2017-12-01,2019-01-01,1.1,"Toronto, Canada Area",,2022-06-17,True,12500000.0,"Toronto, Ontario",11-50,6,3.0,0.0
11334,312807,25746,152400,chief executive officer co founder,12/2018-3/2021,2018-12-01,2021-03-01,2.2,,Inkblot was acquired by Green Shield Holdings ...,2023-03-29,True,3808640.0,,,7,3.0,0.0


In [None]:
# Calculate speed
# Speed of a row = its level change divided by the duration of the row directly above it.
level_gain = career['diff'].to_numpy(dtype=float)
previous_duration = career['Duration (years)'].shift().to_numpy(dtype=float)

# Zero durations are expected; silence numpy's divide warnings and handle the result below.
with np.errstate(divide='ignore', invalid='ignore'):
    speed = level_gain / previous_duration

# A positive change after a zero-length job gives +inf; count it as speed 0.
# NOTE(review): 0 / 0 (no change after a zero-length job) stays NaN, as before, and a
# NaN speed makes the whole Speed_sum of its group NaN — confirm this is intended.
speed[speed == np.inf] = 0

# Assigning the finished array avoids calling replace(..., inplace=True) on a column
# selection, which does not reliably write back to the dataframe.
career['Speed'] = speed

  df.loc[i, 'Speed'] = df.loc[i, 'diff'] / df.loc[i-1, 'Duration (years)']
  df.loc[i, 'Speed'] = df.loc[i, 'diff'] / df.loc[i-1, 'Duration (years)']


In [None]:
# Add new index
# Running number of the founding experience: it increases by one on every row
# whose 'index' is 0 (the first row of a founding experience).
index_new = (career['index'] == 0).cumsum().tolist()
# Total number of founding experiences seen
counter = index_new[-1] if index_new else 0

career['index_new'] = index_new
career

Unnamed: 0,CareerID,FounderID,CompanyID,JobTitle,DateRange,Start Date,End Date,Duration (years),Location,Description,Created Date,Relevant,founded_company_value,Headquarters Location,Number Of Employees,index,Level,diff,Speed,index_new
0,8,1,53036,vice president,2005–2009,2005-03-01,2009-09-01,4.5,,,2021-02-19,False,,,,0,4.0,,,1
1,7,1,53036,executive director,2007–2009,2007-03-01,2009-09-01,2.5,"Moscow, Russian Federation",,2021-02-19,False,,,,1,3.0,1.0,0.222222,1
2,216950,1,19921,founder,01/2016-Present,2016-01-01,,6.4,Cyprus,We are the largest global \ntravel mobility ma...,2022-05-05,True,22450000.0,European Union (EU),,2,5.0,2.0,0.800000,1
3,80467,3,27310,lead accountant,Nov 2001–Apr 2006,2001-11-01,2006-04-01,4.4,,,2021-03-10,False,,,,0,2.0,0.0,0.000000,2
4,80414,3,39824,manager of finance and accounting cis region,May 2006–Oct 2008,2006-05-01,2008-10-01,2.4,,,2021-03-10,False,,,,1,3.0,1.0,0.227273,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11331,238807,25746,127266,chief technical officer co founder,10/2015-12/2017,2015-10-01,2017-12-01,2.2,"Toronto, Canada Area",,2022-06-17,True,12500000.0,"Toronto, Ontario",11-50,4,3.0,0.0,0.000000,1823
11332,312809,25746,152402,product strategist,1/2017-9/2018,2017-01-01,2018-09-01,1.7,,,2023-03-29,False,,,,5,3.0,0.0,0.000000,1823
11333,238804,25746,127266,chief product officer co founder,12/2017-01/2019,2017-12-01,2019-01-01,1.1,"Toronto, Canada Area",,2022-06-17,True,12500000.0,"Toronto, Ontario",11-50,6,3.0,0.0,0.000000,1823
11334,312807,25746,152400,chief executive officer co founder,12/2018-3/2021,2018-12-01,2021-03-01,2.2,,Inkblot was acquired by Green Shield Holdings ...,2023-03-29,True,3808640.0,,,7,3.0,0.0,0.000000,1823


In [None]:
# For every group (a run of rows that starts where 'index' == 0) store, on the
# group's LAST row only, the sum of 'Speed' and the number of rows after the
# group's first one. All other rows keep NaN.
career['Speed_sum'] = np.nan
career['Count'] = np.nan

step_positions = career['index'].to_numpy()
step_speeds = career['Speed'].to_numpy()
last_row = len(career) - 1

running_speed = 0
n_transitions = 0

# Row 0 is skipped on purpose: it opens the first group and has no predecessor.
for row in range(1, len(career)):
    if step_positions[row] != 0:
        # Still inside the current group: keep accumulating.
        n_transitions += 1
        running_speed += step_speeds[row]
        continue

    # A new group starts here, so flush the finished group's totals onto the
    # row just above and reset the accumulators.
    career.loc[row - 1, 'Speed_sum'] = running_speed
    career.loc[row - 1, 'Count'] = n_transitions
    running_speed = 0
    n_transitions = 0

# The final group is never followed by an 'index' == 0 row; flush it explicitly.
career.loc[last_row, 'Speed_sum'] = running_speed
career.loc[last_row, 'Count'] = n_transitions
career


Unnamed: 0,CareerID,FounderID,CompanyID,JobTitle,DateRange,Start Date,End Date,Duration (years),Location,Description,...,founded_company_value,Headquarters Location,Number Of Employees,index,Level,diff,Speed,index_new,Speed_sum,Count
0,8,1,53036,vice president,2005–2009,2005-03-01,2009-09-01,4.5,,,...,,,,0,4.0,,,1,,
1,7,1,53036,executive director,2007–2009,2007-03-01,2009-09-01,2.5,"Moscow, Russian Federation",,...,,,,1,3.0,1.0,0.222222,1,,
2,216950,1,19921,founder,01/2016-Present,2016-01-01,,6.4,Cyprus,We are the largest global \ntravel mobility ma...,...,22450000.0,European Union (EU),,2,5.0,2.0,0.800000,1,1.022222,2.0
3,80467,3,27310,lead accountant,Nov 2001–Apr 2006,2001-11-01,2006-04-01,4.4,,,...,,,,0,2.0,0.0,0.000000,2,,
4,80414,3,39824,manager of finance and accounting cis region,May 2006–Oct 2008,2006-05-01,2008-10-01,2.4,,,...,,,,1,3.0,1.0,0.227273,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11331,238807,25746,127266,chief technical officer co founder,10/2015-12/2017,2015-10-01,2017-12-01,2.2,"Toronto, Canada Area",,...,12500000.0,"Toronto, Ontario",11-50,4,3.0,0.0,0.000000,1823,,
11332,312809,25746,152402,product strategist,1/2017-9/2018,2017-01-01,2018-09-01,1.7,,,...,,,,5,3.0,0.0,0.000000,1823,,
11333,238804,25746,127266,chief product officer co founder,12/2017-01/2019,2017-12-01,2019-01-01,1.1,"Toronto, Canada Area",,...,12500000.0,"Toronto, Ontario",11-50,6,3.0,0.0,0.000000,1823,,
11334,312807,25746,152400,chief executive officer co founder,12/2018-3/2021,2018-12-01,2021-03-01,2.2,,Inkblot was acquired by Green Shield Holdings ...,...,3808640.0,,,7,3.0,0.0,0.000000,1823,,


In [None]:
# Keep the identifier columns plus the per-group totals, and retain only the
# rows that actually carry a 'Count' (the last row of each group).
growth_columns = ['CareerID', 'FounderID', 'CompanyID', 'index_new', 'Speed_sum', 'Count']
speed_of_growth = career[growth_columns].dropna(subset=['Count'])

In [None]:
# Average speed per group: total speed divided by the number of counted rows.
speed_of_growth['Speed_avg'] = speed_of_growth['Speed_sum'].div(speed_of_growth['Count'])
speed_of_growth

Unnamed: 0,CareerID,FounderID,CompanyID,index_new,Speed_sum,Count,Speed_avg
2,216950,1,19921,1,1.022222,2.0,0.511111
11,79402,3,17160,2,1.366729,8.0,0.170841
19,39,6,19921,3,0.865094,7.0,0.123585
23,107,13,20739,4,0.731707,3.0,0.243902
31,223,28,50729,5,7.515152,7.0,1.073593
...,...,...,...,...,...,...,...
11306,258565,25262,121204,1819,0.000000,2.0,0.000000
11311,238807,25746,127266,1820,1.923077,4.0,0.480769
11318,238804,25746,127266,1821,1.923077,6.0,0.320513
11326,312807,25746,152400,1822,1.923077,7.0,0.274725


In [None]:
# Attach 'Speed_avg' to the feature table by CareerID; the left join keeps
# every existing row of final_result_df even when no speed is available.
speed_avg_by_career = speed_of_growth[['CareerID', 'Speed_avg']]
final_result_df = final_result_df.merge(right=speed_avg_by_career, how='left', on='CareerID')

## Location relevance

In this section we want to investigate the relationship between specific locations and the verticals that are most likely to thrive there.

**Create Location-Vertical Dictionary**


By web scraping and manual mapping, we create a dictionary where each key is a location and each value is the list of verticals associated with that location.

**Remark**: A location can include many verticals, and a vertical can belong to many locations.

In [None]:
import requests
from bs4 import BeautifulSoup

def get_h3_texts_within_h2(city_name, section_id, timeout=10):
    """Collect the sub-section titles of one section of a Wikipedia article.

    Fetches ``https://en.wikipedia.org/wiki/<city_name>``, locates the ``<h2>``
    heading that contains a ``<span id=section_id>``, and gathers the text of
    every ``<h3>`` heading that follows it up to the next ``<h2>``.

    Args:
        city_name: Article title, inserted verbatim into the URL.
        section_id: ``id`` attribute of the section's headline span
            (for example ``"Economy"``).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of sub-section titles when at least one is found; otherwise a
        human-readable string describing why nothing was returned (section
        missing, no ``<h3>`` headings, HTTP error, or any other failure).
        Callers must therefore check the type of the result.
    """
    url = f"https://en.wikipedia.org/wiki/{city_name}"
    try:
        # requests has no default timeout: without one a stalled connection
        # would block the whole notebook indefinitely.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        section_header = soup.find('span', id=section_id)
        if not section_header:
            return f"No section found with ID {section_id} in the {city_name} page."

        h2_tag = section_header.find_parent('h2')
        if h2_tag is None:
            # The id exists but not inside an <h2>; report it explicitly
            # instead of failing later with an opaque AttributeError message.
            return f"Section {section_id} in the {city_name} page is not inside an <h2> heading."

        # NOTE(review): this relies on headings being marked up as
        # <h3><span class="mw-headline">...</span></h3>; confirm this still
        # matches the HTML currently served by Wikipedia.
        h3_texts = []
        for sibling in h2_tag.find_next_siblings():
            if sibling.name == 'h2':
                # Reached the next top-level section: stop collecting.
                break
            if sibling.name == 'h3':
                header_span = sibling.find('span', class_='mw-headline')
                if header_span:
                    h3_texts.append(header_span.text)

        return h3_texts if h3_texts else f"No <h3> tags found within the section {section_id}."

    except requests.HTTPError as e:
        return f"HTTP error occurred: {e}"
    except Exception as e:
        # Deliberately broad: one failing page (including a timeout) must not
        # abort the scraping loop; the message is returned to the caller.
        return f"An error occurred: {e}"


# Article titles to scrape. These are raw location strings and have not been
# cleaned (mixed casing, disambiguation suffixes, countries mixed with cities).
cities_example = [
    'london', 'new york city', 'san francisco', 'berlin',
    'paris', 'singapore', 'washington (state)', 'barcelona',
    'los angeles', 'madrid', 'munich', 'copenhagen',
    'mexico', 'milan', 'mumbai', 'boston',
    'chicago', 'stockholm', 'cambridge', 'vienna',
    'new delhi', 'brussels', 'zurich', 'frankfurt',
    'Sao Paulo', 'amsterdam', 'toronto', 'oslo',
    'hong kong', 'beijing', 'helsinki', 'dubai',
    'moscow', 'istanbul', 'oxford', 'san diego',
    'bengaluru', 'greece', 'shanghai', 'Kuala Lumpur',
    'nairobi', 'united kingdom', 'lisbon', 'philadelphia',
    'Seattle_metropolitan_area', 'montreal', 'sydney', 'gurgaon',
]

section_id = "Economy"

# One entry per location: either the list of sub-section titles or the
# message string returned by get_h3_texts_within_h2 when nothing was found.
results = {
    city: get_h3_texts_within_h2(city, section_id)
    for city in cities_example
}

print(results)


{'london': ['City of London', 'Media and technology', 'Tourism'], 'new york city': ['Wall Street', 'Tech and biotech', 'Real estate', 'Tourism', 'Media and entertainment'], 'san francisco': ['Technology', 'Tourism and conventions'], 'berlin': ['Companies', 'Tourism and conventions', 'Creative industries', 'Media'], 'paris': ['Employment and income', 'Tourism'], 'singapore': 'No <h3> tags found within the section Economy.', 'washington (state)': ['Taxes', 'Agriculture', 'Wine', 'Military', 'Internet access'], 'barcelona': ['General information', 'Trade fair and exhibitions', 'Tourism', 'Manufacturing sector', 'Fashion'], 'los angeles': 'No <h3> tags found within the section Economy.', 'madrid': ['Economic history', 'Present-day economy', 'Media and entertainment'], 'munich': ['Manufacturing', 'Finance', 'Media'], 'copenhagen': ['Tourism'], 'mexico': ['Communications', 'Energy', 'Science and technology', 'Tourism', 'Transportation'], 'milan': ['Tourism'], 'mumbai': 'No <h3> tags found wi

In [None]:
# Mapping from a lower-case location name to the list of verticals associated
# with that location. It is used below via
# `final_result_df['Location'].map(verticals_by_location)`, so keys must match
# the values of the 'Location' column exactly.
# Several list entries bundle more than one vertical into a single
# comma-separated string (e.g. 'Adtech, Marketing tech').
verticals_by_location = {
    'london': ['Fintech', 'Cybersecurity', 'Legal tech', 'Adtech, Marketing tech', 'Media and telecommunications (TMT), Technology'],
    'new york': ['Fintech', 'Mortgage tech, Real estate tech', 'Cybersecurity', 'Adtech, Marketing tech', 'Media and telecommunications (TMT), Technology'],
    'san francisco': ['Cybersecurity', 'Cloudtech and DevOps, Software as a service (SaaS)', 'Artificial intelligence and machine learning (AI/ML)', 'Mobile'],
    'berlin': ['Gaming, eSports', 'Digital health, Lifestyles of Health and Sustainability (LOHAS) and wellness, Wearables and quantified self', 'Cleantech'],
    'paris': ['Beauty, Femtech', 'Ecommerce', 'Digital health, Lifestyles of Health and Sustainability (LOHAS) and wellness, Wearables and quantified self'],
    'singapore': ['Fintech', 'Supply chain technology', 'B2B payments, Mobile commerce', 'Insurtech'],
    'washington': ['Cleantech', 'Agtech', 'Space tech'],
    'barcelona': ['Ecommerce', 'Mobile'],
    'los angeles': ['Media and telecommunications (TMT), Technology', 'Audiotech', 'Ecommerce', 'Adtech, Marketing tech'],
    'madrid': ['Mortgage tech, Real estate tech', 'Digital health, Lifestyles of Health and Sustainability (LOHAS) and wellness, Wearables and quantified self'],
    'munich': ['Autonomous cars', '3D printing, Advanced manufacturing, Construction technology, Industrials, Infrastructure, Manufacturing', 'Robotics and drones'],
    'copenhagen': ['Cleantech', 'Digital health, Lifestyles of Health and Sustainability (LOHAS) and wellness, Wearables and quantified self'],
    'mexico': ['Energy', 'Media and telecommunications (TMT), Technology', 'Infrastructure'],
    'milan': ['Beauty, Femtech', 'Ecommerce'],
    'mumbai': ['Ecommerce', 'Media and telecommunications (TMT), Technology', '3D printing, Advanced manufacturing, Construction technology, Industrials, Infrastructure, Manufacturing'],
    'boston': ['Life sciences', 'Edtech', 'Healthtech'],
    'chicago': ['Fintech', 'Foodtech', 'HRtech'],
    'stockholm': ['Cleantech', 'Edtech', 'Digital health, Lifestyles of Health and Sustainability (LOHAS) and wellness, Wearables and quantified self'],
    'cambridge': ['Life sciences', 'Edtech'],
    'vienna': ['Healthtech'],
    'new delhi': ['Fintech', 'Healthtech', 'Edtech'],
    'zurich': ['Fintech', 'Insurtech'],
    'sao paulo': ['Fintech', 'Agtech', 'Ecommerce'],
    'amsterdam': ['Fintech', 'Ecommerce'],
    'toronto': ['Fintech', 'Media and telecommunications (TMT), Technology', 'Edtech'],
    'oslo': ['Oil and gas', 'Energy', 'Space tech'],
    'hong kong': ['Fintech', 'Mortgage tech, Real estate tech', 'Supply chain technology'],
    'beijing': ['Big Data', 'Artificial intelligence and machine learning (AI/ML)', 'Ecommerce'],
    'dubai': ['Mortgage tech, Real estate tech', 'Space tech', 'Ecommerce'],
    'istanbul': ['Foodtech', 'Agtech'],
    'oxford': ['Edtech', 'Life sciences'],
    'san diego': ['Healthtech', 'Life sciences', 'Cybersecurity'],
    'bengaluru': ['Cloudtech and DevOps, Software as a service (SaaS)', 'Space tech', 'Life sciences'],
    'greece': ['Agtech'],
    'kuala lumpur': ['Ecommerce', 'Edtech'],
    'nairobi': ['Agtech', 'Fintech', 'Edtech'],
    # NOTE(review): 'Software as a Service (SaaS)' is capitalised differently
    # here than in the 'san francisco' and 'bengaluru' entries ('service');
    # confirm which spelling the 'Vertical' column uses.
    'seattle': ['Cloudtech and DevOps, Software as a Service (SaaS)', 'Aerospace', 'Cleantech'],
    # NOTE(review): this entry lists "Real estate tech" and "Cloudtech and DevOps"
    # on their own, whereas other entries use the bundled strings
    # 'Mortgage tech, Real estate tech' and 'Cloudtech and DevOps, Software as a service (SaaS)';
    # confirm these names exist in the vertical taxonomy.
    'gurgaon': ["Fintech", "Real estate tech", "Automotive tech", "IT and software development", "Cloudtech and DevOps", "Artificial intelligence and machine learning (AI/ML)"],
    'united states': ['Cleantech', 'Cybersecurity', 'Artificial intelligence and machine learning (AI/ML)',  ],
}


**Get the Possible Predicted Verticals**

In [None]:
# Look up each row's location in the location -> verticals mapping; locations
# that are not keys of the mapping end up as missing values.
verticals_for_location = final_result_df['Location'].map(verticals_by_location)
final_result_df['predicted verticals'] = verticals_for_location

In [None]:
# Ensure the columns are strings
final_result_df['Vertical'] = final_result_df['Vertical'].astype(str)
final_result_df['predicted verticals'] = final_result_df['predicted verticals'].astype(str)

# Check if 'Location' is in 'predicted location' and assign 1 if true, 0 if false or NaN values
final_result_df['relevance'] = final_result_df.apply(lambda row: 1 if pd.notna(row['Vertical']) and row['Vertical'] != 'nan' and \
                             pd.notna(row['predicted verticals']) and row['Vertical'] in row['predicted verticals'] \
                             else 0, axis=1)

In [None]:
# Calculate the counts of 1s and 0s in the 'relevance' column
relevance_counts = final_result_df['relevance'].value_counts()

# Print the results
print(relevance_counts)

relevance
0    1748
1      78
Name: count, dtype: int64


## Consider the founding experience in the team size

In [None]:
# Group by 'CompanyID' and count the number of unique 'FounderID' for each company
founder_count_per_company = final_result_df.groupby('CompanyID')['FounderID'].count()

# Count the number of companies for each unique founder count and sort by the founder count
founder_distribution = founder_count_per_company.value_counts().sort_index()

print("Founder distribution per company:")
print(founder_distribution)

Founder distribution per company:
FounderID
1    1548
2      98
3      12
4       1
5       1
6       2
Name: count, dtype: int64


By analyzing the number of unique founders per company, we observe that most companies have only one founder. Therefore, we do not consider predicting entrepreneurial success based on team size.

## Classify the company to successful categories

Here we first move the 'Founded Company Value' to the last column. Then, based on the 'Founded Company Value', we classify the companies into binary and multi-level categories of success.

In [None]:
# Get all column names
columns = list(final_result_df.columns)

# Remove the 'Founded Company Value' column name
columns.remove('Founded Company Value')

# Append 'Founded Company Value' to the end of the column names list
columns.append('Founded Company Value')

# Reindex the DataFrame with the new column order
final_result_df = final_result_df.reindex(columns=columns)
final_result_df.head(5)

Unnamed: 0,FounderID,CompanyID,CareerID,Number of professional experience,Number of companies that worked for,Total Work Years,Industry Experience,Management experience,Entrepreneurial Experience,Technical experience,Location,Vertical,Speed_avg,relevance,Founded Company Value
0,1,19921,216950,2,1,7.0,0.0,7.0,0.0,2.5,other,"Carsharing,Micro-mobility,Mobility tech,Ridesh...",0.511111,0,22450000.0
1,3,17160,79402,8,8,31.1,0.0,19.6,9.9,0.0,other,Cryptocurrency and blockchain,0.170841,0,11852455.0
2,6,19921,39,7,6,18.9,0.0,10.3,0.0,6.7,other,"Carsharing,Micro-mobility,Mobility tech,Ridesh...",0.123585,0,22450000.0
3,13,20739,107,3,3,7.1,0.0,0.0,0.0,4.1,london,Foodtech,0.243902,0,350410.0
4,28,50729,223,7,7,7.2,0.0,5.5,0.0,0.0,other,Cryptocurrency and blockchain,1.073593,0,30000000.0


In [None]:
def get_success_binary_level(value):
    """Return 'successful' when value is strictly above 1,000,000,000, else 'Not successful'.

    Anything that does not compare greater than the threshold (including
    exactly 1,000,000,000 and NaN) is labelled 'Not successful'.
    """
    return 'successful' if value > 1000000000 else 'Not successful'

# Label every company from its 'Founded Company Value' using the binary rule.
company_values = final_result_df['Founded Company Value']
final_result_df['binary success'] = company_values.map(get_success_binary_level)

final_result_df

Unnamed: 0,FounderID,CompanyID,CareerID,Number of professional experience,Number of companies that worked for,Total Work Years,Industry Experience,Management experience,Entrepreneurial Experience,Technical experience,Location,Vertical,Speed_avg,relevance,Founded Company Value,binary success
0,1,19921,216950,2,1,7.0,0.0,7.0,0.0,2.5,other,"Carsharing,Micro-mobility,Mobility tech,Ridesh...",0.511111,0,22450000.0,Not successful
1,3,17160,79402,8,8,31.1,0.0,19.6,9.9,0.0,other,Cryptocurrency and blockchain,0.170841,0,11852455.0,Not successful
2,6,19921,39,7,6,18.9,0.0,10.3,0.0,6.7,other,"Carsharing,Micro-mobility,Mobility tech,Ridesh...",0.123585,0,22450000.0,Not successful
3,13,20739,107,3,3,7.1,0.0,0.0,0.0,4.1,london,Foodtech,0.243902,0,350410.0,Not successful
4,28,50729,223,7,7,7.2,0.0,5.5,0.0,0.0,other,Cryptocurrency and blockchain,1.073593,0,30000000.0,Not successful
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1818,25176,161268,333056,8,6,20.7,11.6,2.8,12.7,0.0,other,Artificial intelligence and machine learning (...,0.644841,0,875000.0,Not successful
1820,25262,121205,222394,1,1,6.5,0.0,6.5,0.0,0.0,other,Restaurant tech,0.000000,0,750000.0,Not successful
1821,25262,121204,258565,2,2,8.8,0.0,8.8,2.3,0.0,san francisco,"Augmented reality (AR),Virtual reality (VR)",0.000000,0,404682445.0,Not successful
1822,25746,127266,238807,4,4,14.6,0.0,2.6,9.4,2.6,toronto,HRtech,0.480769,0,12500000.0,Not successful


In [None]:
def get_success_multi_level(value):
    """Map a company value to one of five success labels.

    The value is compared against descending thresholds, and the first one it
    strictly exceeds decides the label:
        > 1,000,000,000 -> 'Highly successful'
        >   100,000,000 -> 'Successful'
        >    50,000,000 -> 'Moderately Successful'
        >    10,000,000 -> 'Promising'
    Anything else (including NaN) is 'Not successful'.
    """
    tiers = (
        (1000000000, 'Highly successful'),
        (100000000, 'Successful'),
        (50000000, 'Moderately Successful'),
        (10000000, 'Promising'),
    )
    for lower_bound, label in tiers:
        if value > lower_bound:
            return label
    return 'Not successful'

# Label every company from its 'Founded Company Value' using the five-tier rule.
company_values = final_result_df['Founded Company Value']
final_result_df['Level of success'] = company_values.map(get_success_multi_level)

final_result_df

Unnamed: 0,FounderID,CompanyID,CareerID,Number of professional experience,Number of companies that worked for,Total Work Years,Industry Experience,Management experience,Entrepreneurial Experience,Technical experience,Location,Vertical,Speed_avg,relevance,Founded Company Value,binary success,Level of success
0,1,19921,216950,2,1,7.0,0.0,7.0,0.0,2.5,other,"Carsharing,Micro-mobility,Mobility tech,Ridesh...",0.511111,0,22450000.0,Not successful,Promising
1,3,17160,79402,8,8,31.1,0.0,19.6,9.9,0.0,other,Cryptocurrency and blockchain,0.170841,0,11852455.0,Not successful,Promising
2,6,19921,39,7,6,18.9,0.0,10.3,0.0,6.7,other,"Carsharing,Micro-mobility,Mobility tech,Ridesh...",0.123585,0,22450000.0,Not successful,Promising
3,13,20739,107,3,3,7.1,0.0,0.0,0.0,4.1,london,Foodtech,0.243902,0,350410.0,Not successful,Not successful
4,28,50729,223,7,7,7.2,0.0,5.5,0.0,0.0,other,Cryptocurrency and blockchain,1.073593,0,30000000.0,Not successful,Promising
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1818,25176,161268,333056,8,6,20.7,11.6,2.8,12.7,0.0,other,Artificial intelligence and machine learning (...,0.644841,0,875000.0,Not successful,Not successful
1820,25262,121205,222394,1,1,6.5,0.0,6.5,0.0,0.0,other,Restaurant tech,0.000000,0,750000.0,Not successful,Not successful
1821,25262,121204,258565,2,2,8.8,0.0,8.8,2.3,0.0,san francisco,"Augmented reality (AR),Virtual reality (VR)",0.000000,0,404682445.0,Not successful,Successful
1822,25746,127266,238807,4,4,14.6,0.0,2.6,9.4,2.6,toronto,HRtech,0.480769,0,12500000.0,Not successful,Promising


In [None]:
# Display the column labels of the final dataset.
final_result_df.columns

Index(['FounderID', 'CompanyID', 'CareerID',
       'Number of professional experience',
       'Number of companies that worked for', 'Total Work Years',
       'Industry Experience', 'Management experience',
       'Entrepreneurial Experience', 'Technical experience', 'Location',
       'Vertical', 'Speed_avg', 'relevance', 'Founded Company Value',
       'binary success', 'Level of success'],
      dtype='object')

Here, we have the final dataset ready for our prediction model. The first three columns represent the IDs related to the founding experience. Columns four to fourteen are the explanatory variables, and the last three columns are the response variables.