# Task for Today  

***

## Startup Funding Prediction  

Given *data about startups in India*, let's try to predict the **funding** provided to a given startup.

We will use a TensorFlow/Keras neural network within a scikit-learn pipeline to make our predictions.

# Getting Started

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import tensorflow as tf

from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv('startup_funding.csv')

In [3]:
data

Unnamed: 0,Sr No,Date dd/mm/yyyy,Startup Name,Industry Vertical,SubVertical,City Location,Investors Name,InvestmentnType,Amount in USD,Remarks
0,1,09/01/2020,BYJU’S,E-Tech,E-learning,Bengaluru,Tiger Global Management,Private Equity Round,200000000,
1,2,13/01/2020,Shuttl,Transportation,App based shuttle service,Gurgaon,Susquehanna Growth Equity,Series C,8048394,
2,3,09/01/2020,Mamaearth,E-commerce,Retailer of baby and toddler products,Bengaluru,Sequoia Capital India,Series B,18358860,
3,4,02/01/2020,https://www.wealthbucket.in/,FinTech,Online Investment,New Delhi,Vinod Khatumal,Pre-series A,3000000,
4,5,02/01/2020,Fashor,Fashion and Apparel,Embroiled Clothes For Women,Mumbai,Sprout Venture Partners,Seed Round,1800000,
...,...,...,...,...,...,...,...,...,...,...
3039,3040,29/01/2015,Printvenue,,,,Asia Pacific Internet Group,Private Equity,4500000,
3040,3041,29/01/2015,Graphene,,,,KARSEMVEN Fund,Private Equity,825000,Govt backed VC Fund
3041,3042,30/01/2015,Mad Street Den,,,,"Exfinity Fund, GrowX Ventures.",Private Equity,1500000,
3042,3043,30/01/2015,Simplotel,,,,MakeMyTrip,Private Equity,,"Strategic Funding, Minority stake"


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044 entries, 0 to 3043
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Sr No              3044 non-null   int64 
 1   Date dd/mm/yyyy    3044 non-null   object
 2   Startup Name       3044 non-null   object
 3   Industry Vertical  2873 non-null   object
 4   SubVertical        2108 non-null   object
 5   City  Location     2864 non-null   object
 6   Investors Name     3020 non-null   object
 7   InvestmentnType    3040 non-null   object
 8   Amount in USD      2084 non-null   object
 9   Remarks            419 non-null    object
dtypes: int64(1), object(9)
memory usage: 237.9+ KB


# Preprocessing

In [5]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Drop ID and high-cardinality columns
    df = df.drop(['Sr No', 'Startup Name', 'SubVertical', 'Investors Name'], axis=1)
    
    # Clean \\xc2\\xa0 examples
    df = df.applymap(lambda x: x.replace(r'\\xc2\\xa0', '') if type(x) == str else x)
    
    # Clean target column
    df['Amount in USD'] = df['Amount in USD'].apply(lambda x: x.replace(',', '') if str(x) != 'nan' else x)
    df['Amount in USD'] = df['Amount in USD'].replace({
        'undisclosed': np.NaN,
        'unknown': np.NaN,
        'Undisclosed': np.NaN,
        'N/A': np.NaN,
        '14342000+': '14342000'
    })
    
    # Drop missing target rows
    missing_target_rows = df[df['Amount in USD'].isna()].index
    df = df.drop(missing_target_rows, axis=0).reset_index(drop=True)
    
    # Drop columns with more than 25% missing values
    df = df.drop('Remarks', axis=1)
    
    # Fill categorical missing values with most frequent occurence
    for column in ['Industry Vertical', 'City  Location', 'InvestmentnType']:
        df[column] = df[column].fillna(df[column].mode()[0])
    
    # Clean date column
    df['Date dd/mm/yyyy'] = df['Date dd/mm/yyyy'].replace({
        '05/072018': '05/07/2018',
        '01/07/015': '01/07/2015',
        '22/01//2015': '22/01/2015'
    })
    
    # Extract date features
    df['Date dd/mm/yyyy'] = pd.to_datetime(df['Date dd/mm/yyyy'])
    df['Year'] = df['Date dd/mm/yyyy'].apply(lambda x: x.year)
    df['Month'] = df['Date dd/mm/yyyy'].apply(lambda x: x.month)
    df['Day'] = df['Date dd/mm/yyyy'].apply(lambda x: x.day)
    df = df.drop('Date dd/mm/yyyy', axis=1)
    
    # Convert target column to float
    df['Amount in USD'] = df['Amount in USD'].astype(np.float)
    
    # Split df into X and y
    y = df['Amount in USD']
    X = df.drop('Amount in USD', axis=1)
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [7]:
X_train

Unnamed: 0,Industry Vertical,City Location,InvestmentnType,Year,Month,Day
924,eCommerce,Noida,Private Equity,2016,4,10
1108,Consumer Internet,Bangalore,Seed Funding,2016,7,21
1059,Consumer Internet,Mumbai,Private Equity,2016,8,29
160,Consumer Internet,Bengaluru,Seed/ Angel Funding,2018,8,8
1696,on-demand delivery service,Gurgaon,Seed Funding,2015,8,17
...,...,...,...,...,...,...
960,eCommerce,Ahmedabad,Private Equity,2016,10,26
905,Consumer Internet,Mumbai,Private Equity,2016,11,24
1096,eCommerce,New Delhi,Seed Funding,2016,7,15
235,Finance,Chennai,Seed / Angel Funding,2018,2,5


In [8]:
y_train

924     4200000.0
1108     595000.0
1059    3000000.0
160     4000000.0
1696     310000.0
          ...    
960     1000000.0
905     4000000.0
1096     250000.0
235      450000.0
1061    1000000.0
Name: Amount in USD, Length: 1451, dtype: float64

# Building Pipeline

In [9]:
def build_model():
    inputs = tf.keras.Input(shape=(532,))
    x = tf.keras.layers.Dense(128, activation='relu')(inputs)
    x = tf.keras.layers.Dense(128, activation='relu')(x)
    outputs = tf.keras.layers.Dense(1, activation='linear')(x)
    
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(
        optimizer='adam',
        loss='mse'
    )
    
    return model

In [10]:
nominal_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('nominal', nominal_transformer, ['Industry Vertical', 'City  Location', 'InvestmentnType'])
], remainder='passthrough')

regressor = tf.keras.wrappers.scikit_learn.KerasRegressor(build_model)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('regressor', regressor)
])

# Training

In [11]:
model.fit(
    X_train,
    y_train,
    regressor__validation_split=0.2,
    regressor__batch_size=32,
    regressor__epochs=100,
    regressor__callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('nominal',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['Industry Vertical',
                                                   'City  Location',
                                                   'InvestmentnType'])])),
                ('scaler', StandardScaler()),
                ('regressor',
                 <tensorflow.python.keras.wrappers.scikit_learn.KerasRegressor object at 0x7866d8365790>)])

# Results

In [12]:
y_pred = model.predict(X_test)

rmse = np.sqrt(np.mean((y_test - y_pred)**2))
print("     Test RMSE: {:.2f}".format(rmse))

r2 = r2_score(y_test, y_pred)
print("Test R^2 Score: {:.5f}".format(r2))

     Test RMSE: 56455266.04
Test R^2 Score: -0.02415


# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/K5NqUMZomYE