In [139]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.impute import SimpleImputer
from scipy import sparse
import statsmodels.api as sm
from sklearn.pipeline import make_pipeline, Pipeline

In [4]:
df = pd.read_csv('buyouts_funds/buyouts_funds_csv/buyouts_funds_final.csv')

In [5]:
df.head()

Unnamed: 0,fund_id,fund_name,fm_id,strategy,sector,region,country,target_fund_size_amt,target_fund_size_currency,fund_size_amt,fund_currency,status,stage,vintage_year,open_date
0,34866,Ascent Venture Partners VII,6381,Venture Capital / Growth Equity,TMT,North America,United States,150000000.0,USD,,,Launched,,2020,2020-0
1,43446,Volunteer Park Capital,19143,Mezzanine / Debt,Diversified,North America,,200000000.0,USD,30000000.0,USD,Currently Investing,1st Close,2019,2019-8
2,44723,Flashstarts Blockchain Fund,14468,Venture Capital / Growth Equity,TMT,North America,United States,6000000.0,USD,,,Launched,,2019,2019-1
3,44727,Harmony Partners IV,10317,Venture Capital / Growth Equity,TMT,North America,United States,125000000.0,USD,,,Launched,,2019,2019-1
4,44738,Light Street Beacon Principals I,19412,Venture Capital / Growth Equity,TMT,North America,United States,20000000.0,USD,,,Launched,,2019,2019-1


#### Using all features in buyouts_funds_final to predict target_fund_size_amt

In [120]:
allfeatures = ['fund_id', 'fund_name', 'fm_id', 'strategy', 'sector',
              'region', 'country', 'target_fund_size_amt',
              'target_fund_size_currency', 'fund_size_amt', 'fund_currency',
              'status', 'stage', 'vintage_year', 'open_date']

In [121]:
for i in allfeatures:
    df[i] = df[i].fillna(0)

In [122]:
df['fund_name'] = pd.to_numeric(df['fund_name'], errors='coerce')
df['strategy'] = pd.to_numeric(df['strategy'], errors='coerce')
df['sector'] = pd.to_numeric(df['sector'], errors='coerce')
df['region'] = pd.to_numeric(df['region'], errors='coerce')
df['country'] = pd.to_numeric(df['country'], errors='coerce')
df['target_fund_size_currency'] = pd.to_numeric(df['target_fund_size_currency'], errors='coerce')
df['fund_currency'] = pd.to_numeric(df['fund_currency'], errors='coerce')
df['status'] = pd.to_numeric(df['status'], errors='coerce')
df['stage'] = pd.to_numeric(df['stage'], errors='coerce')
df['open_date'] = pd.to_numeric(df['open_date'], errors='coerce')

In [123]:
featurenames = ['fund_id', 'fund_name', 'fm_id', 'strategy', 'sector',
               'region', 'country', 'target_fund_size_currency',
                'fund_size_amt', 'fund_currency', 'status', 'stage',
                'vintage_year', 'open_date']

In [124]:
col_cat = ['fund_name', 'strategy', 'sector', 'region', 'country',
          'target_fund_size_currency', 'fund_currency', 'status',
          'stage', 'open_date']

col_num = ['fund_id', 'fm_id', 'fund_size_amt', 'vintage_year']

In [125]:
X = df[featurenames]
y = df.target_fund_size_amt

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,
                                                   random_state=0)

In [127]:
X_train_cat = X_train[col_cat]
X_train_num = X_train[col_num]

X_test_cat = X_test[col_cat]
X_test_num = X_test[col_num]

In [128]:
ohe = make_pipeline(SimpleImputer(strategy='constant'), OneHotEncoder())
X_train_cat_enc = ohe.fit_transform(X_train_cat)
X_test_cat_enc = ohe.fit_transform(X_test_cat)

scaler_num = make_pipeline(StandardScaler(), SimpleImputer())
X_train_num_scaled = scaler_num.fit_transform(X_train_num)
X_test_num_scaled = scaler_num.fit_transform(X_test_num)

In [129]:
X_train_scaled = sparse.hstack((X_train_cat_enc,
                                sparse.csr_matrix(X_train_num_scaled)))
X_test_scaled = sparse.hstack((X_test_cat_enc,
                               sparse.csr_matrix(X_test_num_scaled)))

In [130]:
pipe_lasso = make_pipeline(Lasso(alpha=100, max_iter=1000))

In [136]:
lasso = pipe_lasso.fit(X_train_scaled, y_train)

In [137]:
print("Test set score: {:.2f}".format(lasso.score(X_test_scaled, y_test)))
print("Number of features used:", np.sum(lasso['lasso'].coef_ != 0))

Test set score: 0.22
Number of features used: 4


In [150]:
param_grid={'lasso__alpha': np.logspace(1,7,num=13),
           'lasso__max_iter': [10000]}

In [151]:
grid = GridSearchCV(pipe_lasso, param_grid=param_grid, cv=10, n_jobs=-1)

In [152]:
grid.fit(X_train_scaled, y_train)

GridSearchCV(cv=10, estimator=Pipeline(steps=[('lasso', Lasso(alpha=100))]),
             n_jobs=-1,
             param_grid={'lasso__alpha': array([1.00000000e+01, 3.16227766e+01, 1.00000000e+02, 3.16227766e+02,
       1.00000000e+03, 3.16227766e+03, 1.00000000e+04, 3.16227766e+04,
       1.00000000e+05, 3.16227766e+05, 1.00000000e+06, 3.16227766e+06,
       1.00000000e+07]),
                         'lasso__max_iter': [10000]})

In [153]:
print("Score for the best model: {:.2f}".format(grid.score(X_test_scaled,
                                                           y_test)))

Score for the best model: 0.22


In [154]:
grid.best_params_

{'lasso__alpha': 31622.776601683792, 'lasso__max_iter': 10000}

In [155]:
grid.best_estimator_

Pipeline(steps=[('lasso', Lasso(alpha=31622.776601683792, max_iter=10000))])

In [157]:
X_train.columns[(lasso['lasso'].coef_) != 0]

Index(['status', 'stage', 'vintage_year', 'open_date'], dtype='object')

#### Using all features in buyouts_funds_final to predict fund_size_amt

In [158]:
featurenames2 = ['fund_id', 'fund_name', 'fm_id', 'strategy', 'sector',
               'region', 'country', 'target_fund_size_amt',
                 'target_fund_size_currency', 'fund_currency', 'status',
                 'stage', 'vintage_year', 'open_date']

In [159]:
col_cat2 = ['fund_name', 'strategy', 'sector', 'region', 'country',
          'target_fund_size_currency', 'fund_currency', 'status',
          'stage', 'open_date']

col_num2 = ['fund_id', 'fm_id', 'target_fund_size_amt', 'vintage_year']

In [160]:
X2 = df[featurenames2]
y2 = df.fund_size_amt

In [161]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.1,
                                                   random_state=0)

In [163]:
X_train_cat2 = X_train2[col_cat2]
X_train_num2 = X_train2[col_num2]

X_test_cat2 = X_test2[col_cat2]
X_test_num2 = X_test2[col_num2]

In [164]:
ohe2 = make_pipeline(SimpleImputer(strategy='constant'), OneHotEncoder())
X_train_cat_enc2 = ohe2.fit_transform(X_train_cat2)
X_test_cat_enc2 = ohe2.fit_transform(X_test_cat2)

scaler_num2 = make_pipeline(StandardScaler(), SimpleImputer())
X_train_num_scaled2 = scaler_num2.fit_transform(X_train_num2)
X_test_num_scaled2 = scaler_num2.fit_transform(X_test_num2)

In [165]:
X_train_scaled2 = sparse.hstack((X_train_cat_enc2,
                                sparse.csr_matrix(X_train_num_scaled2)))
X_test_scaled2 = sparse.hstack((X_test_cat_enc2,
                               sparse.csr_matrix(X_test_num_scaled2)))

In [166]:
pipe_lasso2 = make_pipeline(Lasso(alpha=100, max_iter=1000))

In [167]:
lasso2 = pipe_lasso2.fit(X_train_scaled2, y_train2)

In [173]:
print("Test set score: {:.2f}".format(lasso2.score(X_test_scaled2, y_test2)))
print("Number of features used:", np.sum(lasso2['lasso'].coef_ != 0))

Test set score: 0.16
Number of features used: 4


In [174]:
param_grid2={'lasso__alpha': np.logspace(1,7,num=13),
           'lasso__max_iter': [10000]}

In [175]:
grid2 = GridSearchCV(pipe_lasso2, param_grid=param_grid2, cv=10, n_jobs=-1)

In [176]:
grid2.fit(X_train_scaled2, y_train2)

GridSearchCV(cv=10, estimator=Pipeline(steps=[('lasso', Lasso(alpha=100))]),
             n_jobs=-1,
             param_grid={'lasso__alpha': array([1.00000000e+01, 3.16227766e+01, 1.00000000e+02, 3.16227766e+02,
       1.00000000e+03, 3.16227766e+03, 1.00000000e+04, 3.16227766e+04,
       1.00000000e+05, 3.16227766e+05, 1.00000000e+06, 3.16227766e+06,
       1.00000000e+07]),
                         'lasso__max_iter': [10000]})

In [177]:
print("Score for the best model: {:.2f}".format(grid2.score(X_test_scaled2,
                                                           y_test2)))

Score for the best model: 0.17


In [178]:
grid2.best_params_

{'lasso__alpha': 1000000.0, 'lasso__max_iter': 10000}

In [179]:
grid2.best_estimator_

Pipeline(steps=[('lasso', Lasso(alpha=1000000.0, max_iter=10000))])

In [180]:
X_train2.columns[(lasso2['lasso'].coef_) != 0]

Index(['status', 'stage', 'vintage_year', 'open_date'], dtype='object')

#### Using all features in buyouts_fm_fund_history to predict fund_size_amt

In [271]:
df2 = pd.read_csv('buyouts_funds/buyouts_funds_csv/buyouts_fm_fund_history.csv')

In [272]:
df2.head()

Unnamed: 0,fm_fund_id,fm_id,fund_id,vintage_year,region,sector,strategy,fund_size_amt,fund_currency,target_fund_size_amt,target_fund_size_currency
0,6121,6381,6420,1995,North America,TMT,Venture Capital / Growth Equity,80000000.0,USD,0.0,USD
1,9222,6381,8305,2004,North America,TMT,Venture Capital / Growth Equity,140000000.0,USD,0.0,USD
2,16420,6381,9222,2008,North America,TMT,Venture Capital / Growth Equity,81750000.0,USD,81750000.0,USD
3,29148,6381,12856,1999,North America,TMT,Venture Capital / Growth Equity,115000000.0,USD,0.0,USD
4,5732,6381,34846,2015,North America,TMT,Venture Capital / Growth Equity,71050000.0,USD,150000000.0,USD


In [276]:
allfeatures2 = ['fm_fund_id', 'fm_id', 'fund_id', 'vintage_year', 'region', 'sector',
               'strategy', 'fund_size_amt', 'fund_currency', 'target_fund_size_amt',
               'target_fund_size_currency']

In [277]:
for i in allfeatures2:
    df2[i] = df2[i].fillna(0)

In [280]:
df2['region'] = pd.to_numeric(df2['region'], errors='coerce')
df2['sector'] = pd.to_numeric(df2['sector'], errors='coerce')
df2['strategy'] = pd.to_numeric(df2['strategy'], errors='coerce')
df2['fund_currency'] = pd.to_numeric(df2['fund_currency'], errors='coerce')
df2['target_fund_size_currency'] = pd.to_numeric(df2['target_fund_size_currency'], errors='coerce')

In [281]:
featurenames3 = ['fm_fund_id', 'fm_id', 'fund_id', 'vintage_year', 'region',
                'sector', 'strategy', 'fund_currency', 'target_fund_size_amt',
                'target_fund_size_currency']

In [282]:
col_cat3 = ['region', 'sector', 'strategy', 'fund_currency', 'target_fund_size_currency']

col_num3 = ['fm_fund_id', 'fm_id', 'fund_id', 'vintage_year', 'target_fund_size_amt']

In [283]:
X3 = df2[featurenames3]
y3 = df2.fund_size_amt

In [284]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.1,
                                                   random_state=0)

In [285]:
X_train_cat3 = X_train3[col_cat3]
X_train_num3 = X_train3[col_num3]

X_test_cat3 = X_test3[col_cat3]
X_test_num3 = X_test3[col_num3]

In [286]:
ohe3 = make_pipeline(SimpleImputer(strategy='constant'), OneHotEncoder())
X_train_cat_enc3 = ohe3.fit_transform(X_train_cat3)
X_test_cat_enc3 = ohe3.fit_transform(X_test_cat3)

scaler_num3 = make_pipeline(StandardScaler(), SimpleImputer())
X_train_num_scaled3 = scaler_num3.fit_transform(X_train_num3)
X_test_num_scaled3 = scaler_num3.fit_transform(X_test_num3)

In [287]:
X_train_scaled3 = sparse.hstack((X_train_cat_enc3,
                                sparse.csr_matrix(X_train_num_scaled3)))
X_test_scaled3 = sparse.hstack((X_test_cat_enc3,
                               sparse.csr_matrix(X_test_num_scaled3)))

In [288]:
pipe_lasso3 = make_pipeline(Lasso(alpha=100, max_iter=1000))

In [289]:
lasso3 = pipe_lasso3.fit(X_train_scaled3, y_train3)

In [290]:
print("Test set score: {:.2f}".format(lasso3.score(X_test_scaled3, y_test3)))
print("Number of features used:", np.sum(lasso3['lasso'].coef_ != 0))

Test set score: -1.51
Number of features used: 5


In [291]:
param_grid3={'lasso__alpha': np.logspace(1,7,num=13),
           'lasso__max_iter': [10000]}

In [292]:
grid3 = GridSearchCV(pipe_lasso3, param_grid=param_grid3, cv=10, n_jobs=-1)

In [293]:
grid3.fit(X_train_scaled3, y_train3)

GridSearchCV(cv=10, estimator=Pipeline(steps=[('lasso', Lasso(alpha=100))]),
             n_jobs=-1,
             param_grid={'lasso__alpha': array([1.00000000e+01, 3.16227766e+01, 1.00000000e+02, 3.16227766e+02,
       1.00000000e+03, 3.16227766e+03, 1.00000000e+04, 3.16227766e+04,
       1.00000000e+05, 3.16227766e+05, 1.00000000e+06, 3.16227766e+06,
       1.00000000e+07]),
                         'lasso__max_iter': [10000]})

In [294]:
grid3.best_params_

{'lasso__alpha': 10000000.0, 'lasso__max_iter': 10000}

In [295]:
grid3.best_estimator_

Pipeline(steps=[('lasso', Lasso(alpha=10000000.0, max_iter=10000))])

In [296]:
X_train3.columns[(lasso3['lasso'].coef_) != 0]

Index(['sector', 'strategy', 'fund_currency', 'target_fund_size_amt',
       'target_fund_size_currency'],
      dtype='object')

#### Using all features in buyouts_fm_fund_history to predict target_fund_size_amt

In [297]:
featurenames4 = ['fm_fund_id', 'fm_id', 'fund_id', 'vintage_year', 'region',
                'sector', 'strategy', 'fund_size_amt', 'fund_currency',
                'target_fund_size_currency']

In [298]:
col_cat4 = ['region', 'sector', 'strategy', 'fund_currency', 'target_fund_size_currency']

col_num4 = ['fm_fund_id', 'fm_id', 'fund_id', 'vintage_year', 'fund_size_amt']

In [299]:
X4 = df2[featurenames4]
y4 = df2.target_fund_size_amt

In [300]:
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, test_size=0.1,
                                                   random_state=0)

In [301]:
X_train_cat4 = X_train4[col_cat4]
X_train_num4 = X_train4[col_num4]

X_test_cat4 = X_test4[col_cat4]
X_test_num4 = X_test4[col_num4]

In [302]:
ohe4 = make_pipeline(SimpleImputer(strategy='constant'), OneHotEncoder())
X_train_cat_enc4 = ohe4.fit_transform(X_train_cat4)
X_test_cat_enc4 = ohe4.fit_transform(X_test_cat4)

scaler_num4 = make_pipeline(StandardScaler(), SimpleImputer())
X_train_num_scaled4 = scaler_num4.fit_transform(X_train_num4)
X_test_num_scaled4 = scaler_num4.fit_transform(X_test_num4)

In [303]:
X_train_scaled4 = sparse.hstack((X_train_cat_enc4,
                                sparse.csr_matrix(X_train_num_scaled4)))
X_test_scaled4 = sparse.hstack((X_test_cat_enc4,
                               sparse.csr_matrix(X_test_num_scaled4)))

In [304]:
pipe_lasso4 = make_pipeline(Lasso(alpha=100, max_iter=1000))

In [305]:
lasso4 = pipe_lasso4.fit(X_train_scaled4, y_train4)

In [306]:
print("Test set score: {:.2f}".format(lasso4.score(X_test_scaled4, y_test4)))
print("Number of features used:", np.sum(lasso4['lasso'].coef_ != 0))

Test set score: -1.80
Number of features used: 5


In [307]:
param_grid4={'lasso__alpha': np.logspace(1,7,num=13),
           'lasso__max_iter': [10000]}

In [308]:
grid4 = GridSearchCV(pipe_lasso4, param_grid=param_grid4, cv=10, n_jobs=-1)

In [309]:
grid4.fit(X_train_scaled4, y_train4)

GridSearchCV(cv=10, estimator=Pipeline(steps=[('lasso', Lasso(alpha=100))]),
             n_jobs=-1,
             param_grid={'lasso__alpha': array([1.00000000e+01, 3.16227766e+01, 1.00000000e+02, 3.16227766e+02,
       1.00000000e+03, 3.16227766e+03, 1.00000000e+04, 3.16227766e+04,
       1.00000000e+05, 3.16227766e+05, 1.00000000e+06, 3.16227766e+06,
       1.00000000e+07]),
                         'lasso__max_iter': [10000]})

In [310]:
grid4.best_params_

{'lasso__alpha': 10000000.0, 'lasso__max_iter': 10000}

In [311]:
grid4.best_estimator_

Pipeline(steps=[('lasso', Lasso(alpha=10000000.0, max_iter=10000))])

In [312]:
X_train4.columns[(lasso4['lasso'].coef_) != 0]

Index(['sector', 'strategy', 'fund_size_amt', 'fund_currency',
       'target_fund_size_currency'],
      dtype='object')

#### Using all features in buyouts_historical_funds to predict fund_size_amt 

In [319]:
df3 = pd.read_csv('buyouts_funds/buyouts_funds_csv/buyouts_historical_funds.csv')

In [320]:
df3.head()

Unnamed: 0,fund_id,fund_name,manager_id,manager,close_year,strategy,fund_size_amt,fund_currency
0,23717,Warburg Pincus Associates,5591.0,Warburg Pincus,1980,Venture Capital / Growth Equity,101000000.0,USD
1,14874,Kleiner Perkins Caufield & Byers II,6562.0,Kleiner Perkins,1980,Venture Capital / Growth Equity,55000000.0,USD
2,10708,TA Associates / Advent IV,6694.0,TA Associates,1980,Venture Capital / Growth Equity,60000000.0,USD
3,15603,Excelsior Fund,6373.0,Apax Partners,1981,Venture Capital / Growth Equity,25530000.0,USD
4,15213,New Enterprise Associates II,6610.0,New Enterprise Associates,1981,Venture Capital / Growth Equity,45310000.0,USD


In [321]:
allfeatures3 = ['fund_id', 'fund_name', 'manager_id', 'manager', 'close_year', 'strategy',
               'fund_size_amt', 'fund_currency']

In [322]:
for i in allfeatures3:
    df3[i] = df3[i].fillna(0)

In [329]:
df3['fund_name'] = pd.to_numeric(df3['fund_name'], errors='coerce')
df3['manager'] = pd.to_numeric(df3['manager'], errors='coerce')
df3['strategy'] = pd.to_numeric(df3['strategy'], errors='coerce')
df3['fund_currency'] = pd.to_numeric(df3['fund_currency'], errors='coerce')

In [330]:
featurenames5 = ['fund_id', 'fund_name', 'manager_id', 'manager', 'close_year', 'strategy',
               'fund_currency']

In [331]:
col_cat5 = ['fund_name', 'manager', 'strategy', 'fund_currency']

col_num5 = ['fund_id', 'manager_id', 'close_year']

In [332]:
X5 = df3[featurenames5]
y5 = df3.fund_size_amt

In [333]:
X_train5, X_test5, y_train5, y_test5 = train_test_split(X5, y5, test_size=0.1,
                                                   random_state=0)

In [334]:
X_train_cat5 = X_train5[col_cat5]
X_train_num5 = X_train5[col_num5]

X_test_cat5 = X_test5[col_cat5]
X_test_num5 = X_test5[col_num5]

In [335]:
ohe5 = make_pipeline(SimpleImputer(strategy='constant'), OneHotEncoder())
X_train_cat_enc5 = ohe5.fit_transform(X_train_cat5)
X_test_cat_enc5 = ohe5.fit_transform(X_test_cat5)

scaler_num5 = make_pipeline(StandardScaler(), SimpleImputer())
X_train_num_scaled5 = scaler_num5.fit_transform(X_train_num5)
X_test_num_scaled5 = scaler_num5.fit_transform(X_test_num5)

In [336]:
X_train_scaled5 = sparse.hstack((X_train_cat_enc5,
                                sparse.csr_matrix(X_train_num_scaled5)))
X_test_scaled5 = sparse.hstack((X_test_cat_enc5,
                               sparse.csr_matrix(X_test_num_scaled5)))

In [337]:
pipe_lasso5 = make_pipeline(Lasso(alpha=100, max_iter=1000))

In [338]:
lasso5 = pipe_lasso5.fit(X_train_scaled5, y_train5)

In [339]:
print("Test set score: {:.2f}".format(lasso5.score(X_test_scaled5, y_test5)))
print("Number of features used:", np.sum(lasso5['lasso'].coef_ != 0))

ValueError: dimension mismatch