**Import the relevant libraries**

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
from sklearn.linear_model import LinearRegression


warnings.filterwarnings('ignore')

**Load the data**

In [8]:
# Load the cleaned startup dataset. The file carries an .xls extension but is
# parsed here as delimited text by read_csv.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
data = pd.read_csv(
    filepath_or_buffer=r'C:\Users\user\Downloads\cleaned_startup_data.xls',
)

In [10]:
# Display the loaded DataFrame (the notebook truncates the middle rows).
data

Unnamed: 0,company_id,category_code,country_code,state_code,total_rounds,average_participants,offices,ipo,is_acquired,is_closed,age_years,status,log_average_funded
0,c:10015,health,USA,California,5,3.8,1.0,0,0,0,6.26,0,16.426597
1,c:100228,education,USA,other,1,0.0,3.0,0,0,0,3.25,0,11.512925
2,c:100607,games_video,USA,California,1,1.0,1.0,0,0,0,3.00,0,10.596635
3,c:10075,web,USA,California,1,0.0,1.0,0,0,0,6.15,0,12.765688
4,c:100756,education,other,other,1,1.0,1.0,0,0,0,5.00,0,13.234838
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7505,c:99669,enterprise,USA,other,1,6.0,1.0,0,0,0,3.00,0,13.527828
7506,c:9975,web,other,other,1,0.0,1.0,0,0,0,6.30,0,13.590714
7507,c:9977,mobile,USA,California,1,0.0,1.0,0,0,1,5.62,0,11.775290
7508,c:9995,public_relations,USA,California,1,0.0,1.0,0,0,0,6.17,0,13.527828


In [12]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,total_rounds,average_participants,offices,ipo,is_acquired,is_closed,age_years,status,log_average_funded
count,7510.0,7510.0,7510.0,7510.0,7510.0,7510.0,7510.0,7510.0,7510.0
mean,1.777097,1.202918,1.130892,0.001465,0.085619,0.071372,4.532304,0.087084,13.845281
std,1.039688,1.463043,0.389695,0.038246,0.27982,0.257462,1.154526,0.281977,1.794007
min,1.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,5.673323
25%,1.0,0.0,1.0,0.0,0.0,0.0,3.59,0.0,12.686936
50%,1.0,1.0,1.0,0.0,0.0,0.0,4.33,0.0,13.997832
75%,2.0,2.0,1.0,0.0,0.0,0.0,5.4975,0.0,15.201805
max,5.0,7.6667,3.0,1.0,1.0,1.0,7.0,1.0,17.639795


Based on the results of the earlier multiple linear regression,
'offices' and 'is_acquired' are excluded from this regression with sklearn,
because they were not statistically significant there.

In [16]:
# Split the data into the regression target (y, dependent variable) and the
# predictor matrix (x, independent variables).
y = data.loc[:, 'log_average_funded']
x = data.loc[
    :,
    ['total_rounds', 'average_participants', 'ipo', 'is_closed', 'age_years', 'status'],
]

In [18]:
# Create the linear regression estimator and fit it: predictors x, target y.
reg = LinearRegression()
reg.fit(X=x, y=y)

In [20]:
# R-squared of the fitted model, evaluated on the same data it was trained on
reg.score(x,y)

0.2440329912317336

In [22]:
# Intercept (constant term) of the fitted regression
reg.intercept_

11.942042262560859

In [24]:
# Fitted coefficients, one per column of x, in the same order as x's columns
reg.coef_

array([ 0.37290024,  0.35676797,  1.08800008, -0.87613718,  0.17839037,
        0.73278804])

In [26]:
#find the adjusted r-squared
def adj_r2(x, y, model=None):
    """Return the adjusted R-squared of a fitted regression model on (x, y).

    The adjustment penalises R-squared for the number of predictors:

        adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)

    where n is the number of observations and p the number of predictors.

    Parameters
    ----------
    x : 2-D array-like with a ``shape`` attribute
        Predictor matrix; ``x.shape[0]`` is n and ``x.shape[1]`` is p.
    y : array-like
        Target values, forwarded unchanged to ``model.score``.
    model : object with a ``score(x, y)`` method, optional
        Fitted estimator to evaluate. When omitted, the notebook-level
        ``reg`` is used, which keeps existing ``adj_r2(x, y)`` calls working.

    Returns
    -------
    float
        The adjusted R-squared.

    Raises
    ------
    ValueError
        If ``n - p - 1`` is not positive, in which case the adjusted
        R-squared is undefined.
    """
    # Fall back to the global model only when the caller did not supply one.
    estimator = reg if model is None else model

    n = x.shape[0]
    p = x.shape[1]
    dof = n - p - 1
    # Guard the denominator: zero would divide by zero and a negative value
    # would silently produce a meaningless number.
    if dof <= 0:
        raise ValueError(
            "adjusted R-squared is undefined: need more observations than "
            "predictors + 1 (n=%d, p=%d)" % (n, p)
        )

    r2 = estimator.score(x, y)
    adjusted_r2 = 1 - (1 - r2) * (n - 1) / dof
    return adjusted_r2

In [28]:
# Adjusted R-squared of the fitted model on (x, y)
adj_r2(x,y)

0.24342845943743663

**Calculate the univariate p-values of the variables**

**Feature Selection**

In [32]:
# f_regression runs a univariate F-test of each column of x against y and
# returns a tuple of two arrays: (F-statistics, p-values), one entry per feature.
# NOTE(review): consider moving this import to the imports cell at the top of
# the notebook so all dependencies are visible in one place.
from sklearn.feature_selection import f_regression
f_regression(x,y)

(array([ 894.40760231, 1219.62488543,   11.82714283,  224.21789595,
         170.41637753,  381.4976664 ]),
 array([9.04773941e-186, 9.66887384e-248, 5.86907688e-004, 5.69666096e-050,
        1.57329990e-038, 6.54079811e-083]))

In [34]:
# Keep only the p-values: element [1] of the (F-statistics, p-values) tuple
# returned by f_regression. These are univariate p-values, one per column of x.
p_values = f_regression(x,y)[1]
p_values

array([9.04773941e-186, 9.66887384e-248, 5.86907688e-004, 5.69666096e-050,
       1.57329990e-038, 6.54079811e-083])

In [38]:
# Round to 3 decimal places for display; very small p-values show as 0.0
p_values.round(3)

array([0.   , 0.   , 0.001, 0.   , 0.   , 0.   ])

**Create a summary table**

In [41]:
# Assemble one row per feature: its name, its coefficient from the fitted
# model, and its p-value rounded to 3 decimal places.
# Note: the p-values are the univariate ones computed with f_regression,
# while the coefficients come from the joint (multivariate) fit.
reg_summary = pd.DataFrame(
    {
        'Features': x.columns.values,
        'Coefficients': reg.coef_,
        'P-values': p_values.round(3),
    }
)

In [43]:
# Display the summary table of features, coefficients and p-values
reg_summary

Unnamed: 0,Features,Coefficients,P-values
0,total_rounds,0.3729,0.0
1,average_participants,0.356768,0.0
2,ipo,1.088,0.001
3,is_closed,-0.876137,0.0
4,age_years,0.17839,0.0
5,status,0.732788,0.0
