In [132]:
import pandas as pd
import dask as dd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import statsmodels.formula.api as smf
import statistics

from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy import stats

## Logit model with all candidate predictors

In [133]:
# Load the joint-venture dataset from the project-relative data directory.
df = pd.read_csv('./data/JV_data_dist.csv')

In [134]:
# Show every column name in the loaded frame.
df.columns

Index(['da', 'id', 'activity', 'activityc', 'jvinc', 'jvindustry', 'jvstatus',
       'p', 'pbl', 'pbuss', 'sicp', 'sic', 'sicpdesc', 'psic', 'psicp', 'nump',
       'jvf', 'jvtype', 'rndf', 'pemp', 'pbussource', 'hitechc', 'crlic',
       'crtech', 'techniquec', 'techtr', 'ppubc', 'p1name', 'p1sicp', 'p2name',
       'p2sicp', 'p3name', 'p3sicp', 'p4name', 'p4sicp', 'p5name', 'p5sicp',
       'p6name', 'p6sicp', 'p7name', 'p7sicp', 'p8name', 'p8sicp', 'p9name',
       'p9sicp', 'pdynamic', 'ddist_int', 'ddist_abs', 'ddist_bin', 'p1emp',
       'p2emp', 'p3emp', 'p4emp', 'p5emp', 'p6emp', 'p7emp', 'p8emp', 'p9emp',
       'known_emp', 'known_allemp', 'avg_emp_pp', 'public_count'],
      dtype='object')

In [135]:
# Recode the two categorical columns as 0/1 indicators. Any value that is not
# a key of its mapping becomes NaN.
# NOTE: running this cell a second time maps the already-encoded 0/1 values to
# NaN; reload the CSV before re-running.
df = df.assign(
    rndf=df['rndf'].map({"Yes": 1, "No": 0}),
    pdynamic=df['pdynamic'].map({'new_entrant': 1, 'incumbent': 0}),
)

In [136]:
# Columns used by the model: the predictors plus the outcome `pdynamic`.
cols = [
    'rndf',
    'ddist_bin',
    'ddist_int',
    'ddist_abs',
    'avg_emp_pp',
    'public_count',
    'pdynamic',
]
# Drop every row with a missing value in any model column, then keep only
# those columns.
df = df.dropna(subset=cols)[cols]

In [138]:
# Predictor columns (the outcome `pdynamic` is excluded here).
cols = [
    'rndf',
    'ddist_bin',
    'ddist_int',
    'ddist_abs',
    'avg_emp_pp',
    'public_count',
]
# Design matrix and response, both cast to float.
X = df.loc[:, cols].astype('float64')
y = df.loc[:, 'pdynamic'].astype('float64')
# prepend=False puts the intercept column after the predictors.
X_constant = sm.add_constant(X, prepend=False)

In [139]:
# Binomial-family GLM of `pdynamic` on the predictors plus the constant.
logit_model = GLM(endog=y, exog=X_constant, family=families.Binomial())
logit_results = logit_model.fit()
# Print the plain-text regression table.
print(logit_results.summary().as_text())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                  729
Model:                            GLM   Df Residuals:                      722
Model Family:                Binomial   Df Model:                            6
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -449.53
Date:                Tue, 14 Jun 2022   Deviance:                       899.06
Time:                        13:13:37   Pearson chi2:                     751.
No. Iterations:                     4   Pseudo R-squ. (CS):            0.07881
Covariance Type:            nonrobust                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
rndf             0.4753      0.296      1.607   

## Restrict to R&D alliances only

In [152]:
# Reload the raw CSV: `df` above was reduced to the model columns with
# incomplete rows dropped, so this analysis starts again from the file.
df2 = pd.read_csv('./data/JV_data_dist.csv')

In [153]:
# Keep only the rows whose R&D flag is "Yes". Take an explicit copy so that the
# column assignments made on `df2` afterwards write to an independent frame;
# assigning into the bare boolean-mask result raises SettingWithCopyWarning.
df2 = df2[df2['rndf'] == "Yes"].copy()

In [154]:
# Recode the categorical columns as 0/1 indicators; values missing from a
# mapping become NaN.
rndf_codes = {"Yes": 1, "No": 0}
pdynamic_codes = {'new_entrant': 1, 'incumbent': 0}
df2['rndf'] = df2['rndf'].map(rndf_codes)
df2['pdynamic'] = df2['pdynamic'].map(pdynamic_codes)

In [159]:
# Columns carried forward for the R&D-only model, including the outcome
# `pdynamic`.
cols2 = [
    'rndf',
    'ddist_bin',
    'ddist_int',
    'ddist_abs',
    'avg_emp_pp',
    'public_count',
    'pdynamic',
]
# Drop rows with a missing value in any of these columns, then keep only them.
df2 = df2.dropna(subset=cols2)[cols2]

In [160]:
# Predictors for the R&D-only model; `rndf` is not among them.
cols2 = [
    'ddist_bin',
    'ddist_int',
    'ddist_abs',
    'avg_emp_pp',
    'public_count',
]
# Design matrix and response, both cast to float.
X2 = df2.loc[:, cols2].astype('float64')
y2 = df2.loc[:, 'pdynamic'].astype('float64')
# prepend=False puts the intercept column after the predictors.
X2_constant = sm.add_constant(X2, prepend=False)

In [161]:
# Binomial-family GLM of `pdynamic` on the R&D-only design matrix.
# NOTE: this rebinds `logit_model` / `logit_results`, replacing any earlier fit
# stored under those names.
logit_model = GLM(endog=y2, exog=X2_constant, family=families.Binomial())
logit_results = logit_model.fit()
# Print the plain-text regression table.
print(logit_results.summary().as_text())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                   66
Model:                            GLM   Df Residuals:                       60
Model Family:                Binomial   Df Model:                            5
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -33.305
Date:                Tue, 14 Jun 2022   Deviance:                       66.611
Time:                        13:19:09   Pearson chi2:                     61.8
No. Iterations:                     5   Pseudo R-squ. (CS):             0.1741
Covariance Type:            nonrobust                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
ddist_bin       -0.0651      1.194     -0.055   

## Filter for alliance primary SIC

df3 = 