In [44]:
import pandas as pd
import dask as dd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import statsmodels.formula.api as smf
import statistics

from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy import stats

In [75]:
def basePrep(df):
	df['rndf'] = df['rndf'].map({"Yes": 1, "No": 0})
	df['pdynamic'] = df['pdynamic'].map({'new_entrant': 1, 'incumbent': 0})	
	df["SNATION_PARTAL".lower()] = df["SNATION_PARTAL".lower()].map({"Y": 1, "N": 0})
	df['cr_bor_part'] = df['cr_bor_part'].map({"Y": 1, "N": 0})
	df['mfgf'] = df['mfgf'].map({"Yes": 1, "No": 0})
	df['jvf'] = df['jvf'].map({'Yes': 1, 'No': 0})
	
	cols = ['rndf', 'jvf','ddist_bin','ddist_int', 'ddist_abs', 'ddist_year',
		'avg_emp_pp', 'public_count', 'pdynamic', "SNATION_PARTAL".lower(), 
		'cr_bor_part', 'mfgf', 'avg_emp_pp_log']
	df = df[cols].dropna()
	
	return df

def variation(df):
	cols = ['rndf', 'ddist_bin','ddist_int', 'ddist_abs', 'ddist_year', 'avg_emp_pp', 'public_count', 
			"SNATION_PARTAL".lower(), 'cr_bor_part', 'mfgf']
	X = (df[cols]).astype(float)
	y = df['pdynamic'].astype(float)
	X_constant = sm.add_constant(X, prepend=False)

	logit_model = GLM(y, X_constant, family=families.Binomial())
	logit_results = logit_model.fit()
	print(logit_results.summary())

## Try all potential variables

In [76]:
df = pd.read_csv('./data/JV_data_dist.csv')
df = basePrep(df)

variation(df)

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                  729
Model:                            GLM   Df Residuals:                      718
Model Family:                Binomial   Df Model:                           10
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -433.13
Date:                Thu, 16 Jun 2022   Deviance:                       866.26
Time:                        17:58:18   Pearson chi2:                     740.
No. Iterations:                    19   Pseudo R-squ. (CS):             0.1193
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
rndf               0.2749      0.310      0.

## Filter only R&D alliances

In [77]:
df2 = pd.read_csv('./data/JV_data_dist.csv')

df2 = df2[df2['rndf'] == "Yes"]
df2 = basePrep(df2)

variation(df2)

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                   66
Model:                            GLM   Df Residuals:                       57
Model Family:                Binomial   Df Model:                            8
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -33.231
Date:                Thu, 16 Jun 2022   Deviance:                       66.461
Time:                        17:58:27   Pearson chi2:                     61.9
No. Iterations:                     5   Pseudo R-squ. (CS):             0.1759
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
rndf               2.5577      0.991      2.

## Filter for alliance SIC

In [78]:
df3 = pd.read_csv("./data/JV_data_dist.csv")

#Select alliances that at least part of their goal was manufacturing of cars
df3 = df3[df3['sic'].str.contains('3711') == True]
df3 = basePrep(df3)

variation(df3)

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                  311
Model:                            GLM   Df Residuals:                      301
Model Family:                Binomial   Df Model:                            9
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -187.81
Date:                Thu, 16 Jun 2022   Deviance:                       375.63
Time:                        17:58:35   Pearson chi2:                     323.
No. Iterations:                     4   Pseudo R-squ. (CS):             0.1551
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
rndf              -0.5267      0.696     -0.

## Filter for same nation participants and alliance

In [79]:
df4 = pd.read_csv("./data/JV_data_dist.csv")

#Select alliances that at least part of their goal was manufacturing of cars
df4 = df4[df4['snation_partal'] == "Y"]
df4 = basePrep(df4)

variation(df4)

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                  154
Model:                            GLM   Df Residuals:                      145
Model Family:                Binomial   Df Model:                            8
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -74.801
Date:                Thu, 16 Jun 2022   Deviance:                       149.60
Time:                        17:58:52   Pearson chi2:                     155.
No. Iterations:                     5   Pseudo R-squ. (CS):            0.06547
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
rndf              -0.5432      0.661     -0.

## Filter for Manufacturing agreement flags

In [80]:
df5 = pd.read_csv("./data/JV_data_dist.csv")

#Select alliances that at least part of their goal was manufacturing of cars
df5 = df5[df5['mfgf'] == "Yes"]
df5 = basePrep(df5)

variation(df5)

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                  481
Model:                            GLM   Df Residuals:                      472
Model Family:                Binomial   Df Model:                            8
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -295.71
Date:                Thu, 16 Jun 2022   Deviance:                       591.41
Time:                        17:58:57   Pearson chi2:                     491.
No. Iterations:                     4   Pseudo R-squ. (CS):             0.1191
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
rndf               0.6016      0.483      1.

In [81]:
df6 = pd.read_csv("./data/JV_data_dist.csv")

#Select alliances that at least part of their goal was manufacturing of cars
df6 = df6[df6['mfgf'] == "Yes"]
df6 = df6[df6['sic'].str.contains('3711') == True]
df6 = basePrep(df6)

variation(df6)

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                  267
Model:                            GLM   Df Residuals:                      258
Model Family:                Binomial   Df Model:                            8
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -162.41
Date:                Thu, 16 Jun 2022   Deviance:                       324.81
Time:                        17:59:01   Pearson chi2:                     278.
No. Iterations:                     4   Pseudo R-squ. (CS):             0.1475
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
rndf              -0.2752      0.831     -0.

In [82]:
df7 = pd.read_csv("./data/JV_data_dist.csv")

#Select alliances that at least part of their goal was manufacturing of cars
df7 = df7[df7['mfgf'] == "Yes"]
df7 = df7[df7['sic'].str.contains('3711') == True]
df7 = df7[df7['public_count'] >= 1]
df7 = df7[df7['nump'] == 2]
df7 = basePrep(df7)

variation(df7)

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                  202
Model:                            GLM   Df Residuals:                      193
Model Family:                Binomial   Df Model:                            8
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -119.68
Date:                Thu, 16 Jun 2022   Deviance:                       239.36
Time:                        17:59:07   Pearson chi2:                     204.
No. Iterations:                     4   Pseudo R-squ. (CS):             0.1665
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
rndf              -0.6103      1.294     -0.

## Filter for single country

In [83]:
df8 = pd.read_csv("./data/JV_data_dist.csv")

#Select alliances that at least part of their goal was manufacturing of cars
df8 = df8[df8['natc'] == "IN"]

df8 = basePrep(df8)

variation(df8)

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                   76
Model:                            GLM   Df Residuals:                       66
Model Family:                Binomial   Df Model:                            9
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -30.780
Date:                Thu, 16 Jun 2022   Deviance:                       61.560
Time:                        17:59:11   Pearson chi2:                 1.60e+03
No. Iterations:                     6   Pseudo R-squ. (CS):             0.3085
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
rndf              -1.8520      1.691     -1.

## Test for final H1

In [88]:
dfH = pd.read_csv("./data/JV_data_dist.csv")

dfH = dfH[dfH['mfgf'] == "Yes"]
dfH = dfH[dfH['sic'].str.contains('3711') == True]
dfH = dfH[dfH['public_count'] >= 1]
dfH = dfH[dfH['nump'] == 2]
dfH = dfH[dfH['avg_emp_pp'] >= 8]
dfH = basePrep(dfH)

cols = ['ddist_year']
X = (dfH[cols]).astype(float)
y = dfH['pdynamic'].astype(float)
X_constant = sm.add_constant(X, prepend=False)

logit_model = GLM(y, X_constant, family=families.Binomial())
logit_results = logit_model.fit()
print(logit_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               pdynamic   No. Observations:                  202
Model:                            GLM   Df Residuals:                      200
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -130.98
Date:                Thu, 16 Jun 2022   Deviance:                       261.95
Time:                        18:00:15   Pearson chi2:                     202.
No. Iterations:                     4   Pseudo R-squ. (CS):            0.06781
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ddist_year     0.1629      0.045      3.637      0.0

## Test for final H2

In [84]:
dfH2 = pd.read_csv("./data/JV_data_dist.csv")

dfH2 = dfH2[dfH2['mfgf'] == "Yes"]
dfH2 = dfH2[dfH2['sic'].str.contains('3711') == True]
dfH2 = dfH2[dfH2['public_count'] >= 1]
dfH2 = dfH2[dfH2['nump'] == 2]
dfH2 = dfH2[dfH2['avg_emp_pp'] >=1000]

dfH2 = basePrep(dfH2)

cols = ['ddist_int', 'public_count', 'rndf', 'snation_partal', 'avg_emp_pp', 'avg_emp_pp_log']
X = (dfH2[cols]).astype(float)
y = dfH2['jvf'].astype(float)
X_constant = sm.add_constant(X, prepend=False)

logit_model = GLM(y, X_constant, family=families.Binomial())
logit_results = logit_model.fit()
print(logit_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    jvf   No. Observations:                  194
Model:                            GLM   Df Residuals:                      187
Model Family:                Binomial   Df Model:                            6
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -103.16
Date:                Thu, 16 Jun 2022   Deviance:                       206.33
Time:                        17:59:27   Pearson chi2:                     194.
No. Iterations:                     4   Pseudo R-squ. (CS):             0.1035
Covariance Type:            nonrobust                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
ddist_int          0.0005      0.000      3.