In [51]:
import pandas as pd
import dask as dd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import statsmodels.formula.api as smf
import statistics

from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy import stats

In [52]:
# Apply seaborn's default theme so every matplotlib figure drawn later in the
# notebook shares the same look.
sns.set()

### Helper functions

In [53]:

import pandas as pd

def calculate_pvalues(df):
    """Return the matrix of two-sided Pearson p-values for the numeric columns of ``df``.

    Rows containing a missing value in *any* column of ``df`` are dropped
    before the numeric columns are selected, so every pair is tested on the
    same set of rows (listwise deletion). ``DataFrame.corr`` instead uses
    pairwise-complete observations, so when ``df`` has missing values the
    p-values returned here can be based on fewer rows than the coefficients
    shown by ``corr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by the numeric column names,
        holding p-values rounded to 4 decimals.

    Raises
    ------
    ValueError
        Propagated from ``scipy.stats.pearsonr`` when fewer than two complete
        rows remain.
    """
    numeric = df.dropna()._get_numeric_data()
    cols = numeric.columns
    pvalues = pd.DataFrame(index=cols, columns=cols, dtype=float)
    for i, r in enumerate(cols):
        # The matrix is symmetric: test each unordered pair once and mirror it.
        for c in cols[i:]:
            p = round(stats.pearsonr(numeric[r], numeric[c])[1], 4)
            # Write through .loc: chained `pvalues[r][c] = ...` assigns into a
            # temporary and is a silent no-op under pandas copy-on-write.
            pvalues.loc[r, c] = p
            pvalues.loc[c, r] = p
    return pvalues

In [54]:
# Load the raw data set.
df = pd.read_csv('./data/JV_data_dist.csv')

# Segmentation filters, switched off here so the whole sample is analysed:
# df = df[df['mfgf'] == "Yes"]
# df = df[df['sic'].str.contains('3711') == True]
# df = df[df['public_count'] >= 1]
# df = df[df['nump'] == 2]
# df = df[df['avg_emp_pp'] >= 8]

# Recode the categorical flags as 0/1 so they can enter the correlation
# matrices. Series.map turns any value missing from a mapping into NaN.
df = df.assign(
    pdynamic=df['pdynamic'].map({'new_entrant': 1, 'incumbent': 0}),
    jvf=df['jvf'].map({'Yes': 1, 'No': 0}),
    snation_partal=df['snation_partal'].map({"Y": 1, "N": 0}),
    mfgf=df['mfgf'].map({"Yes": 1, "No": 0}),
    saf=df['saf'].map({"Y": 1, "N": 0}),
)



## Correlations (no segmentation)

In [57]:
# Variables that enter the full-sample correlation analysis.
coll = [
    'ddist_year',
    'pdynamic',
    'jvf',
    'avg_emp_pp',
    'public_count',
    'snation_partal',
    'nump',
    'mfgf',
    'saf',
]

# First table: correlation coefficients; second table: their p-values.
display(
    df[coll].corr(),
    calculate_pvalues(df[coll]),
)

Unnamed: 0,ddist_year,pdynamic,jvf,avg_emp_pp,public_count,snation_partal,nump,mfgf,saf
ddist_year,1.0,0.00348,0.189488,-0.097809,-0.037129,0.017334,-0.053992,-0.011022,-0.189488
pdynamic,0.00348,1.0,-0.028112,-0.16486,-0.15929,0.140374,-0.031982,-0.1324,0.028112
jvf,0.189488,-0.028112,1.0,-0.151136,-0.012062,-0.066478,0.076198,0.259683,-1.0
avg_emp_pp,-0.097809,-0.16486,-0.151136,1.0,0.110587,-0.046406,-0.035239,-0.046848,0.151136
public_count,-0.037129,-0.15929,-0.012062,0.110587,1.0,0.083162,0.338668,-0.089192,0.012062
snation_partal,0.017334,0.140374,-0.066478,-0.046406,0.083162,1.0,0.048809,-0.069829,0.066478
nump,-0.053992,-0.031982,0.076198,-0.035239,0.338668,0.048809,1.0,-0.063561,-0.076198
mfgf,-0.011022,-0.1324,0.259683,-0.046848,-0.089192,-0.069829,-0.063561,1.0,-0.259683
saf,-0.189488,0.028112,-1.0,0.151136,0.012062,0.066478,-0.076198,-0.259683,1.0


Unnamed: 0,ddist_year,pdynamic,jvf,avg_emp_pp,public_count,snation_partal,nump,mfgf,saf
ddist_year,0.0,0.6694,0.0,0.0082,0.4154,0.9743,0.1181,0.2843,0.0
pdynamic,0.6694,0.0,0.7514,0.0,0.0,0.0,0.6224,0.0003,0.7514
jvf,0.0,0.7514,0.0,0.0,0.6184,0.0729,0.1536,0.0,0.0
avg_emp_pp,0.0082,0.0,0.0,0.0,0.0028,0.2108,0.3421,0.2064,0.0
public_count,0.4154,0.0,0.6184,0.0028,0.0,0.0029,0.0,0.0436,0.6184
snation_partal,0.9743,0.0,0.0729,0.2108,0.0029,0.0,0.0907,0.1454,0.0729
nump,0.1181,0.6224,0.1536,0.3421,0.0,0.0907,0.0,0.0303,0.1536
mfgf,0.2843,0.0003,0.0,0.2064,0.0436,0.1454,0.0303,0.0,0.0
saf,0.0,0.7514,0.0,0.0,0.6184,0.0729,0.1536,0.0,0.0


## Correlations (segmentation)

In [58]:
#Filter data
dfS = df.copy()
dfS = dfS[dfS['mfgf'] == "Yes"]
dfS = dfS[dfS['sic'].str.contains('3711') == True]
dfS = dfS[dfS['public_count'] >= 1]
dfS = dfS[dfS['nump'] == 2]
dfS = dfS[dfS['avg_emp_pp'] >= 8]

In [59]:
coll = ['ddist_year', 'pdynamic', 'jvf', 'avg_emp_pp', 'public_count', 'snation_partal', 'saf']

display(df[coll].corr(), calculate_pvalues(df[coll]))

Unnamed: 0,ddist_year,pdynamic,jvf,avg_emp_pp,public_count,snation_partal,saf
ddist_year,1.0,0.00348,0.189488,-0.097809,-0.037129,0.017334,-0.189488
pdynamic,0.00348,1.0,-0.028112,-0.16486,-0.15929,0.140374,0.028112
jvf,0.189488,-0.028112,1.0,-0.151136,-0.012062,-0.066478,-1.0
avg_emp_pp,-0.097809,-0.16486,-0.151136,1.0,0.110587,-0.046406,0.151136
public_count,-0.037129,-0.15929,-0.012062,0.110587,1.0,0.083162,0.012062
snation_partal,0.017334,0.140374,-0.066478,-0.046406,0.083162,1.0,0.066478
saf,-0.189488,0.028112,-1.0,0.151136,0.012062,0.066478,1.0


Unnamed: 0,ddist_year,pdynamic,jvf,avg_emp_pp,public_count,snation_partal,saf
ddist_year,0.0,0.6694,0.0,0.0082,0.4154,0.9743,0.0
pdynamic,0.6694,0.0,0.7514,0.0,0.0,0.0,0.7514
jvf,0.0,0.7514,0.0,0.0,0.6184,0.0729,0.0
avg_emp_pp,0.0082,0.0,0.0,0.0,0.0028,0.2108,0.0
public_count,0.4154,0.0,0.6184,0.0028,0.0,0.0029,0.6184
snation_partal,0.9743,0.0,0.0729,0.2108,0.0029,0.0,0.0729
saf,0.0,0.7514,0.0,0.0,0.6184,0.0729,0.0
