In [98]:
import pandas as pd
import dask as dd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import statsmodels.formula.api as smf
import statistics

from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy import stats
import docx

In [52]:
# Switch matplotlib over to seaborn's default plot styling for this session.
sns.set()

### Helper functions

In [118]:

import pandas as pd

def calculate_pvalues(df):
    """Return the matrix of Pearson-correlation p-values for ``df``.

    Rows that contain a missing value in *any* column of ``df`` are dropped
    first; the remaining numeric (and boolean) columns are then compared
    pairwise with ``stats.pearsonr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations, one variable per column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Square float frame whose index and columns are the numeric column
        names of ``df``. Cell ``[a, b]`` holds the p-value reported by
        ``stats.pearsonr`` for columns ``a`` and ``b``, rounded to 4 decimals.
    """
    numeric = df.dropna().select_dtypes(include=['number', 'bool'])
    names = numeric.columns
    pvalues = pd.DataFrame(index=names, columns=names, dtype=float)

    # The matrix is symmetric, so each pair is evaluated once and mirrored.
    for pos, r in enumerate(names):
        for c in names[pos:]:
            p = round(stats.pearsonr(numeric[r], numeric[c])[1], 4)
            # Single-step .loc writes: chained ``pvalues[r][c] = ...`` is not
            # guaranteed to reach the frame and is a no-op under copy-on-write.
            pvalues.loc[r, c] = p
            pvalues.loc[c, r] = p
    return pvalues

def toWord(df, path='./test.docx'):
    """Write ``df`` to a Word document containing a single table.

    The table has one header row with the column labels followed by one row
    per row of ``df``. The frame's index is exported as an ``id`` column
    (replacing an existing ``id`` column if there is one). Every cell is
    written as ``str(value)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export. It is not modified.
    path : str, optional
        Destination passed to ``Document.save``. Defaults to
        ``'./test.docx'``; pass distinct paths when exporting several tables,
        otherwise each call overwrites the previous file.
    """
    # Work on a derived frame so the caller's frame does not gain an 'id' column.
    table_df = df.assign(id=df.index)
    n_rows, n_cols = table_df.shape

    doc = docx.Document()

    # One extra row at the top holds the header.
    table = doc.add_table(n_rows + 1, n_cols)

    # Header row; labels are stringified because cell text must be a str.
    for j, label in enumerate(table_df.columns):
        table.cell(0, j).text = str(label)

    # Body rows; the value array is materialised once rather than per cell.
    values = table_df.values
    for i in range(n_rows):
        for j in range(n_cols):
            table.cell(i + 1, j).text = str(values[i, j])

    doc.save(path)

def basePrep(df):
    """Recode the flag columns to 0/1 and return the complete modelling rows.

    The input frame is left untouched: recoding happens on a copy, so calling
    this function repeatedly on the same frame (or on a filtered slice of
    another frame) gives the same result each time.

    Steps:
      * map the text flags ``rndf``, ``mfgf``, ``jvf`` ("Yes"/"No"),
        ``snation_partal``, ``cr_bor_part``, ``saf`` ("Y"/"N") and
        ``pdynamic`` ("new_entrant"/"incumbent") to 1/0; any other value
        becomes NaN;
      * add ``natc_n``, the integer category code of ``natc``
        (pandas assigns -1 to missing categories, so those rows are *not*
        removed by the final ``dropna``);
      * keep only the modelling columns and drop rows with any NaN in them.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns listed above plus the ``ddist_*``,
        ``avg_emp_pp``, ``avg_emp_pp_log``, ``public_count`` and ``nump``
        columns.

    Returns
    -------
    pandas.DataFrame
        New frame restricted to the modelling columns, without missing values.
    """
    yes_no = {"Yes": 1, "No": 0}
    y_n = {"Y": 1, "N": 0}
    encodings = {
        'rndf': yes_no,
        'pdynamic': {'new_entrant': 1, 'incumbent': 0},
        'snation_partal': y_n,
        'cr_bor_part': y_n,
        'saf': y_n,
        'mfgf': yes_no,
        'jvf': yes_no,
    }
    cols = ['rndf', 'jvf', 'ddist_bin', 'ddist_int', 'ddist_abs', 'ddist_year',
            'avg_emp_pp', 'public_count', 'pdynamic', 'snation_partal',
            'cr_bor_part', 'mfgf', 'avg_emp_pp_log', 'saf', 'natc_n', 'nump']

    # Copy first: mapping in place would overwrite the caller's text flags,
    # and a second call would then turn every flag into NaN.
    df = df.copy()
    for column, mapping in encodings.items():
        df[column] = df[column].map(mapping)
    df['natc_n'] = df['natc'].astype('category').cat.codes

    return df[cols].dropna()


## Correlations (no segmentation)

In [119]:
df = basePrep(pd.read_csv('./data/JV_data_dist.csv'))

coll = [
    'ddist_year', 'pdynamic', 'jvf', 'avg_emp_pp', 'public_count',
    'snation_partal', 'nump', 'mfgf', 'saf',
]
df_sel = df[coll]

# Show the correlation matrix, its p-values and the descriptive statistics.
display(
    df_sel.corr(),
    calculate_pvalues(df_sel),
    df_sel.describe().transpose(),
)

# Export the descriptive statistics (rounded to 2 decimals) to Word.
descrs = df_sel.describe().transpose().round(decimals=2)
descrs['id'] = descrs.index
toWord(descrs)

# Export the correlation matrix to Word.
# NOTE(review): both toWord calls in this cell write the same file
# (./test.docx), so this export replaces the descriptives written just above.
corrs = df_sel.corr()
toWord(corrs)

Unnamed: 0,ddist_year,pdynamic,jvf,avg_emp_pp,public_count,snation_partal,nump,mfgf,saf
ddist_year,1.0,0.015841,0.190978,-0.097809,-0.030207,0.001196,-0.057926,-0.039706,-0.190978
pdynamic,0.015841,1.0,-0.011752,-0.16486,-0.199703,0.164587,-0.01827,-0.133139,0.011752
jvf,0.190978,-0.011752,1.0,-0.151136,-0.018481,-0.066463,0.052901,0.201727,-1.0
avg_emp_pp,-0.097809,-0.16486,-0.151136,1.0,0.110587,-0.046406,-0.035239,-0.046848,0.151136
public_count,-0.030207,-0.199703,-0.018481,0.110587,1.0,0.110167,0.389393,-0.074749,0.018481
snation_partal,0.001196,0.164587,-0.066463,-0.046406,0.110167,1.0,0.062696,-0.053981,0.066463
nump,-0.057926,-0.01827,0.052901,-0.035239,0.389393,0.062696,1.0,-0.080254,-0.052901
mfgf,-0.039706,-0.133139,0.201727,-0.046848,-0.074749,-0.053981,-0.080254,1.0,-0.201727
saf,-0.190978,0.011752,-1.0,0.151136,0.018481,0.066463,-0.052901,-0.201727,1.0


Unnamed: 0,ddist_year,pdynamic,jvf,avg_emp_pp,public_count,snation_partal,nump,mfgf,saf
ddist_year,0.0,0.6694,0.0,0.0082,0.4154,0.9743,0.1181,0.2843,0.0
pdynamic,0.6694,0.0,0.7514,0.0,0.0,0.0,0.6224,0.0003,0.7514
jvf,0.0,0.7514,0.0,0.0,0.6184,0.0729,0.1536,0.0,0.0
avg_emp_pp,0.0082,0.0,0.0,0.0,0.0028,0.2108,0.3421,0.2064,0.0
public_count,0.4154,0.0,0.6184,0.0028,0.0,0.0029,0.0,0.0436,0.6184
snation_partal,0.9743,0.0,0.0729,0.2108,0.0029,0.0,0.0907,0.1454,0.0729
nump,0.1181,0.6224,0.1536,0.3421,0.0,0.0907,0.0,0.0303,0.1536
mfgf,0.2843,0.0003,0.0,0.2064,0.0436,0.1454,0.0303,0.0,0.0
saf,0.0,0.7514,0.0,0.0,0.6184,0.0729,0.1536,0.0,0.0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ddist_year,729.0,-0.046639,3.324983,-5.0,-3.0,0.0,3.0,5.0
pdynamic,729.0,0.632373,0.48249,0.0,0.0,1.0,1.0,1.0
jvf,729.0,0.683128,0.465577,0.0,0.0,1.0,1.0,1.0
avg_emp_pp,729.0,90566.626292,105789.5405,13.0,14940.0,48460.0,136552.5,572800.0
public_count,729.0,1.441701,0.792974,0.0,1.0,1.0,2.0,5.0
snation_partal,729.0,0.211248,0.408474,0.0,0.0,0.0,0.0,1.0
nump,729.0,2.245542,0.653654,2.0,2.0,2.0,2.0,9.0
mfgf,729.0,0.659808,0.474099,0.0,0.0,1.0,1.0,1.0
saf,729.0,0.316872,0.465577,0.0,0.0,0.0,1.0,1.0


## Correlations (with segmentation)

In [91]:
# Filter the raw data down to the segment of interest, then prepare it.
dfS = pd.read_csv('./data/JV_data_dist.csv')

# Rows kept: mfgf is "Yes", sic contains '3711', at least one public
# partner, exactly two partners, and avg_emp_pp of at least 8.
segment_mask = (
    (dfS['mfgf'] == "Yes")
    # na=False treats a missing sic as "no match" (same rows as `== True`).
    & dfS['sic'].str.contains('3711', na=False)
    & (dfS['public_count'] >= 1)
    & (dfS['nump'] == 2)
    & (dfS['avg_emp_pp'] >= 8)
)

# Hand basePrep an explicit copy: it assigns columns on the frame it is
# given, and writing into a filtered slice of the raw frame is what triggers
# pandas' SettingWithCopyWarning.
dfS = basePrep(dfS.loc[segment_mask].copy())

In [93]:
coll = [
    'ddist_year', 'pdynamic', 'jvf', 'avg_emp_pp',
    'public_count', 'snation_partal', 'saf',
]
dfS_sel = dfS[coll]

# Correlations, their p-values and descriptive statistics for the segment.
display(
    dfS_sel.corr(),
    calculate_pvalues(dfS_sel),
    dfS_sel.describe(),
)

Unnamed: 0,ddist_year,pdynamic,jvf,avg_emp_pp,public_count,snation_partal,saf
ddist_year,1.0,0.261651,0.274899,-0.161431,-0.043442,0.047834,-0.274899
pdynamic,0.261651,1.0,0.136659,-0.219656,-0.224911,0.128937,-0.136659
jvf,0.274899,0.136659,1.0,-0.0769,0.071746,-0.043139,-1.0
avg_emp_pp,-0.161431,-0.219656,-0.0769,1.0,0.029235,-0.088179,0.0769
public_count,-0.043442,-0.224911,0.071746,0.029235,1.0,-0.054254,-0.071746
snation_partal,0.047834,0.128937,-0.043139,-0.088179,-0.054254,1.0,0.043139
saf,-0.274899,-0.136659,-1.0,0.0769,-0.071746,0.043139,1.0


Unnamed: 0,ddist_year,pdynamic,jvf,avg_emp_pp,public_count,snation_partal,saf
ddist_year,0.0,0.0002,0.0001,0.0217,0.5393,0.499,0.0001
pdynamic,0.0002,0.0,0.0525,0.0017,0.0013,0.0674,0.0525
jvf,0.0001,0.0525,0.0,0.2767,0.3103,0.5421,0.0
avg_emp_pp,0.0217,0.0017,0.2767,0.0,0.6796,0.2121,0.2767
public_count,0.5393,0.0013,0.3103,0.6796,0.0,0.4432,0.3103
snation_partal,0.499,0.0674,0.5421,0.2121,0.4432,0.0,0.5421
saf,0.0001,0.0525,0.0,0.2767,0.3103,0.5421,0.0


Unnamed: 0,ddist_year,pdynamic,jvf,avg_emp_pp,public_count,snation_partal,saf
count,202.0,202.0,202.0,202.0,202.0,202.0,202.0
mean,0.262376,0.430693,0.722772,89362.170792,1.450495,0.153465,0.277228
std,3.415719,0.496403,0.448742,103808.221557,0.498779,0.361331,0.448742
min,-5.0,0.0,0.0,13.0,1.0,0.0,0.0
25%,-3.0,0.0,0.0,15978.25,1.0,0.0,0.0
50%,0.0,0.0,1.0,41744.25,1.0,0.0,0.0
75%,4.0,1.0,1.0,133854.0,2.0,0.0,1.0
max,5.0,1.0,1.0,572800.0,2.0,1.0,1.0
