In [1]:
import pandas as pd
import dask as dd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import statsmodels.formula.api as smf
import statistics

from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod import families
from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy import stats

  from pandas import Int64Index as NumericIndex


In [2]:
sns.set()

## Filter out non automotive-related alliances

In [3]:
df = pd.read_csv('../data.csv')
df = df[df['psic'].notna()] #drop 45 NaN participant SIC, fix the issue by not solving it
autoDF = df[df['psicp'].str.contains('3711')] #Create df who have at least one participant in the automotive industry with SIC: 3711
autoDF = autoDF.drop('Unnamed: 0', axis=1)
autoDF = autoDF.reset_index(drop=True)

#Export
autoDF.to_csv('./data/autodf.csv', index=False) #create csv sample

  df = pd.read_csv('../data.csv')


## Data selection

### Filter variables

In [4]:
variables = ["id","activity", "activityc", "da", "jvinc", "jvindustry", "jvstatus", "p", "pbl", 
		   "pbuss", "sicp", "sic", "SICPDESC".lower(), "psic", "psicp", "nump", "jvf", "jvtype", 
		   "rndf", "pemp", "PBUSSOURCE".lower(), "HITECHC".lower(), "crlic", "crtech", "TECHNIQUEC".lower(),
		   "TECHTR".lower(), "PPUBC".lower()]
filteredDF = autoDF[variables]

### Select timeframe

In [5]:
filteredDF['da'] = pd.to_datetime(filteredDF['da']) 
timedDF = filteredDF[(filteredDF['da'] >= '2002-01-01') & (filteredDF['da'] <= '2012-12-31')]
timedDF.set_index('da', inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filteredDF['da'] = pd.to_datetime(filteredDF['da'])


## Feature engineering

### Split PSICPs to seperate columns

In [6]:
pattern = r'\\n' #Regex pattern to split name

for n in range(1, (int(timedDF["nump"].max()) + 1)): 
	timedDF['p' + str(n) + "name"] =  timedDF['p'].str.split(pattern, expand=True)[n-1] #Create seperate columns for each possible participant

	timedDF["p" + str(n) + "sicp"] = timedDF['psicp'].str.split(pattern, expand=True)[n-1] #Create seperate placeholder columns to split participant SIC codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  timedDF['p' + str(n) + "name"] =  timedDF['p'].str.split(pattern, expand=True)[n-1] #Create seperate columns for each possible participant
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  timedDF["p" + str(n) + "sicp"] = timedDF['psicp'].str.split(pattern, expand=True)[n-1] #Create seperate placeholder columns to split participant SIC codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas

### Define incumbent / new-entrant alliance

In [7]:
cols = timedDF.filter(regex='p\d+sicp', axis=1).columns #Get columns of individual participant SICs

timedDF['pdynamic'] = (timedDF[cols]=='3711').sum(axis=1) > 1 #Flag True for alliances consisting of more than one automotive incumbent
timedDF['pdynamic'] = timedDF['pdynamic'].map({True: "incumbent", False: "new_entrant"}) #Turn tha above from boolean to string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  timedDF['pdynamic'] = (timedDF[cols]=='3711').sum(axis=1) > 1 #Flag True for alliances consisting of more than one automotive incumbent
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  timedDF['pdynamic'] = timedDF['pdynamic'].map({True: "incumbent", False: "new_entrant"}) #Turn tha above from boolean to string


In [8]:
#Export
timedDF.to_csv('./data/JV_data_2002-2012.csv')

### Generate discontintuiy time distance variations

In [9]:
JV_data_dist = timedDF.copy() ##Make copy

disc_day = pd.to_datetime("02/06/2007", format="%d/%m/%Y")

#### Integer distance

In [10]:
JV_data_dist['ddist_int'] = ((disc_day - JV_data_dist.index).days) * -1 #time -1 to reverse days count, negative should be before 2007

#### Natural num distance (absolute)

In [11]:
JV_data_dist['ddist_abs'] = abs((disc_day - JV_data_dist.index).days)

#### Binary distance

In [12]:
#Let 0 describe alliances before discontinuity day
#Let 1 describe alliances after discontinuity day

JV_data_dist['ddist_bin'] = 0 #Create placeholder columnc
JV_data_dist.loc[JV_data_dist.index > disc_day, 'ddist_bin'] = 1

### Split participant employee count to seperate variables

In [51]:
pattern = r'\\n'
pattern2 = r'p\d+emp'

for n in range(1, (int(JV_data_dist["nump"].max()) + 1)): 
	JV_data_dist["p" + str(n) + "emp"] =  pd.to_numeric(JV_data_dist['pemp'].str.split(pattern, expand=True)[n-1]) #Create seperate columns for each possible participant
	JV_data_dist["p" + str(n) + "emp"].replace(to_replace='', value=None, inplace=True) #Replace unknown employee count with na

foo = JV_data_dist.filter(regex=pattern2, axis=1).notna() #Get columns of individual participant employee count

JV_data_dist['known_emp'] = foo.any(axis=1) #Binary if employee count is known for >= participant
JV_data_dist['known_allemp'] = JV_data_dist['nump'] == foo.sum(axis=1) #Binary if employee count is known for ALL participants

#Get log of AVG employess per participant
JV_data_dist['avg_emp_pp'] = np.log(JV_data_dist.filter(regex=pattern2, axis=1).mean(axis=1, skipna=True))

### Count public companies per alliance

In [60]:
pattern = r'\\n'

JV_data_dist['puplic_count'] = (JV_data_dist['ppubc'].str.split(pattern, expand=True) == 'P').sum(axis=1)

In [61]:
#Export
JV_data_dist.to_csv("./data/JV_data_dist.csv")

In [62]:
pd.options.display.max_columns = None
JV_data_dist

Unnamed: 0_level_0,id,activity,activityc,jvinc,jvindustry,jvstatus,p,pbl,pbuss,sicp,sic,sicpdesc,psic,psicp,nump,jvf,jvtype,rndf,pemp,pbussource,hitechc,crlic,crtech,techniquec,techtr,ppubc,p1name,p1sicp,p2name,p2sicp,p3name,p3sicp,p4name,p4sicp,p5name,p5sicp,p6name,p6sicp,p7name,p7sicp,p8name,p8sicp,p9name,p9sicp,pdynamic,ddist_int,ddist_abs,ddist_bin,p1emp,p2emp,p3emp,p4emp,p5emp,p6emp,p7emp,p8emp,p9emp,known_emp,known_allemp,avg_emp_pp,puplic_count
da,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
2002-01-01,88623.0,Financial Services,FIN,DC,Credit Institutions,Completed/Signed,Toyota Motor Corp\nMetrobank,"Toyota Motor Corp,\nheadquartered in Aichi, Ja...",Mnfr passenger motor vehicles\nBank,6141,6141\n6153\n6159,,3711\n3714\n6000,3711\n6000,2.0,Yes,NF,No,69478\n12114,Asian Wall Street Journal\nDuns Asia/Pacific-K...,0,N,N,,N,P\nP,Toyota Motor Corp,3711,Metrobank,6000,,,,,,,,,,,,,,,new_entrant,-1978,1978,0,69478.0,12114.0,,,,,,,,True,True,10.616339,2
2002-01-03,88649.0,Manufacturing Services,MNF,AN,Machinery,Completed/Signed,SAIC\nYuejin Automobile Group Co\nTeksid SpA,"Shanghai Automotive Industry\nCorp (Group), lo...","Mnfr,whl motor vehicles,parts\nMnfr motor vehi...",3561,3561\n3593\n3491\n3714\n2796,,3711\n3537\n3714\n6141\n6153\n6159\n7538\n5012...,3711\n3711\n3312,3.0,Yes,NF,No,\n\n,"Reuters\nEstimation\nDun's 10,000 Italian Cos",0,N,N,,N,V\nV\nS,SAIC,3711,Yuejin Automobile Group Co,3711,Teksid SpA,3312,,,,,,,,,,,,,incumbent,-1976,1976,0,,,,,,,,,,False,False,,0
2002-01-04,88660.0,Property Development Services\nConstruction Se...,PDS\nCSN\nREI,DD,Real Estate; Mortgage Bankers and Brokers,Letter of Intent,Raba Automotive Holding Plc\nEngel General Dev...,"Raba Automotive Holding Plc,\nlocated in Gyor,...","Mnfr,whl commercial vehicles\nRE dvlp firm",6552,6552\n6512,,3711\n3713\n3465\n3519\n3714\n3621\n3511\n5012...,3711\n6552,2.0,Yes,NF,No,\n,,0,N,N,,N,P\nP,Raba Automotive Holding Plc,3711,Engel General Developers Ltd,6552,,,,,,,,,,,,,,,new_entrant,-1975,1975,0,,,,,,,,,,False,False,,2
2002-01-08,88691.0,Manufacturing Services,MNF,AS,Transportation Equipment,Completed/Signed,Hyundai Corp\nMitsubishi Corp\nDaimlerChrysler AG,"Hyundai Corp, located in\nSeoul, South Korea, ...","Whl automobiles,metal prods\nMnfr,whl auto,che...",3711,3711,,5051\n5012\n5084\n5191\n5088\n5065\n5169\n6719...,5051\n2899\n3711,3.0,No,,No,267\n60520\n382724,Asian Wall Street Journal\nDuns Asia/Pacific-K...,0,N,N,,N,P\nP\nP,Hyundai Corp,5051,Mitsubishi Corp,2899,DaimlerChrysler AG,3711,,,,,,,,,,,,,new_entrant,-1971,1971,0,267.0,60520.0,382724.0,,,,,,,True,True,11.903866,3
2002-01-08,88699.0,Manufacturing Services,MNF,AS,Transportation Equipment,Pending,Proton\nGold Star Heavy Ind Mnfr,Manufacture and retail motor\nvehicles; wholes...,"Mnfr,ret motor vehicles\nMnfr auto parts",3714,3714,,3711\n5511\n5013\n3714,3711\n3714,2.0,No,,No,\n,,0,N,N,,N,P\nV,Proton,3711,Gold Star Heavy Ind Mnfr,3714,,,,,,,,,,,,,,,new_entrant,-1971,1971,0,,,,,,,,,,False,False,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-02,135098.0,Automotive Services,AUT,AS,Transportation Equipment,Completed/Signed,GAZ\nMersa Otomotiv Motorlu Araclar,"GAZ OAO, located in Nizhnii\nNovgorod, Russian...","Manufacture,whl motor vehicles\nMnfr,wholesale...",3711,3711\n5012,,3711\n5012\n3711\n5012,3711\n3711,2.0,No,,No,\n,,0,N,N,TT,Y,P\nV,GAZ,3711,Mersa Otomotiv Motorlu Araclar,3711,,,,,,,,,,,,,,,incumbent,2010,2010,1,,,,,,,,,,False,False,,1
2012-12-05,135144.0,Manufacturing Services\nRetail & Wholesale Ser...,MNF\nRWS,AS,Transportation Equipment,Pending,SAIC Motor Corp Ltd\nCharoen Pokphand Group Co...,"SAIC Motor Corp Ltd, located\nin Shanghai, Chi...","Mnfr,whl motor vehicles,parts\nOwn,operate gen...",3711,3711\n3714\n5012\n5013,,3711\n3714\n3713\n5013\n5012\n6159\n0273\n0913...,3711\n0273,2.0,Yes,NF,No,65000\n280000,,0,N,N,,N,P\nV,SAIC Motor Corp Ltd,3711,Charoen Pokphand Group Co Ltd,0273,,,,,,,,,,,,,,,new_entrant,2013,2013,1,65000.0,280000.0,,,,,,,,True,True,12.058153,1
2012-12-11,135218.0,Manufacturing Services,MNF,AS,Transportation Equipment,Pending,BYD Co Ltd\nBulmineral Ltd,"BYD Co Ltd located in\nShenzhen, China, manufa...","Mnfr,whl automobiles,parts\nInvestment company",3711,3711,,3711\n3691\n3692\n3648\n3641\n3825\n3812\n5013...,3711\n6799,2.0,Yes,NF,No,160000\n,,0,N,N,,N,P\nV,BYD Co Ltd,3711,Bulmineral Ltd,6799,,,,,,,,,,,,,,,new_entrant,2019,2019,1,160000.0,,,,,,,,,True,False,11.982929,1
2012-12-18,135299.0,Retail & Wholesale Services,RWS,CA,Wholesale Trade-Durable Goods,Pending,Daimler AG\nBAIC,"Daimler AG, located in\nStuttgart, Germany,\nm...","Manufacture, wholesale passenger cars\nMnfr,wh...",5012,5012\n5013,,3711\n3537\n5012\n6799\n6719\n3714\n3711\n3537...,3711\n3714,2.0,Yes,NF,No,280829\n,,0,N,N,,N,P\nV,Daimler AG,3711,BAIC,3714,,,,,,,,,,,,,,,new_entrant,2026,2026,1,280829.0,,,,,,,,,True,False,12.545501,1
