In [21]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression as lr

In [22]:
# Read the three input tables from the notebook's working directory.
# The CEO table's TOTAL_ALT1 column is renamed to STOCKOPT as soon as it is loaded.
salary = (
    pd.read_csv('CEOData 1992-2018.csv')
    .rename(columns={"TOTAL_ALT1": "STOCKOPT"})
)
company = pd.read_csv('CompanyData 1950-2018.csv')
sector = pd.read_csv('Companies.csv')


# Left-join the company table onto the CEO table by (GVKEY, YEAR): every CEO row
# is kept, and rows without a company match get missing values on that side.
# The merge produces TICKER_x (CEO side) and TICKER_y (company side); only the
# company-side ticker is kept, under the plain name TICKER. DATE is dropped too.
joined = (
    salary
    .merge(company, how='left', on=['GVKEY', 'YEAR'])
    .drop(columns=["TICKER_x", "DATE"])
    .rename(columns={"TICKER_y": "TICKER"})
)


# Left-join the `sector` table onto the merged CEO/company rows by TICKER.
# All rows of `joined` are kept; tickers absent from `sector` get missing values
# in the columns that come from it.
# NOTE(review): if `sector` has more than one row per TICKER this join duplicates
# rows of `joined` — confirm TICKER is unique in Companies.csv.
df = joined.merge(sector, how='left', on=['TICKER'])
# Give every state an integer id. Sorting by STATE first means factorize sees
# the states in sorted order, so ids 1, 2, 3, ... follow that order.
# With factorize's default missing-value sentinel (-1), rows that have no STATE
# end up with STATEID 0.
df = df.sort_values("STATE")
state_codes = pd.factorize(df['STATE'])[0]
df['STATEID'] = state_codes + 1

# Parse the day/month/year strings into datetimes, then store each date's
# calendar year in a companion "...YE" column.
for date_col, year_col in (('BECAMECEO', 'BECAMECEOYE'), ('LEFTOFC', 'LEFTOFCYE')):
    df[date_col] = pd.to_datetime(df[date_col], format='%d/%m/%Y')
    df[year_col] = df[date_col].dt.year

# Keep an observation only if its YEAR lies inside the CEO's term: on or after
# the year they became CEO, and either on or before the year they left office
# or with no leave date recorded at all.
started = df['YEAR'] >= df['BECAMECEOYE']
not_yet_left = (df['YEAR'] <= df['LEFTOFCYE']) | df['LEFTOFCYE'].isna()
df = df[started & not_yet_left]

# Discard rows missing a start date, a state or a ticker, then order each
# ticker's rows chronologically so pct_change compares consecutive rows.
df = (
    df
    .dropna(subset=['BECAMECEO', 'STATE', 'TICKER'])
    .sort_values(by=['TICKER', 'YEAR'])
)

# Return = relative change in PRICEC versus the previous remaining row of the
# same ticker (the first row of each ticker has no predecessor and gets NaN).
# NOTE(review): pct_change runs with its default fill behaviour, which has
# changed across pandas releases — confirm the result on the installed version.
df['SIMPLE_RETURN'] = df.groupby('TICKER')['PRICEC'].pct_change()

# Drop rows whose return is exactly zero. NaN compares as "not equal to 0", so
# rows with a missing return are kept.
df = df.loc[df['SIMPLE_RETURN'].ne(0)]

# Order rows chronologically within each CO_PER_ROL group so the relative
# changes below compare each row with the previous row of the same group.
df = df.sort_values(by=['CO_PER_ROL', 'YEAR'])
by_role = df.groupby('CO_PER_ROL')
salary_change = by_role['SALARY'].pct_change()
stock_change = by_role['STOCKOPT'].pct_change()
df['SALCH'] = salary_change
df['STOCH'] = stock_change

# Rescaled copies of the pay columns.
# NOTE(review): SALARY is divided by 100 while STOCKOPT is divided by 1000, and
# the "_THOU" suffix suggests a thousands scale — confirm the intended divisor
# and units before interpreting coefficients on SALARY_THOU.
df['SALARY_THOU'] = df['SALARY'].div(100)
df['STOCKOPT_MIL'] = df['STOCKOPT'].div(1000)

# Fractions expressed as percentages.
df['RETURNS'] = df['SIMPLE_RETURN'].mul(100)
df['SAL_CH_PER'] = df['SALCH'].mul(100)
df['STO_CH_PER'] = df['STOCH'].mul(100)

# Show the finished frame as the cell's output.
df

Unnamed: 0,CO_PER_ROL,SALARY,STOCKOPT,GVKEY,YEAR,BECAMECEO,LEFTOFC,TITLE,EXEC_LNAME,EXEC_FNAME,...,BECAMECEOYE,LEFTOFCYE,SIMPLE_RETURN,SALCH,STOCH,SALARY_THOU,STOCKOPT_MIL,RETURNS,SAL_CH_PER,STO_CH_PER
160,6,761.535,,1078,1992,1989-12-01,1998-12-31,chmn.,Burnham,Duane,...,1989.0,1998.0,,,,7.61535,,,,
165,6,772.615,,1078,1993,1989-12-01,1998-12-31,chmn.,Burnham,Duane,...,1989.0,1998.0,-0.024691,0.014550,,7.72615,,-2.469136,1.454956,
171,6,794.269,,1078,1994,1989-12-01,1998-12-31,chmn.,Burnham,Duane,...,1989.0,1998.0,0.101266,0.028027,,7.94269,,10.126582,2.802690,
177,6,818.269,,1078,1995,1989-12-01,1998-12-31,chmn.,Burnham,Duane,...,1989.0,1998.0,0.275862,0.030216,,8.18269,,27.586207,3.021646,
184,6,846.923,,1078,1996,1989-12-01,1998-12-31,chmn.,Burnham,Duane,...,1989.0,1998.0,0.219219,0.035018,,8.46923,,21.921922,3.501782,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49553,63380,831.731,4130.910,28477,2015,2013-05-19,NaT,,Teich,Andrew,...,2013.0,,-0.131229,0.144544,1.022649,8.31731,4.130910,-13.122872,14.454404,102.264867
49558,63380,835.731,5737.770,28477,2016,2013-05-19,NaT,,Teich,Andrew,...,2013.0,,0.289277,0.004809,0.388985,8.35731,5.737770,28.927681,0.480925,38.898451
48101,66910,591.667,4085.956,27638,2016,2016-11-01,NaT,"President, CEO & Director",Harvey,Roy,...,2016.0,,,,,5.91667,4.085956,,,
48106,66910,925.000,9869.766,27638,2017,2016-11-01,NaT,"President, CEO & Director",Harvey,Roy,...,2016.0,,0.918447,0.563379,1.415534,9.25000,9.869766,91.844729,56.337940,141.553409


In [23]:
# Export the merged frame as CSV, without the index column.
# The export folder used so far exists on one specific Windows machine only;
# on any other machine to_csv fails because the directory is missing. Keep
# writing there when the folder exists, otherwise fall back to the current
# working directory, which is where the input CSVs are read from.
legacy_output_dir = Path(r'C:\Users\yuxua\Desktop\ECON 494\Project Draft')
output_dir = legacy_output_dir if legacy_output_dir.is_dir() else Path.cwd()
output_path = output_dir / 'Merged Data t.csv'
df.to_csv(output_path, index = False)
# Say where the file went, since the destination now depends on the machine.
print(f"Merged data written to {output_path}")