In [164]:
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
from config import db_password

In [165]:
# Build the empty provisional frame. The schema is: identifying fields,
# headcount, each financial metric followed directly by its per-employee
# counterpart, and finally the mostly_remote flag.

df = pd.DataFrame(
    columns=[
        "Year",
        "Ticker",
        "Company",
        "Sector",
        "numEmployees",
        "netIncome",
        "netIncome_per_emp",
        "grossProfit",
        "grossProfit_per_emp",
        "operatingIncome",
        "operatingIncome_per_emp",
        "totalRevenue",
        "totalRevenue_per_emp",
        "totalOperatingExpenses",
        "totalOperatingExpenses_per_emp",
        "mostly_remote",
    ]
)

In [166]:
# Preview the provisional frame; at this point it only has column headers and no rows

df

Unnamed: 0,Year,Ticker,Company,Sector,numEmployees,netIncome,netIncome_per_emp,grossProfit,grossProfit_per_emp,operatingIncome,operatingIncome_per_emp,totalRevenue,totalRevenue_per_emp,totalOperatingExpenses,totalOperatingExpenses_per_emp,mostly_remote


In [167]:
# Import 2021 employee headcount data

df2 = pd.read_csv(
    "data/employee_data/snp_500_companies_employees.csv",
    thousands=",",  # parse numbers written with comma thousands separators
    encoding="windows-1252",
)

# Pull out the columns (as pandas Series, not plain lists) that are copied
# into the provisional frame: tickers, company names, sectors and headcounts

tickers, companies, sectors, num_employees = (
    df2[column] for column in ("Ticker", "Company", "Sector", "numEmployees")
)



In [168]:
# Copy the employee-file columns into the provisional frame and stamp every
# row with the reporting year.
#
# Keyword order matters: the first Series assignment gives the (initially
# empty) frame its row index, so the scalar Year must come after it in order
# to be broadcast to every row.

df = df.assign(
    Ticker=tickers,
    Company=companies,
    Year=2021,
    Sector=sectors,
    numEmployees=num_employees,
)



In [169]:
# Coerce each (still empty) financial column to a numeric dtype so that
# numbers can be stored in it later

for metric in (
    "netIncome",
    "grossProfit",
    "operatingIncome",
    "totalRevenue",
    "totalOperatingExpenses",
):
    df[metric] = pd.to_numeric(df[metric])

In [170]:
# Check the dtypes: the five converted financial columns should now be float64
df.dtypes

Year                                int64
Ticker                             object
Company                            object
Sector                             object
numEmployees                        int64
netIncome                         float64
netIncome_per_emp                  object
grossProfit                       float64
grossProfit_per_emp                object
operatingIncome                   float64
operatingIncome_per_emp            object
totalRevenue                      float64
totalRevenue_per_emp               object
totalOperatingExpenses            float64
totalOperatingExpenses_per_emp     object
mostly_remote                      object
dtype: object

In [171]:
# Inspect the frame: identifiers and headcount are filled, the financial columns are still empty
df

Unnamed: 0,Year,Ticker,Company,Sector,numEmployees,netIncome,netIncome_per_emp,grossProfit,grossProfit_per_emp,operatingIncome,operatingIncome_per_emp,totalRevenue,totalRevenue_per_emp,totalOperatingExpenses,totalOperatingExpenses_per_emp,mostly_remote
0,2021,WMT,Walmart Inc.,Retail Trade,2300000,,,,,,,,,,,
1,2021,AMZN,"Amazon.com, Inc.",Retail Trade,1298000,,,,,,,,,,,
2,2021,ACN,Accenture plc,Technology Services,624000,,,,,,,,,,,
3,2021,UPS,"United Parcel Service, Inc.",Transportation,543000,,,,,,,,,,,
4,2021,HD,"Home Depot, Inc. (The)",Retail Trade,504800,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,2021,FRT,Federal Realty Investment Trust,Finance,307,,,,,,,,,,,
494,2021,PEAK,"Healthpeak Properties, Inc.",Finance,217,,,,,,,,,,,
495,2021,O,Realty Income Corporation,Finance,210,,,,,,,,,,,
496,2021,HST,Host Hotels,Finance,163,,,,,,,,,,,


In [172]:
# Fill every financial metric with placeholder random integers.
#
# The global NumPy RNG is seeded first so that a fresh "Restart & Run All"
# reproduces the same placeholder table; without a seed every run wrote
# different numbers. Any later draw from np.random in this notebook becomes
# repeatable as well. Only rows that are still null are filled, so re-running
# this cell does not overwrite values that are already present.

np.random.seed(42)

for metric in (
    "netIncome",
    "grossProfit",
    "operatingIncome",
    "totalRevenue",
    "totalOperatingExpenses",
):
    missing = df[metric].isnull()

    # One draw from [0, 1_000_000_000) per missing row.
    df.loc[missing, metric] = np.random.randint(1_000_000_000, size=missing.sum())

In [173]:
# Inspect the frame after filling the financial columns with random placeholder values
df

Unnamed: 0,Year,Ticker,Company,Sector,numEmployees,netIncome,netIncome_per_emp,grossProfit,grossProfit_per_emp,operatingIncome,operatingIncome_per_emp,totalRevenue,totalRevenue_per_emp,totalOperatingExpenses,totalOperatingExpenses_per_emp,mostly_remote
0,2021,WMT,Walmart Inc.,Retail Trade,2300000,660520661,,88360892,,825859804,,172196312,,743101160,,
1,2021,AMZN,"Amazon.com, Inc.",Retail Trade,1298000,980173304,,705567297,,894557137,,77381297,,82905570,,
2,2021,ACN,Accenture plc,Technology Services,624000,11874548,,135634647,,906664141,,290468786,,577047230,,
3,2021,UPS,"United Parcel Service, Inc.",Transportation,543000,471977569,,267986240,,602474640,,174914891,,209622062,,
4,2021,HD,"Home Depot, Inc. (The)",Retail Trade,504800,17804705,,263857298,,998245511,,42049250,,116114910,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,2021,FRT,Federal Realty Investment Trust,Finance,307,276298835,,380831822,,89733225,,836538368,,532057165,,
494,2021,PEAK,"Healthpeak Properties, Inc.",Finance,217,103844190,,891775038,,484213025,,669440056,,683590735,,
495,2021,O,Realty Income Corporation,Finance,210,717209057,,59652794,,544687195,,104288784,,932769499,,
496,2021,HST,Host Hotels,Finance,163,54576295,,967390772,,70187780,,424928860,,781929374,,


In [174]:
# Give every row whose mostly_remote flag is still unset a random 0 or 1

unset_flags = df["mostly_remote"].isnull()

df.loc[unset_flags, "mostly_remote"] = np.random.randint(2, size=unset_flags.sum())

In [175]:
# Re-check the dtypes now that the financial and mostly_remote columns hold random integers
df.dtypes

Year                               int64
Ticker                            object
Company                           object
Sector                            object
numEmployees                       int64
netIncome                          int32
netIncome_per_emp                 object
grossProfit                        int32
grossProfit_per_emp               object
operatingIncome                    int32
operatingIncome_per_emp           object
totalRevenue                       int32
totalRevenue_per_emp              object
totalOperatingExpenses             int32
totalOperatingExpenses_per_emp    object
mostly_remote                      int32
dtype: object

In [176]:


# Inspect the frame now that mostly_remote is populated
df

Unnamed: 0,Year,Ticker,Company,Sector,numEmployees,netIncome,netIncome_per_emp,grossProfit,grossProfit_per_emp,operatingIncome,operatingIncome_per_emp,totalRevenue,totalRevenue_per_emp,totalOperatingExpenses,totalOperatingExpenses_per_emp,mostly_remote
0,2021,WMT,Walmart Inc.,Retail Trade,2300000,660520661,,88360892,,825859804,,172196312,,743101160,,0
1,2021,AMZN,"Amazon.com, Inc.",Retail Trade,1298000,980173304,,705567297,,894557137,,77381297,,82905570,,1
2,2021,ACN,Accenture plc,Technology Services,624000,11874548,,135634647,,906664141,,290468786,,577047230,,0
3,2021,UPS,"United Parcel Service, Inc.",Transportation,543000,471977569,,267986240,,602474640,,174914891,,209622062,,0
4,2021,HD,"Home Depot, Inc. (The)",Retail Trade,504800,17804705,,263857298,,998245511,,42049250,,116114910,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,2021,FRT,Federal Realty Investment Trust,Finance,307,276298835,,380831822,,89733225,,836538368,,532057165,,1
494,2021,PEAK,"Healthpeak Properties, Inc.",Finance,217,103844190,,891775038,,484213025,,669440056,,683590735,,1
495,2021,O,Realty Income Corporation,Finance,210,717209057,,59652794,,544687195,,104288784,,932769499,,0
496,2021,HST,Host Hotels,Finance,163,54576295,,967390772,,70187780,,424928860,,781929374,,0


In [177]:
# Derive each per-employee column by dividing the matching financial metric
# by the company's headcount

per_employee_columns = {
    "netIncome": "netIncome_per_emp",
    "grossProfit": "grossProfit_per_emp",
    "operatingIncome": "operatingIncome_per_emp",
    "totalRevenue": "totalRevenue_per_emp",
    "totalOperatingExpenses": "totalOperatingExpenses_per_emp",
}

for metric, per_employee in per_employee_columns.items():
    df[per_employee] = df[metric] / df["numEmployees"]

df

Unnamed: 0,Year,Ticker,Company,Sector,numEmployees,netIncome,netIncome_per_emp,grossProfit,grossProfit_per_emp,operatingIncome,operatingIncome_per_emp,totalRevenue,totalRevenue_per_emp,totalOperatingExpenses,totalOperatingExpenses_per_emp,mostly_remote
0,2021,WMT,Walmart Inc.,Retail Trade,2300000,660520661,2.871829e+02,88360892,3.841778e+01,825859804,3.590695e+02,172196312,7.486796e+01,743101160,3.230875e+02,0
1,2021,AMZN,"Amazon.com, Inc.",Retail Trade,1298000,980173304,7.551412e+02,705567297,5.435804e+02,894557137,6.891812e+02,77381297,5.961579e+01,82905570,6.387178e+01,1
2,2021,ACN,Accenture plc,Technology Services,624000,11874548,1.902972e+01,135634647,2.173632e+02,906664141,1.452987e+03,290468786,4.654948e+02,577047230,9.247552e+02,0
3,2021,UPS,"United Parcel Service, Inc.",Transportation,543000,471977569,8.692036e+02,267986240,4.935290e+02,602474640,1.109530e+03,174914891,3.221269e+02,209622062,3.860443e+02,0
4,2021,HD,"Home Depot, Inc. (The)",Retail Trade,504800,17804705,3.527081e+01,263857298,5.226967e+02,998245511,1.977507e+03,42049250,8.329883e+01,116114910,2.300216e+02,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,2021,FRT,Federal Realty Investment Trust,Finance,307,276298835,8.999962e+05,380831822,1.240495e+06,89733225,2.922906e+05,836538368,2.724881e+06,532057165,1.733085e+06,1
494,2021,PEAK,"Healthpeak Properties, Inc.",Finance,217,103844190,4.785447e+05,891775038,4.109562e+06,484213025,2.231396e+06,669440056,3.084977e+06,683590735,3.150188e+06,1
495,2021,O,Realty Income Corporation,Finance,210,717209057,3.415281e+06,59652794,2.840609e+05,544687195,2.593749e+06,104288784,4.966133e+05,932769499,4.441760e+06,0
496,2021,HST,Host Hotels,Finance,163,54576295,3.348239e+05,967390772,5.934913e+06,70187780,4.305999e+05,424928860,2.606926e+06,781929374,4.797113e+06,0


In [178]:
# Confirm the per-employee columns are now float64
df.dtypes

Year                                int64
Ticker                             object
Company                            object
Sector                             object
numEmployees                        int64
netIncome                           int32
netIncome_per_emp                 float64
grossProfit                         int32
grossProfit_per_emp               float64
operatingIncome                     int32
operatingIncome_per_emp           float64
totalRevenue                        int32
totalRevenue_per_emp              float64
totalOperatingExpenses              int32
totalOperatingExpenses_per_emp    float64
mostly_remote                       int32
dtype: object

In [179]:
# Export the provisional table to the local PostgreSQL database.
#
# db_password is imported from the local `config` module in the import cell.
# NOTE(review): a password containing URL-special characters such as "@" or
# "/" would need URL-encoding before being interpolated here -- confirm.

db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/group_project"

engine = create_engine(db_string)

# to_sql defaults to if_exists="fail", which raises ValueError once the table
# exists -- i.e. on every run after the first. Replacing the table keeps the
# notebook re-runnable from a fresh kernel. The DataFrame index is still
# written as a column, as before.
df.to_sql(name='provisional_database', con=engine, if_exists="replace")
