In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw company dataset from the project's data directory.
raw_csv_path = '../../data/raw-data/american_companies.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Count the rows recorded for each company; the number of groups in the
# result is the number of distinct companies in the file.
company_count = df.groupby(by='company_name').size()

In [4]:
# Inspect the raw schema: company identifier, status label, year and the
# anonymised feature columns X1..X18.
df.columns

Index(['company_name', 'status_label', 'year', 'X1', 'X2', 'X3', 'X4', 'X5',
       'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16',
       'X17', 'X18'],
      dtype='object')

In [5]:

# Number of distinct companies (one entry per company in the grouped counts).
len(company_count)

8971

In [6]:

# Define industries with their sampling probabilities (values must sum to 1)
industries = {
    'Technology': 0.2,
    'Finance': 0.15,
    'Healthcare': 0.15,
    'Retail': 0.27,
    'Manufacturing': 0.23
}

industry_list = list(industries.keys())
industry_probs = list(industries.values())

# Define states with probabilities based on hypothetical business density
states = [
    'California', 'New York', 'Texas', 'Florida', 'Illinois',
    'Washington', 'Georgia', 'Pennsylvania', 'Ohio', 'North Carolina'
]
state_probs = [0.20, 0.15, 0.13, 0.10, 0.08, 0.07, 0.06, 0.05, 0.09, 0.07]

# Define company status distribution: 17% public, 83% private
company_statuses = ['Public', 'Private']
status_probs = [0.17, 0.83]

# Random seed for reproducibility
np.random.seed(33)

# The raw file identifies companies by 'company_name'; a later cell renames
# that column to 'company_id'. Accept either so this cell can be re-run.
company_key = 'company_id' if 'company_id' in df.columns else 'company_name'

# Draw every attribute ONCE PER COMPANY rather than once per row: the frame
# holds several yearly rows per company, and a company must keep the same
# state, industry and status across all of its rows.
company_keys = df[company_key].unique()
company_attributes = pd.DataFrame(
    {
        'state': np.random.choice(states, size=len(company_keys), p=state_probs),
        'industry': np.random.choice(industry_list, size=len(company_keys), p=industry_probs),
        'company_status': np.random.choice(company_statuses, size=len(company_keys), p=status_probs),
    },
    index=company_keys,
)

# Broadcast the per-company draws back onto every row of that company.
for attribute in company_attributes.columns:
    df[attribute] = df[company_key].map(company_attributes[attribute])



In [7]:
# Rename the raw identifier column 'company_name' to 'company_id'.
# Guarded so that re-running the cell is a no-op: a later cell creates a new
# 'company_name' column holding generated names, and renaming that one as
# well would leave the frame with two 'company_id' columns.
if 'company_id' not in df.columns:
    df = df.rename(columns={'company_name': 'company_id'})


In [8]:
from faker import Faker


# Set up Faker for generating company names.
fake = Faker()
# Seed Faker's shared random generator so the generated names are
# reproducible across runs (numpy's seed does not affect Faker).
Faker.seed(33)

# Step 1: Get all unique company IDs from the DataFrame
unique_companies = df['company_id'].unique()

# Step 2: Generate a unique company name for each unique company ID.
# Names already handed out are tracked in a set so each uniqueness check is
# O(1) instead of a linear scan over all previously generated names.
company_names = {}
used_names = set()
for company_id in unique_companies:
    new_name = fake.company()
    while new_name in used_names:  # Redraw until the name is unused
        new_name = fake.company()
    used_names.add(new_name)
    company_names[company_id] = new_name

# Step 3: Map the new names to the DataFrame using the company_id
df['company_name'] = df['company_id'].map(company_names)



In [9]:
# Earliest year in which each company appears in the data.
min_year_by_company = df.groupby('company_id')['year'].min()


def _years_before_first_record(first_year):
    """Return a founding year 1 to 4 years before ``first_year``.

    ``np.random.randint(1, 5)`` excludes its upper bound, so the offset is
    drawn from {1, 2, 3, 4}; one draw is made per call.
    """
    return first_year - np.random.randint(1, 5)


# One random offset per company (not per row), so every row of a company
# receives the same founding year.
year_founded = min_year_by_company.apply(_years_before_first_record)

# Attach the per-company founding year to each of that company's rows.
df['year_founded'] = df['company_id'].map(year_founded)

# Print a random sample of rows to eyeball the new 'year_founded' column.
print(df[['company_id', 'year', 'year_founded']].sample(10))

      company_id  year  year_founded
35695     C_3592  2013          1997
34364     C_3460  2005          1998
11122      C_926  2012          1995
53979     C_5736  2016          1998
60165     C_6443  2005          1995
55685     C_5939  2000          1995
53583     C_5690  1999          1997
20687     C_1739  2006          1997
30366     C_2953  2003          1997
28548     C_2626  2016          2012


In [10]:
# Step 1: Define typical quarter-end dates
quarter_ends = ['03-31', '06-30', '09-30', '12-31']

# Step 2: Draw one quarter-end per row in a single vectorised call instead of
# invoking np.random.choice once per row through a Python-level apply.
random_quarter_ends = pd.Series(
    np.random.choice(quarter_ends, size=len(df)),
    index=df.index,
)

# Step 3: Build 'YYYY-MM-DD' strings and parse them with an explicit format
# so the conversion does not depend on pandas' format inference.
df['date'] = pd.to_datetime(
    df['year'].astype(str) + '-' + random_quarter_ends,
    format='%Y-%m-%d',
)

# Print a random sample of rows to verify the new 'date' column
print(df[['company_id', 'year', 'date']].sample(10))


      company_id  year       date
56687     C_6060  2001 2001-03-31
2964       C_246  2014 2014-12-31
38431     C_3952  2002 2002-03-31
31057     C_3039  2005 2005-06-30
75336     C_8354  2016 2016-03-31
63878     C_6877  2002 2002-03-31
59226     C_6343  1999 1999-09-30
1380       C_110  2012 2012-09-30
1609       C_130  2012 2012-09-30
55659     C_5936  2000 2000-06-30


In [11]:
# Re-inspect the schema after enrichment; it should now also list 'state',
# 'industry', 'company_status', 'company_name', 'year_founded' and 'date'.
df.columns

Index(['company_id', 'status_label', 'year', 'X1', 'X2', 'X3', 'X4', 'X5',
       'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16',
       'X17', 'X18', 'state', 'industry', 'company_status', 'company_name',
       'year_founded', 'date'],
      dtype='object')

In [13]:
# Spot-check three random rows of the enriched frame.
# NOTE(review): the saved output of this cell shows descriptive feature names
# (e.g. current_assets, ebitda) that no visible cell creates, and this cell's
# execution count (13) comes before the export cell's (12). The output looks
# stale -- confirm with Restart Kernel -> Run All before relying on it.
df.sample(3)

Unnamed: 0,company_id,status_label,year,current_assets,cost_of_goods_sold,depreciation_and_amortization,ebitda,inventory,net_income,total_receivables,...,retained_earnings,total_revenue,total_liabilities,total_operating_expenses,state,industry,company_status,company_name,year_founded,date
49728,C_5251,alive,2006,31.832,26.604,2.668,-15.549,2.937,1.853,16.37,...,-0.144,59.05,31.754,50.663,Pennsylvania,Finance,Public,Rodriguez-Johnson,1997,2006-06-30
29406,C_2782,failed,2018,61.253,171.822,2.553,-332.363,0.515,-186.512,1.861,...,-559.35,8.728,88.577,171.822,New York,Technology,Private,Burns-King,2013,2018-09-30
4157,C_341,alive,2008,248.318,780.837,68.273,-566.179,65.497,9.091,116.293,...,21.148,1463.615,1189.066,1317.23,California,Finance,Private,"Leblanc, Johnson and Anderson",1997,2008-03-31


In [12]:
# Persist the enriched frame as CSV; index=False keeps the row index out of
# the file.
# NOTE(review): this writes a derived dataset into the raw-data directory --
# confirm that is the intended location.
output_csv_path = '../../data/raw-data/us_companies.csv'
df.to_csv(output_csv_path, index=False)