In [1]:
import os
import pandas as pd

In [2]:
INPUT_FOLDER = "raw"
OUTPUT_FOLDER = "preproc"

In [3]:
os.makedirs("data/preproc", exist_ok=True)

In [4]:
datasources = {source for source in os.listdir(f"data/{INPUT_FOLDER}")}
datasources

{'us_balance_quarterly.csv',
 'us_cashflow_quarterly.csv',
 'us_companies.csv',
 'us_income_quarterly.csv',
 'us_industries.csv',
 'us_shareprices_daily.csv'}

#### Table: `us-companies`

In [5]:
key = "us_companies"
data = pd.read_csv(f"data/{INPUT_FOLDER}/{key}.csv")
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6098 entries, 0 to 6097
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   SimFinId                       6098 non-null   int64  
 1   Company Name                   6025 non-null   object 
 2   IndustryId                     5811 non-null   float64
 3   ISIN                           5011 non-null   object 
 4   End of financial year (month)  6026 non-null   float64
 5   Number Employees               5343 non-null   float64
 6   Business Summary               5787 non-null   object 
 7   Market                         6098 non-null   object 
 8   CIK                            6086 non-null   float64
 9   Main Currency                  6098 non-null   object 
dtypes: float64(4), int64(1), object(5)
memory usage: 476.5+ KB


In [6]:
data.head()

Unnamed: 0,SimFinId,Company Name,IndustryId,ISIN,End of financial year (month),Number Employees,Business Summary,Market,CIK,Main Currency
0,45846,AGILENT TECHNOLOGIES INC,106001.0,US00846U1016,10.0,16400.0,Agilent Technologies Inc is engaged in life sc...,us,1090872.0,USD
1,1333027,Li Auto Inc.,,,12.0,,,us,1791706.0,USD
2,367153,Alcoa Corp,110004.0,US0138721065,12.0,12900.0,Alcoa Corp is an integrated aluminum company. ...,us,1675149.0,USD
3,7962652,Ares Acquisition Corporation,104002.0,US0003071083,12.0,,Ares Acquisition Corporation does not have sig...,us,1829432.0,USD
4,11820349,Armada Acquisition Corp. I,104002.0,US04208V1035,9.0,,Armada Acquisition Corp. I focuses on effectin...,us,1844817.0,USD


In [7]:
data['Company Name'].duplicated().sum()


91

In [8]:
data['ISIN'].duplicated().sum()


1087

In [9]:
data['CIK'].duplicated().sum()


86

In [37]:
data.isnull().sum()

SimFinId                         0
Company Name                     0
IndustryId                       0
ISIN                             0
End of financial year (month)    0
Number Employees                 0
Business Summary                 0
Market                           0
CIK                              0
Main Currency                    0
dtype: int64

In [11]:
data = data[~data['ISIN'].duplicated()]


In [36]:
data['IndustryId'].fillna(0,inplace=True)
data['ISIN'].fillna(0,inplace=True)
data['CIK'].fillna(0,inplace=True)
data['Number Employees'].fillna(0,inplace=True)
data['Business Summary'].fillna('Unknown',inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['IndustryId'].fillna(0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['CIK'].fillna(0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves a

In [38]:
data.to_csv(f"data/{OUTPUT_FOLDER}/{key}.csv", index=False)

We have to fill in the unknown company names

In [40]:
key = "us_shareprices_daily"
data = pd.read_csv(f"data/{INPUT_FOLDER}/{key}.csv")
data.info(show_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5799855 entries, 0 to 5799854
Data columns (total 9 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   SimFinId            5799855 non-null  int64  
 1   Open                5799855 non-null  float64
 2   High                5799855 non-null  float64
 3   Low                 5799855 non-null  float64
 4   Close               5799855 non-null  float64
 5   Adj. Close          5799855 non-null  float64
 6   Volume              5799855 non-null  int64  
 7   Dividend            35226 non-null    float64
 8   Shares Outstanding  5274656 non-null  float64
dtypes: float64(7), int64(2)
memory usage: 398.2 MB


In [43]:
data.head(10)

Unnamed: 0,SimFinId,Open,High,Low,Close,Adj. Close,Volume,Dividend,Shares Outstanding
0,45846,81.57,81.71,80.58,81.69,78.46,783350,,317515869.0
1,45846,81.56,81.72,81.27,81.42,78.2,1254742,,317515869.0
2,45846,81.45,82.24,81.45,81.68,78.45,982886,,317515869.0
3,45846,81.88,81.92,80.89,81.08,77.87,1071479,,317515869.0
4,45846,81.43,82.06,80.9,80.98,77.77,1249295,,317515869.0
5,45846,81.0,81.13,79.91,80.4,77.22,1627268,,317515869.0
6,45846,80.82,80.96,77.19,77.55,74.48,3441597,,317515869.0
7,45846,78.15,78.32,74.46,75.43,72.44,4471971,,317515869.0
8,45846,75.73,76.54,75.31,76.17,73.16,2874195,,317515869.0
9,45846,75.93,76.72,75.13,75.57,72.58,2016043,,317515869.0


In [51]:
data.isnull().sum()

SimFinId              0
Open                  0
High                  0
Low                   0
Close                 0
Adj. Close            0
Volume                0
Dividend              0
Shares Outstanding    0
dtype: int64

Filling the Dividend NA with 0 because either the company does not give any dividends or it is not due on this day


In [None]:
data['Dividend'].fillna(0,inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Dividend'].fillna(0,inplace=True)


In [50]:
data['Shares Outstanding'].fillna(data['Shares Outstanding'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Shares Outstanding'].fillna(data['Shares Outstanding'].mean(), inplace=True)


In [52]:
data.to_csv(f"data/{OUTPUT_FOLDER}/{key}.csv", index=False)