In [87]:
import os
import pandas as pd


In [88]:
INPUT_FOLDER = "raw"
OUTPUT_FOLDER = "preproc"

In [89]:
os.makedirs("data/preproc", exist_ok=True)

In [90]:
datasources = {source for source in os.listdir(f"data/{INPUT_FOLDER}")}
datasources

{'industries.csv',
 'us_balance_quarterly.csv',
 'us_cashflow_quarterly.csv',
 'us_companies.csv',
 'us_income_quarterly.csv',
 'us_industries.csv',
 'us_shareprices_daily.csv'}

#### Table: `us-companies`

In [91]:
key = "us_companies"
data = pd.read_csv(f"data/{INPUT_FOLDER}/{key}.csv")
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6101 entries, 0 to 6100
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   SimFinId                       6101 non-null   int64  
 1   Company Name                   6028 non-null   object 
 2   IndustryId                     5814 non-null   float64
 3   ISIN                           5013 non-null   object 
 4   End of financial year (month)  6029 non-null   float64
 5   Number Employees               5346 non-null   float64
 6   Business Summary               5790 non-null   object 
 7   Market                         6101 non-null   object 
 8   CIK                            6089 non-null   float64
 9   Main Currency                  6101 non-null   object 
dtypes: float64(4), int64(1), object(5)
memory usage: 476.8+ KB


In [92]:
data.head()

Unnamed: 0,SimFinId,Company Name,IndustryId,ISIN,End of financial year (month),Number Employees,Business Summary,Market,CIK,Main Currency
0,45846,AGILENT TECHNOLOGIES INC,106001.0,US00846U1016,10.0,16400.0,Agilent Technologies Inc is engaged in life sc...,us,1090872.0,USD
1,1333027,Li Auto Inc.,,,12.0,,,us,1791706.0,USD
2,367153,Alcoa Corp,110004.0,US0138721065,12.0,12900.0,Alcoa Corp is an integrated aluminum company. ...,us,1675149.0,USD
3,7962652,Ares Acquisition Corporation,104002.0,US0003071083,12.0,,Ares Acquisition Corporation does not have sig...,us,1829432.0,USD
4,11820349,Armada Acquisition Corp. I,104002.0,US04208V1035,9.0,,Armada Acquisition Corp. I focuses on effectin...,us,1844817.0,USD


In [93]:
data['Company Name'].duplicated().sum()


91

In [94]:
data['ISIN'].duplicated().sum()


1088

In [95]:
data['CIK'].duplicated().sum()


86

In [96]:
data = data[~data['ISIN'].duplicated()]


In [97]:
data.isnull().sum()

SimFinId                           0
Company Name                       0
IndustryId                        16
ISIN                               1
End of financial year (month)      0
Number Employees                 304
Business Summary                   9
Market                             0
CIK                                1
Main Currency                      0
dtype: int64

In [98]:
data['IndustryId'].fillna(0, inplace=True)
data['ISIN'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['IndustryId'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ISIN'].fillna(0, inplace=True)


In [99]:
data.to_csv(f"data/{OUTPUT_FOLDER}/{key}.csv", index=False)

We have to fill in the unknown company names

In [100]:
key = "us_shareprices_daily"
data = pd.read_csv(f"data/{INPUT_FOLDER}/{key}.csv", delimiter=';')
data.info(show_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5815264 entries, 0 to 5815263
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Ticker              5815264 non-null  object 
 1   SimFinId            5815264 non-null  int64  
 2   Date                5815264 non-null  object 
 3   Open                5815264 non-null  float64
 4   High                5815264 non-null  float64
 5   Low                 5815264 non-null  float64
 6   Close               5815264 non-null  float64
 7   Adj. Close          5815264 non-null  float64
 8   Volume              5815264 non-null  int64  
 9   Dividend            35476 non-null    float64
 10  Shares Outstanding  5287319 non-null  float64
dtypes: float64(7), int64(2), object(2)
memory usage: 488.0+ MB


In [101]:
data.head()


Unnamed: 0,Ticker,SimFinId,Date,Open,High,Low,Close,Adj. Close,Volume,Dividend,Shares Outstanding
0,A,45846,2019-04-11,81.88,81.92,80.89,81.08,77.87,1071479,,317515869.0
1,A,45846,2019-04-12,81.43,82.06,80.9,80.98,77.77,1249295,,317515869.0
2,A,45846,2019-04-15,81.0,81.13,79.91,80.4,77.22,1627268,,317515869.0
3,A,45846,2019-04-16,80.82,80.96,77.19,77.55,74.48,3441597,,317515869.0
4,A,45846,2019-04-17,78.15,78.32,74.46,75.43,72.44,4471971,,317515869.0


In [102]:
data.isnull().sum()

Ticker                      0
SimFinId                    0
Date                        0
Open                        0
High                        0
Low                         0
Close                       0
Adj. Close                  0
Volume                      0
Dividend              5779788
Shares Outstanding     527945
dtype: int64

In [103]:
data['Dividend'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Dividend'].fillna(0, inplace=True)


In [104]:
data.to_csv(f"data/{OUTPUT_FOLDER}/{key}.csv", index=False)