In [58]:
import os
import pandas as pd

In [59]:
INPUT_FOLDER = "raw"
OUTPUT_FOLDER = "preproc"

In [60]:
os.makedirs("data/preproc", exist_ok=True)

In [61]:
datasources = {source for source in os.listdir(f"data/{INPUT_FOLDER}")}
datasources

{'cache',
 'download',
 'industries.csv',
 'info',
 'us-balance-quarterly.csv',
 'us-cashflow-quarterly.csv',
 'us-companies.csv',
 'us-income-quarterly.csv',
 'us-shareprices-daily.csv'}

#### Table: `us-companies`

In [62]:
key = "us-companies"
data = pd.read_csv(f"data/{INPUT_FOLDER}/{key}.csv", delimiter=";")
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6189 entries, 0 to 6188
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Ticker                         6117 non-null   object 
 1   SimFinId                       6189 non-null   int64  
 2   Company Name                   6119 non-null   object 
 3   IndustryId                     5898 non-null   float64
 4   ISIN                           5091 non-null   object 
 5   End of financial year (month)  6120 non-null   float64
 6   Number Employees               5427 non-null   float64
 7   Business Summary               5879 non-null   object 
 8   Market                         6189 non-null   object 
 9   CIK                            6177 non-null   float64
 10  Main Currency                  6189 non-null   object 
dtypes: float64(4), int64(1), object(6)
memory usage: 532.0+ KB


In [63]:
data.head()

Unnamed: 0,Ticker,SimFinId,Company Name,IndustryId,ISIN,End of financial year (month),Number Employees,Business Summary,Market,CIK,Main Currency
0,,18538670,,,,,,,us,1734107.0,USD
1,,14159407,,,,,,,us,1823529.0,USD
2,,14159427,,,,,,,us,1680367.0,USD
3,,15112475,,,,,,,us,1841968.0,USD
4,,13676402,,,,,,,us,1751788.0,USD


In [64]:
data['Company Name'].duplicated().sum()


88

In [65]:
data['ISIN'].duplicated().sum()


1098

In [66]:
data['CIK'].duplicated().sum()


86

In [67]:
data.isnull().sum()

Ticker                             72
SimFinId                            0
Company Name                       70
IndustryId                        291
ISIN                             1098
End of financial year (month)      69
Number Employees                  762
Business Summary                  310
Market                              0
CIK                                12
Main Currency                       0
dtype: int64

In [68]:
data = data[~data['ISIN'].duplicated()]


In [69]:
data['IndustryId'].fillna(0,inplace=True)
data['ISIN'].fillna(0,inplace=True)
data['CIK'].fillna(0,inplace=True)
data['Number Employees'].fillna(0,inplace=True)
data['Business Summary'].fillna('Unknown',inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['IndustryId'].fillna(0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ISIN'].fillna(0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves 

In [70]:
data.to_csv(f"data/{OUTPUT_FOLDER}/{key}.csv", index=False)

We have to fill in the unknown company names

In [71]:
key = "us-shareprices-daily"
data = pd.read_csv(f"data/{INPUT_FOLDER}/{key}.csv",delimiter=";")
data.info(show_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5898046 entries, 0 to 5898045
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Ticker              5898046 non-null  object 
 1   SimFinId            5898046 non-null  int64  
 2   Date                5898046 non-null  object 
 3   Open                5898046 non-null  float64
 4   High                5898046 non-null  float64
 5   Low                 5898046 non-null  float64
 6   Close               5898046 non-null  float64
 7   Adj. Close          5898046 non-null  float64
 8   Volume              5898046 non-null  int64  
 9   Dividend            35968 non-null    float64
 10  Shares Outstanding  5341425 non-null  float64
dtypes: float64(7), int64(2), object(2)
memory usage: 495.0+ MB


In [72]:
data.head(10)

Unnamed: 0,Ticker,SimFinId,Date,Open,High,Low,Close,Adj. Close,Volume,Dividend,Shares Outstanding
0,A,45846,2019-04-29,77.47,78.44,77.32,77.33,74.27,2177700,,317515869.0
1,A,45846,2019-04-30,77.44,78.59,77.38,78.5,75.39,1726239,,317000000.0
2,A,45846,2019-05-01,78.49,78.92,77.28,77.47,74.4,1078572,,317000000.0
3,A,45846,2019-05-02,77.41,78.34,77.15,78.2,75.1,946764,,317000000.0
4,A,45846,2019-05-03,78.58,79.56,78.58,79.29,76.15,1303198,,317000000.0
5,A,45846,2019-05-06,77.62,79.41,77.21,79.35,76.21,1624324,,317000000.0
6,A,45846,2019-05-07,78.59,78.77,76.03,76.67,73.64,1732161,,317000000.0
7,A,45846,2019-05-08,76.78,77.3,76.13,76.61,73.58,2243580,,317000000.0
8,A,45846,2019-05-09,75.67,77.06,74.85,77.0,73.95,1717541,,317000000.0
9,A,45846,2019-05-10,76.64,77.14,74.09,77.1,74.05,2849795,,317000000.0


In [73]:
data.isnull().sum()

Ticker                      0
SimFinId                    0
Date                        0
Open                        0
High                        0
Low                         0
Close                       0
Adj. Close                  0
Volume                      0
Dividend              5862078
Shares Outstanding     556621
dtype: int64

Filling the Dividend NA with 0 because either the company does not give any dividends or it is not due on this day


In [74]:
data['Dividend'].fillna(0,inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Dividend'].fillna(0,inplace=True)


In [75]:
data['Shares Outstanding'].fillna(data['Shares Outstanding'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Shares Outstanding'].fillna(data['Shares Outstanding'].mean(), inplace=True)


In [76]:
data.to_csv(f"data/{OUTPUT_FOLDER}/{key}.csv", index=False)

In [77]:
key = "industries"
data = pd.read_csv(f"data/{INPUT_FOLDER}/{key}.csv", delimiter=";")
data.info(show_counts=True)
data.to_csv(f"data/{OUTPUT_FOLDER}/{key}.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   IndustryId  74 non-null     int64 
 1   Industry    74 non-null     object
 2   Sector      74 non-null     object
dtypes: int64(1), object(2)
memory usage: 1.9+ KB
