In [1]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Enable the autoreload extension; mode 2 reloads imported modules before each
# cell runs, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
import os

import pandas as pd

In [4]:
%matplotlib inline
# Raise the pandas display limits so wide/long frames are shown without truncation.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

In [5]:
# Worksheet names read from (and written back to) the Excel workbooks below.
SHEET_NAME_STOCK_INFO = "stock_info"
SHEET_NAME_STOCK_FINANCIALS = "stock_financials"
SHEET_NAME_STOCK_DIVIDENDS = "stock_dividends"

# Data Cleansing

In [6]:
# FROM_FILE is the workbook being merged in; TO_FILE is the accumulated workbook
# that is read here and overwritten by the save cell at the end.
FROM_FILE = "data/alphalib_united_states_5.xlsx"
TO_FILE = "data/alphalib_united_states_new.xlsx"

# *_1 always comes from TO_FILE, *_2 from FROM_FILE (read in that order).
stock_info_1, stock_info_2 = (
    pd.read_excel(workbook, sheet_name=SHEET_NAME_STOCK_INFO, engine="openpyxl")
    for workbook in (TO_FILE, FROM_FILE)
)
print(stock_info_1.shape[1], stock_info_2.shape[1])

156 153


In [7]:
def create_missing_cols(df, target_cols):
    """Add every column named in ``target_cols`` that ``df`` lacks, filled with None.

    The frame is modified in place and nothing is returned. New columns are
    appended in the order they first appear in ``target_cols`` so the resulting
    layout is the same on every run (iterating a set difference is not).

    Args:
        df: DataFrame to extend in place.
        target_cols: Iterable of column names the frame should end up containing.
    """
    existing = set(df.columns)
    # dict.fromkeys drops duplicate names while keeping first-seen order.
    missing_cols = [col for col in dict.fromkeys(target_cols) if col not in existing]
    for col in missing_cols:
        df[col] = None

In [8]:
# Create missing columns for stock_info.
# The two frames are compared by column NAME (not by column count): each frame
# that lacks a column from the combined set gets it added and is then put into
# sorted column order. A frame that already has every column is left untouched.
print("Create missing columns")
stock_info_columns = sorted(set(stock_info_1.columns) | set(stock_info_2.columns))
if set(stock_info_1.columns) == set(stock_info_2.columns):
    print("No action required")
else:
    if len(stock_info_1.columns) < len(stock_info_columns):
        create_missing_cols(stock_info_1, stock_info_columns)
        stock_info_1 = stock_info_1[stock_info_columns]
    if len(stock_info_2.columns) < len(stock_info_columns):
        create_missing_cols(stock_info_2, stock_info_columns)
        stock_info_2 = stock_info_2[stock_info_columns]
print(len(stock_info_1.columns), len(stock_info_2.columns))

# Remove overlapped stocks: keep only rows of stock_info_2 whose symbol is not
# already present in stock_info_1.
print("Remove overlapped stocks")
is_new_symbol = ~stock_info_2["symbol"].isin(stock_info_1["symbol"])
print(len(stock_info_2), len(stock_info_2[is_new_symbol]))
stock_info_2 = stock_info_2[is_new_symbol]

Create missing columns
156 156
Remove overlapped stocks
1061 1055


In [9]:
# Stack the two stock_info frames into one table with a fresh 0..n-1 index.
stock_info_all = pd.concat((stock_info_1, stock_info_2), ignore_index=True)
print(stock_info_all.shape[0])

3570


In [10]:
# Clean up stock_dividends
stock_dividends_1 = pd.read_excel(TO_FILE, sheet_name=SHEET_NAME_STOCK_DIVIDENDS, engine="openpyxl")
stock_dividends_2 = pd.read_excel(FROM_FILE, sheet_name=SHEET_NAME_STOCK_DIVIDENDS, engine="openpyxl")
print(len(stock_dividends_1.columns), len(stock_dividends_2.columns))

print(stock_dividends_1.columns)
print(stock_dividends_2.columns)

cols_1 = stock_dividends_1.columns.tolist()
cols_1.sort()

cols_2 = stock_dividends_2.columns.tolist()
cols_2.sort()

if (cols_1 == cols_2):
    print("Same columns")
else:
    print("!!!!!!!!!!! Not the same ")
    if len(stock_dividends_1.columns) > len(stock_dividends_2.columns):
        stock_dividends_columns = stock_dividends_1.columns.tolist()
        stock_dividends_columns.sort()
        create_missing_cols(stock_dividends_2, stock_dividends_columns)
        stock_dividends_2 = stock_dividends_2[stock_dividends_columns]
        print(len(stock_dividends_1.columns), len(stock_dividends_2.columns))
    elif len(stock_dividends_1.columns) < len(stock_dividends_2.columns):
        stock_dividends_columns = stock_dividends_2.columns.tolist()
        stock_dividends_columns.sort()
        create_missing_cols(stock_dividends_1, stock_dividends_columns)
        stock_dividends_1 = stock_dividends_1[stock_dividends_columns]
        print(len(stock_dividends_1.columns), len(stock_dividends_2.columns))

9 9
Index(['Country', 'Date', 'Dividend', 'Full Name', 'Name', 'Payment Date',
       'Symbol', 'Type', 'Yield'],
      dtype='object')
Index(['Country', 'Date', 'Dividend', 'Full Name', 'Name', 'Payment Date',
       'Symbol', 'Type', 'Yield'],
      dtype='object')
Same columns


In [11]:
# Keep only dividend rows of stock_dividends_2 whose Symbol does not already
# appear in stock_dividends_1. The mask gets its own name so the built-in
# `filter` is not shadowed in the notebook namespace.
print("Remove overlapped stocks")
is_new_dividend_symbol = ~stock_dividends_2["Symbol"].isin(stock_dividends_1["Symbol"])
print(len(stock_dividends_2), len(stock_dividends_2[is_new_dividend_symbol]))
stock_dividends_2 = stock_dividends_2[is_new_dividend_symbol]

# Merge the 2 data frames
stock_dividends_all = pd.concat([stock_dividends_1, stock_dividends_2], ignore_index=True)
print(len(stock_dividends_all))
print(len(stock_dividends_1), len(stock_dividends_2))

Remove overlapped stocks
10173 10097
52315
42218 10097


In [12]:
# Clean up stock_financials
stock_financials_1 = pd.read_excel(TO_FILE, sheet_name=SHEET_NAME_STOCK_FINANCIALS, engine="openpyxl")
stock_financials_2 = pd.read_excel(FROM_FILE, sheet_name=SHEET_NAME_STOCK_FINANCIALS, engine="openpyxl")
print(len(stock_financials_1.columns), len(stock_financials_2.columns))

cols_1 = stock_financials_1.columns.tolist()
cols_1.sort()

cols_2 = stock_financials_2.columns.tolist()
cols_2.sort()

if (cols_1 == cols_2):
    print("Same columns")
    print(cols_1)
else:
    print("!!!!!!!!!!! Not the same ")
    if len(stock_financials_1.columns) > len(stock_financials_2.columns):
        stock_financials_columns = stock_financials_1.columns.tolist()
        stock_financials_columns.sort()
        create_missing_cols(stock_financials_2, stock_financials_columns)
        stock_financials_2 = stock_financials_2[stock_financials_columns]
        print(len(stock_financials_1.columns), len(stock_financials_2.columns))
    elif len(stock_financials_1.columns) < len(stock_financials_2.columns):
        stock_financials_columns = stock_financials_2.columns.tolist()
        stock_financials_columns.sort()
        create_missing_cols(stock_financials_1, stock_financials_columns)
        stock_financials_1 = stock_financials_1[stock_financials_columns]
        print(len(stock_financials_1.columns), len(stock_financials_2.columns))

27 27
Same columns
['Cost Of Revenue', 'Country', 'Date', 'Discontinued Operations', 'Ebit', 'Effect Of Accounting Charges', 'Extraordinary Items', 'Full Name', 'Gross Profit', 'Income Before Tax', 'Income Tax Expense', 'Interest Expense', 'Minority Interest', 'Name', 'Net Income', 'Net Income Applicable To Common Shares', 'Net Income From Continuing Ops', 'Non Recurring', 'Operating Income', 'Other Items', 'Other Operating Expenses', 'Research Development', 'Selling General Administrative', 'Symbol', 'Total Operating Expenses', 'Total Other Income Expense Net', 'Total Revenue']


In [13]:
# Keep only financial rows of stock_financials_2 whose Symbol does not already
# appear in stock_financials_1. The mask gets its own name so the built-in
# `filter` is not shadowed in the notebook namespace.
print("Remove overlapped stocks")
is_new_financial_symbol = ~stock_financials_2["Symbol"].isin(stock_financials_1["Symbol"])
print(len(stock_financials_2), len(stock_financials_2[is_new_financial_symbol]))
stock_financials_2 = stock_financials_2[is_new_financial_symbol]

# Merge the 2 data frames
stock_financials_all = pd.concat([stock_financials_1, stock_financials_2], ignore_index=True)
print(len(stock_financials_all))
print(len(stock_financials_1), len(stock_financials_2))

Remove overlapped stocks
4249 4225
14333
10108 4225


In [14]:
# Save to new file.
# TO_FILE is also the workbook the *_1 frames were read from, so the sheets are
# first written to a temporary workbook next to it and only swapped in once all
# three sheets have been written. A failure part-way through therefore leaves
# the existing TO_FILE untouched instead of truncated or partially written.
tmp_file = f"{TO_FILE}.tmp.xlsx"  # keeps the .xlsx suffix so the same writer engine is chosen
with pd.ExcelWriter(tmp_file) as writer:
    stock_info_all.to_excel(writer, sheet_name=SHEET_NAME_STOCK_INFO, header=True, index=False)
    stock_dividends_all.to_excel(writer, sheet_name=SHEET_NAME_STOCK_DIVIDENDS, header=True, index=False)
    stock_financials_all.to_excel(writer, sheet_name=SHEET_NAME_STOCK_FINANCIALS, header=True, index=False)
os.replace(tmp_file, TO_FILE)

In [15]:
# validate the data: read the saved stock_info sheet back and check that it has
# as many rows as the merged frame that was written.
stock_info_new = pd.read_excel(TO_FILE, sheet_name=SHEET_NAME_STOCK_INFO, engine="openpyxl")
print(len(stock_info_new))
assert len(stock_info_new) == len(stock_info_all), (
    f"saved stock_info has {len(stock_info_new)} rows, expected {len(stock_info_all)}"
)

3570
