### PACKAGES

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from time import time
import numpy as np
from sklearn.utils import shuffle

"""CD INTO THE CAP DIRECTORY"""
os.chdir('..\\..\\..\\...\\...\\CAP2020')

### DEFINE FUNCTIONS

#### timeit wrapper

In [2]:
def timing(f):
    def wrap(*args):
        time1 = time()
        ret = f(*args)
        time2 = time()
        print('{:s} function took {:.3f} ms'.format(f.__name__, (time2-time1)*1000.0))

        return ret
    return wrap

In [3]:
@timing
def run_func():
    print('hi')
run_func()

hi
run_func function took 0.000 ms


### IMPORT DATA

In [4]:
@timing
def import_data():
    
    #converter = lambda x: pd.to_numeric(x, 'coerce')
    #df = pd.read_csv('data\\raw\\PENN_SP500_daily.csv', delim_whitespace=True, converters={1: converter}, header=None)
    df = pd.read_csv('data\\raw\\PENN_SP500_daily.csv')
    #df = shuffle(df, random_state=0)
    df = df[['date',
    'TICKER',
    'COMNAM',
    'BIDLO',
    'ASKHI',
    'PRC',
    'VOL',
    'RET',
    'BID',
    'ASK',
    'SHROUT',
    'OPENPRC']]

    return df
    
df = import_data()
df.columns = ['date', 'symbol', 'name', 'bidlo', 'askhi', 'price', 'vol', 'ret', 'bid', 'ask', 'sout', 'open']
print('initial shape {}'.format(df.shape))

df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


import_data function took 16489.435 ms
initial shape (2337279, 12)


Unnamed: 0,date,symbol,name,bidlo,askhi,price,vol,ret,bid,ask,sout,open
0,20000103,ORCL,ORACLE CORP,111.625,125.1875,118.125,24831819.0,0.054099,118.125,118.1875,1423672.0,124.625
1,20000104,ORCL,ORACLE CORP,105.0,118.625,107.6875,29568565.0,-0.08836,107.6875,107.75,1423672.0,115.5
2,20000105,ORCL,ORACLE CORP,96.0,106.375,102.0,42032160.0,-0.052815,102.0,102.0,1423672.0,101.625
3,20000106,ORCL,ORACLE CORP,94.6875,105.0,96.0,27848885.0,-0.058824,96.0,95.875,1423672.0,100.15625
4,20000107,ORCL,ORACLE CORP,93.5625,103.5,103.375,23218034.0,0.076823,103.25,103.375,1423672.0,95.0


### BEGIN CLEANING

### DROP DUPLICATES

In [5]:
df = df.drop_duplicates()
print(print('new shape: {}'.format(df.shape)))

new shape: (2337166, 12)
None


### DROP ROWS WITH WRONG VALUES

In [248]:
#df[df['ret']=='B'].index

Int64Index([  98558,   98559,   98560,   98561,   98562,   98563,   98564,
             367116,  367117,  367118,
            ...
            1760359, 1760360, 1760361, 1760362, 1760363, 1760364, 1760365,
            1760366, 1760367, 1760368],
           dtype='int64', length=487)

In [6]:
"""MESSY DATA"""
indices_to_drop = df[df['ret']=='B'].index
df = df.drop(index=indices_to_drop)

indices_to_drop = df[df['ret']=='C'].index
df = df.drop(index=indices_to_drop)
#print('{} rows were dropped'.format(len(indices_to_drop)))

### STANDARDIZE, FORMALIZE COLUMNS (make into proper datatype)

#### CHANGE date to DATETIME

In [7]:
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d', errors='ignore')
df.head()

Unnamed: 0,date,symbol,name,bidlo,askhi,price,vol,ret,bid,ask,sout,open
0,2000-01-03,ORCL,ORACLE CORP,111.625,125.1875,118.125,24831819.0,0.054099,118.125,118.1875,1423672.0,124.625
1,2000-01-04,ORCL,ORACLE CORP,105.0,118.625,107.6875,29568565.0,-0.08836,107.6875,107.75,1423672.0,115.5
2,2000-01-05,ORCL,ORACLE CORP,96.0,106.375,102.0,42032160.0,-0.052815,102.0,102.0,1423672.0,101.625
3,2000-01-06,ORCL,ORACLE CORP,94.6875,105.0,96.0,27848885.0,-0.058824,96.0,95.875,1423672.0,100.15625
4,2000-01-07,ORCL,ORACLE CORP,93.5625,103.5,103.375,23218034.0,0.076823,103.25,103.375,1423672.0,95.0


#### CHANGE symbol, name columns to string

In [9]:
df['symbol'] = df['symbol'].astype(str)
df['name'] = df['name'].astype(str)

### CHANGE bidlo, askhi, price, vol, ret, bid, ask, sout, open to FLOAT

In [10]:
df['bidlo'] = df['bidlo'].astype(float)
df['askhi'] = df['askhi'].astype(float)
df['price'] = df['price'].astype(float)
df['vol'] = df['vol'].astype(float)
df['ret'] = df['ret'].astype(float)
df['bid'] = df['bid'].astype(float)
df['ask'] = df['ask'].astype(float)
df['sout'] = df['sout'].astype(float)
df['open'] = df['open'].astype(float)

#### CREATE NEW PRIMARY KEY TO BE APPLIED TO EACH TABLE

In [11]:
@timing
def create_key():
    key_list = []
    for i in range(len(df)):
        term = str(df.iloc[i, 1])
        if not term:
            value  = '0_'+df.iloc[i, 2]
            key_list.append(value)
        else:
            value = str(df.iloc[i, 1])+'_'+str(df.iloc[i, 2])
            key_list.append(value)

    return key_list

key_list = create_key()
df['key'] = key_list
del key_list

df.head()

create_key function took 67126.201 ms


Unnamed: 0,date,symbol,name,bidlo,askhi,price,vol,ret,bid,ask,sout,open,key
0,2000-01-03,ORCL,ORACLE CORP,111.625,125.1875,118.125,24831819.0,0.054099,118.125,118.1875,1423672.0,124.625,ORCL_ORACLE CORP
1,2000-01-04,ORCL,ORACLE CORP,105.0,118.625,107.6875,29568565.0,-0.08836,107.6875,107.75,1423672.0,115.5,ORCL_ORACLE CORP
2,2000-01-05,ORCL,ORACLE CORP,96.0,106.375,102.0,42032160.0,-0.052815,102.0,102.0,1423672.0,101.625,ORCL_ORACLE CORP
3,2000-01-06,ORCL,ORACLE CORP,94.6875,105.0,96.0,27848885.0,-0.058824,96.0,95.875,1423672.0,100.15625,ORCL_ORACLE CORP
4,2000-01-07,ORCL,ORACLE CORP,93.5625,103.5,103.375,23218034.0,0.076823,103.25,103.375,1423672.0,95.0,ORCL_ORACLE CORP


## CREATE TABLES/ENTITIES
###### ...
###### ...

### NAME TABLE

In [254]:
df_name = df[['symbol', 'name', 'key']]
df_name = df_name.drop_duplicates()
df_name.index = df_name['key']
df_name = df_name.drop('key', axis=1)
df_name.to_csv('data\\interim\\df_symbols_names.csv')
df_name.head()

Unnamed: 0_level_0,symbol,name
key,Unnamed: 1_level_1,Unnamed: 2_level_1
ORCL_ORACLE CORP,ORCL,ORACLE CORP
MSFT_MICROSOFT CORP,MSFT,MICROSOFT CORP
TROW_T ROWE PRICE ASSOC INC,TROW,T ROWE PRICE ASSOC INC
TROW_T ROWE PRICE GROUP INC,TROW,T ROWE PRICE GROUP INC
HON_HONEYWELL INTERNATIONAL INC,HON,HONEYWELL INTERNATIONAL INC
