# Technicals extraction

------

**Approach:** Sift through the technical functions in the `ta.py` file and research sensible parameters for each one. Some functions appear to warrant several runs with different parameters; this is denoted below by a list of values.

### Good to go

- 'MA'
    - n: [5, 20, 90, 260]
- 'STDDEV'
    - n: [5, 20, 90, 260]
- 'RSI'
    - n: [6, 12]
- 'MACD'
    - n_fast: 12
    - n_slow: 26

- 'BBANDS'
    - n: [5, 20, 90, 260]

- 'MFI' money flow index ratio
    - n: 14
- 'Chaikin'
    - None
- 'EMA'
    - n: [5, 20, 90, 260]
- 'KST'
    - r: (10, 10, 10, 15)
    - n: (10, 15, 20, 30)
    
- 'TSI'
    - r: 25
    - s: 13

- 'TRIX'
    - n: [5, 20, 90, 260]

- 'STOK'
    - None

- 'STO'
    - n: [5, 20, 90, 260]

- 'ROC'
    - n: [5, 20, 90, 260]

- 'PPSR' 

- 'OBV'
    - n: [5, 20, 90, 260]

- 'MassI'
    - None
    
- 'MOM'
    - n: 1 

- 'COPP' 
    - n: 10

- 'ACCDIST'
    - n: 1

- 'ADX'
    - n: 14
    - n_ADX: 50
- 'ATR'
    - n: 14

### Potential implementation

- differences on any or all of these columns

### Missing end data

'ULTOSC'

'Vortex'

'EOM' ease of movement

'KELCH'

'DONCH'

'CCI' Commodity channel index

In [3]:
cd .. 

/home/jovyan/critical_feature_extraction


In [6]:
%run __init__.py

In [7]:
#load technicals .py file
from lib import ta

import inspect
import string
import os

import pickle

import warnings
warnings.filterwarnings('ignore')

In [8]:
#collect every member of the ta module whose name starts with an uppercase ASCII letter,
#keyed by name -- a shortcut for picking out the imported technical functions
tech_funcs = {
    member_name: member
    for member_name, member in inspect.getmembers(ta)
    if member_name[0] in string.ascii_uppercase
}

In [9]:
#Positional arguments for each technical function, keyed by function name.
#Every tuple is the *args for one run; several tuples in a list mean several runs.
#Values are best guesses from the internet, the "A critical extraction .." paper, and the ta.py code.

grid = dict(
    MA=[(5,), (20,)],
    STDDEV=[(5,), (20,)],
    RSI=[(6,), (12,)],
    MACD=[(12, 26)],  #n_fast, n_slow
    BBANDS=[(5,), (20,)],
    MFI=[(14,)],
    Chaikin=[()],  #takes no extra arguments
    EMA=[(5,), (20,)],
    KST=[(10, 10, 10, 15, 10, 15, 20, 30)],  #the four r values followed by the four n values
    TSI=[(25, 13)],  #r, s
    TRIX=[(5,), (20,)],
    STOK=[()],  #takes no extra arguments
    STO=[(5,), (20,)],
    ROC=[(5,), (20,)],
    PPSR=[()],  #takes no extra arguments
    OBV=[(5,), (20,)],
    MassI=[()],  #takes no extra arguments
    MOM=[(1,)],
    COPP=[(10,)],
    ADX=[(14, 50)],  #n, n_ADX
    ATR=[(14,)],
    FORCE=[(2,)],
    ACCDIST=[(1,)],
)


In [10]:
#chain every technical function over a starting dataframe, one run after another
def extract_technicals(df, tech_funcs, grid):
    """Apply each function in ``tech_funcs`` to ``df`` once per argument tuple.

    Each call receives the previous call's return value as its first
    argument, so the results accumulate from run to run.

    Args:
        df: starting object handed to the first function.
        tech_funcs: mapping of name -> callable, called as
            ``func(current, *args)``.
        grid: mapping of name -> list of argument tuples; must contain an
            entry for every name in ``tech_funcs``.

    Returns:
        The value returned by the last call, or ``df`` itself when no call
        was made.

    Raises:
        KeyError: if a name in ``tech_funcs`` has no entry in ``grid``.
    """
    #lazy on purpose: grid is only consulted for a name once every earlier run has finished
    pending_runs = (
        (func, run_args)
        for name, func in tech_funcs.items()
        for run_args in grid[name]
    )

    result = df
    for func, run_args in pending_runs:
        result = func(result, *run_args)
    return result

In [11]:
#bundle the function lookup and its argument grid into one list so both are saved together
tech_func_tools = [tech_funcs, grid]

#write the bundle to lib/ in binary mode
with open("lib/tech_func_tools.pkl", "wb") as pkl_out:
    pickle.dump(tech_func_tools, pkl_out)

### Extract technicals from every individual stock CSV

In [64]:
#directories for the raw per-stock CSVs and for the extracted technicals
raw_dir = "data/sandp500/individual_stocks_5yr/"
technicals_dir = "data/sandp500/individual_stocks_5yr_TECHNICALS/"

#grab list of csv names in the directory
individuals = os.listdir(path=raw_dir)

for csv in individuals:
    #skip directory entries that are not CSV files (e.g. notebook checkpoint folders);
    #passing them to read_csv would abort the whole run
    if not csv.endswith(".csv"):
        continue

    df = pd.read_csv(raw_dir + csv)
    try:
        df_technicals = extract_technicals(df, tech_funcs, grid)
    except IndexError:
        #extraction raises IndexError on some files; report the file and move on
        print(f"Technical extraction failed on {csv}")
    else:
        #NOTE(review): to_csv also writes the index, so reading this file back
        #yields an extra unnamed first column -- confirm that is intended
        df_technicals.to_csv(technicals_dir + csv)


### Testing `extract_technicals` on single stock csv

In [14]:
#load a single stock's raw CSV as a smoke test
sp = pd.read_csv("data/sandp500/individual_stocks_5yr/A_data.csv")
#run the full grid of technical functions over it
sp_technicals = extract_technicals(sp, tech_funcs, grid)
#display (rows, columns) of the result
sp_technicals.shape

(1258, 49)

### Clean the Data:

1. Drop the Acc/Dist_ROC_1 feature because it is only producing 0's
2. Drop all NaN values
3. Create a DataFrame with:
    - count *before* cleaning
    - count *after* cleaning
    - *difference* between the original and new count

In [117]:
#per-file row counts before/after cleaning, keyed by csv file name
shape_diff = {}

#directories for the raw CSVs, the extracted technicals and the cleaned output
raw_dir = "data/sandp500/individual_stocks_5yr/"
technicals_dir = "data/sandp500/individual_stocks_5yr_TECHNICALS/"
clean_dir = "data/sandp500/individual_stocks_5yr_TECHNICALS_clean/"

#create a list of the different tickers
individuals = os.listdir(path=raw_dir)

#iterate through all of the csv files
for csv in individuals:
    #only process directory entries that are csv files
    if not csv.endswith('.csv'):
        continue

    try:
        #load the technicals for this ticker; the file is missing if extraction failed earlier
        df = pd.read_csv(technicals_dir + csv)
        #find the original number of observations
        orig_count = len(df.index)
        #drop the Acc/Dist_ROC_1 feature (raises KeyError if absent), then every row containing a NaN
        df = df.drop('Acc/Dist_ROC_1', axis=1).dropna()
        #save the cleaned df to csv in the individual_stocks_5yr_TECHNICALS_clean folder
        df.to_csv(clean_dir + csv)
    except (OSError, KeyError, ValueError):
        #only file and data problems are reported here; programming errors
        #(e.g. a misspelled variable) are allowed to propagate instead of being hidden
        print(f"Technical cleaning failed on {csv}")
        continue

    #define the new count of the cleaned data
    new_count = len(df.index)
    #find the difference between the original count and the cleaned count
    diff = orig_count - new_count
    #record the difference, original count and cleaned count for this file
    shape_diff[csv] = {'difference': diff,
                       'original_count': orig_count,
                       'cleaned_count': new_count}


In [134]:
#build a DataFrame from the shape_diff dictionary and transpose it,
#so each csv file becomes a row and each count becomes a column
obs_diff = pd.DataFrame(shape_diff).T


In [139]:
#show only the tickers whose original_count is below 1000
obs_diff[obs_diff['original_count']<1000]

Unnamed: 0,cleaned_count,difference,original_count
AAL_data.csv,865,61,926
AGN_data.csv,903,71,974
ALLE_data.csv,869,71,940
AVGO_data.csv,326,61,387
BHF_data.csv,0,20,20
BHGE_data.csv,0,28,28
CCI_data.csv,598,71,669
CFG_data.csv,656,71,727
CHTR_data.csv,251,61,312
CSRA_data.csv,367,71,438


In [142]:
#show how many tickers have fewer than 1000 observations
#(count() reports the number of non-null entries in each column)
obs_diff[obs_diff['original_count']<1000].count()

cleaned_count     36
difference        36
original_count    36
dtype: int64