In [18]:
import json
import os
import hashlib
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Optional, Union, List, Dict, Callable, Tuple
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

# SF1 Quarterly

In [19]:
def _load_df(excel_path: str) -> pd.DataFrame:
    '''
    Load data from SF1 .xlsx file and convert it to pd.DataFrame

    The helper columns ``'Unnamed: 0'`` and ``'Unnamed: 0.1'`` are removed
    when they are present (presumably index columns left over from earlier
    saves of the sheet -- TODO confirm). Files that lack one or both of
    them are loaded without error.

    Parameters
    ----------
    excel_path:
        path to SF1 .xlsx file

    Returns
    -------
        ``pd.DataFrame`` content of file
    '''
    df = pd.read_excel(excel_path)

    # ``columns=`` keyword instead of a positional axis argument: the
    # positional form ``drop(name, 1)`` is rejected by pandas 2.x.
    # ``errors='ignore'`` keeps the loader usable for sheets that carry
    # only one (or none) of the helper columns.
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

    return df.infer_objects()

In [20]:
# Smoke-test the loader on a single report file.
# NOTE(review): the name ``data`` is rebound to a dict of loaders in a
# later cell, so this frame is not reachable under that name afterwards.
data = _load_df('report data/A.xlsx')

In [21]:
class SF1QuarterlyData:
    '''
    Loader for quarterly fundamental information about
    companies(debt, revenue etc)

    Every ticker is expected to live in its own ``<data_path>/<ticker>.xlsx``
    file containing a ``'Date'`` column.
    '''
    def __init__(self,
                 data_path: Optional[str]=None,
                 quarter_count: Optional[int]=None):
        '''
        Parameters
        ----------
        data_path:
            path to :mod:`~ml_investment.data_loaders.sf1` dataset folder
            If None, then ``sf1_data_path``
            from `~/.ml_investment/config.json` will be used
        quarter_count:
            maximum number of last quarters to return.
            Resulted number may be less due to short history in some companies
        '''
        if data_path is None:
            # NOTE(review): ``load_config`` is not defined in the visible
            # part of this notebook -- it must be provided elsewhere for
            # this default branch to work.
            data_path = load_config()['sf1_data_path']
        self.data_path = data_path
        self.quarter_count = quarter_count


    def load(self, index: List[str]) -> Optional[pd.DataFrame]:
        '''
        Parameters
        ----------
        index:
            list of tickers to load data for, i.e. ``['AAPL', 'TSLA']``.
            Tickers without a file in ``data_path`` are skipped silently.

        Returns
        -------
        ``pd.DataFrame``
            quarterly information about companies, newest quarter first
            within each ticker, with a datetime ``'date'`` column.
            ``None`` if no file was found for any requested ticker.
        '''
        frames = []
        for ticker in index:
            path = '{}/{}.xlsx'.format(self.data_path, ticker)
            if not os.path.exists(path):
                continue

            df = _load_df(path)
            # Parse dates before sorting so the order is chronological even
            # when the sheet stores dates as text.
            df = df.assign(date=pd.to_datetime(df['Date']))
            df = df.sort_values('date', ascending=False)

            # Truncate only after sorting: this keeps the most recent
            # quarters regardless of the row order inside the file.
            if self.quarter_count is not None:
                df = df.iloc[:self.quarter_count]

            frames.append(df)

        if len(frames) == 0:
            return None

        result = pd.concat(frames, axis=0).reset_index(drop=True)
        return result.infer_objects()


    def existing_index(self) -> List[str]:
        '''
        Returns
        -------
        ``List``
            existing index values that can be passed to `load`
            (names of the ``.xlsx`` files in ``data_path`` without the
            extension, sorted alphabetically)
        '''
        suffix = '.xlsx'
        return sorted(name[:-len(suffix)]
                      for name in os.listdir(self.data_path)
                      if name.endswith(suffix))

In [22]:
# Quarterly loader reading per-ticker files from the 'report data' folder.
sf1 = SF1QuarterlyData('report data')

# SF1 Base

In [23]:
class SF1BaseData:
    '''
    Load base information about company(like sector, industry etc)

    All companies are read from a single ``<data_path>/base_data.xlsx``
    sheet that must contain a ``'ticker'`` column.
    '''
    def __init__(self, data_path: Optional[str]=None):
        '''
        Parameters
        ----------
        data_path:
            path to :mod:`~ml_investment.data_loaders.sf1` dataset folder
            If None, then ``sf1_data_path``
            from `~/.ml_investment/config.json` will be used
        '''
        if data_path is None:
            # NOTE(review): ``load_config`` is not defined in the visible
            # part of this notebook -- it must be provided elsewhere for
            # this default branch to work.
            data_path = load_config()['sf1_data_path']
        self.data_path = data_path


    def _read_base(self) -> pd.DataFrame:
        # Read the whole base-information sheet from ``data_path``.
        path = '{}/base_data.xlsx'.format(self.data_path)
        return pd.read_excel(path)


    def load(self, index: Optional[List[str]]=None) -> Optional[pd.DataFrame]:
        '''
        Parameters
        ----------
        index:
            list of ticker to load data for, i.e. ``['AAPL', 'TSLA']``
            OR ``None`` (loading for all possible tickers)
        Returns
        -------
        ``pd.DataFrame``
            base companies information, rows ordered as the tickers appear
            in ``index`` (file order when ``index`` is ``None``).
            ``None`` if ``index`` is an empty list.
        '''
        df = self._read_base()

        # ``None`` means "everything", as documented; iterating over it
        # used to raise TypeError.
        if index is None:
            return df.reset_index(drop=True)

        per_ticker = [df[df['ticker'] == ticker] for ticker in index]
        if len(per_ticker) == 0:
            return None

        return pd.concat(per_ticker, axis=0).reset_index(drop=True)


    def existing_index(self) -> List[str]:
        '''
        Returns
        -------
        ``List``
            existing index values that can be passed to `load`
            (unique tickers of the base sheet, in file order)
        '''
        # Taken from the sheet this loader actually serves instead of
        # listing an unrelated, hard-coded directory.
        return self._read_base()['ticker'].dropna().unique().tolist()

In [24]:
# Base-information loader pointed at the 'base data' folder.
sf2 = SF1BaseData('base data')

In [25]:
# Display the base-information rows of two example tickers.
sf2.load(['A', 'TSLA'])

Unnamed: 0,ticker,sector,industry,sicindustry
0,A,Industrial,Electronics,Life Sciences Tools & Services
1,TSLA,"Consumer, Cyclical",Auto Manufacturers,Automobiles


# Quarterly Feature

In [26]:
def calc_series_stats(series: Union[List[float], np.array],
                      stats: Dict[str, Callable]={'mean': np.mean,
                                                  'median': np.median,
                                                  'max': np.max,
                                                  'min': np.min,
                                                  'std': np.std},
                      name_prefix: str='',
                      norm: bool=False) -> Dict[str, float]:
    '''
    Aggregate a numeric series into a dict of named statistics.

    NaN entries are discarded first. If nothing remains, every statistic
    is evaluated on a single NaN value instead.

    Parameters
    ----------
    series:
        values the statistics are calculated on
    stats:
        mapping ``name -> aggregation function``; each function is called
        with the NaN-free values
    name_prefix:
        string prefix of returned features; resulting keys have the form
        ``'<name_prefix>_<stat name>'``
    norm:
        if True, every statistic is divided by the absolute value of the
        first non-NaN element

    Returns
    -------
        Dict with calculated features
    '''
    as_float = np.array(series).astype('float')
    non_nan = list(as_float[~np.isnan(as_float)])
    # Fall back to one NaN so the aggregations still have an input.
    values = non_nan if len(non_nan) > 0 else np.array([np.nan])

    features = {}
    for stat_name, stat_fn in stats.items():
        features['{}_{}'.format(name_prefix, stat_name)] = stat_fn(values)

    if not norm:
        return features

    scale = np.abs(values[0])
    return {feat_name: feat_value / scale
            for feat_name, feat_value in features.items()}

In [27]:
class QuarterlyFeatures:
    '''
    Feature calculator for quarterly-based statistics.
    Return features for company quarter slices.
    '''
    def __init__(self,
                 data_key: str,
                 columns: List[str],
                 quarter_counts: List[int]=[2, 4, 10],
                 max_back_quarter: int=10,
                 min_back_quarter: int=0,
                 stats: Dict[str, Callable]={'mean': np.mean,
                                             'median': np.median,
                                             'max': np.max,
                                             'min': np.min,
                                             'std': np.std},
                 calc_stats_on_diffs: bool=True,
                 data_preprocessing: Callable=None,
                 n_jobs: int=cpu_count(),
                 verbose: bool=False):
        '''
        Parameters
        ----------
        data_key:
            key of dataloader in ``data`` argument during
            :func:`~ml_investment.features.QuarterlyFeatures.calculate`
        columns:
            column names for feature calculation(like revenue, debt etc)
        quarter_counts:
            list of number of quarters for statistics calculation.
            e.g. if ``quarter_counts = [2]`` then statistics will be calculated
            on current and previous quarter
        max_back_quarter:
            max bound of company slices in time.
            If ``max_back_quarter = 1`` then features will be calculated
            for only current company quarter.
            The bound is clamped to ``number of loaded rows - 1``, so the
            last row returned by the loader never starts a slice of its own.
        min_back_quarter:
            min bound of company slices in time.
            If ``min_back_quarter = 0`` (default) then slices start from
            the current quarter.
            If ``min_back_quarter = 2`` then current and previous quarter slices
            will not be used for feature calculation
        stats:
            aggregation functions for features calculation.
            Should be as ``Dict[str, Callable]``.
            Keys of this dict are used as suffixes of the feature names.
            Values of this dict should implement
            ``foo(x:List) -> float`` interface
        calc_stats_on_diffs:
            calculate statistics on series diffs( ``np.diff(series)`` ) or not
        data_preprocessing:
            function implementing ``foo(x) -> x_`` interface.
            It will be used before feature calculation.
        n_jobs:
            stored on the instance; the calculation in this class is
            sequential and does not read it
        verbose:
            print a message when calculation starts or not
        '''
        self.data_key = data_key
        self.columns = columns
        # Copy the containers: the defaults are created once at function
        # definition time, so storing them by reference would let one
        # instance's modification leak into every other instance.
        self.quarter_counts = list(quarter_counts)
        self.max_back_quarter = max_back_quarter
        self.min_back_quarter = min_back_quarter
        self.stats = dict(stats)
        self.calc_stats_on_diffs = calc_stats_on_diffs
        self.data_preprocessing = data_preprocessing
        self.n_jobs = n_jobs
        self.verbose = verbose
        self._data_loader = None


    def _calc_series_feats(self, data: pd.DataFrame,
                           str_prefix: str='') -> Dict[str, float]:
        '''
        Calculate statistics of every configured column over the first
        ``quarter_cnt`` rows of ``data`` for each configured quarter count.

        Feature names look like ``quarter<cnt>_<col>_<stat>`` and, for the
        differences, ``quarter<cnt>_<col>_diff_<stat>``.
        ``str_prefix`` is accepted for interface compatibility but not used.
        '''
        result = {}
        for quarter_cnt in self.quarter_counts:
            for col in self.columns:
                # Take the first rows and reverse them, so ``np.diff``
                # below is computed in the opposite of the row order.
                series = data[col].values[:quarter_cnt][::-1].astype('float')
                name_prefix = 'quarter{}_{}'.format(quarter_cnt, col)

                result.update(calc_series_stats(series=series,
                                                stats=self.stats,
                                                name_prefix=name_prefix))

                if self.calc_stats_on_diffs:
                    diff_prefix = '{}_diff'.format(name_prefix)
                    result.update(calc_series_stats(series=np.diff(series),
                                                    stats=self.stats,
                                                    name_prefix=diff_prefix))

        return result


    def _single_ticker(self, ticker:str) -> List[Dict[str, float]]:
        '''
        Build one feature dict per quarter slice of a single ticker.
        Returns an empty list when the loader has no data for the ticker.
        '''
        quarterly_data = self._data_loader.load([ticker])
        if quarterly_data is None:
            return []

        if self.data_preprocessing is not None:
            quarterly_data = self.data_preprocessing(quarterly_data)

        last_position = len(quarterly_data) - 1
        max_back_quarter = min(self.max_back_quarter, last_position)
        min_back_quarter = min(self.min_back_quarter, last_position)
        assert min_back_quarter <= max_back_quarter

        result = []
        for back_quarter in range(min_back_quarter, max_back_quarter):
            # Slice starting at ``back_quarter``: its first row is the
            # quarter the features are attributed to.
            curr_data = quarterly_data[back_quarter:]

            feats = {
                'ticker': ticker,
                'date': curr_data['date'].values[0],
            }
            feats.update(self._calc_series_feats(curr_data))
            result.append(feats)

        return result


    def calculate(self, data: Dict, index: List[str]) -> pd.DataFrame:
        '''
        Interface to calculate features for tickers
        based on data

        Parameters
        ----------
        data:
            dict having field named as value in ``data_key`` param of
            :func:`~ml_investment.features.QuarterlyFeatures.__init__`
            This field should contain class implementing
            ``load(index) -> pd.DataFrame`` interface
        index:
            list of tickers to calculate features for, i.e. ``['AAPL', 'TSLA']``

        Returns
        -------
        ``pd.DataFrame``
            resulted features with index ``['ticker', 'date']``.
            Each row contains features for ``ticker`` company
            at ``date`` quarter.
            An empty frame with the same index names is returned when no
            ticker produced any rows.
        '''
        if self.verbose:
            print("Quarterly features calculation")

        self._data_loader = data[self.data_key]
        rows = []
        for ticker in index:
            rows.extend(self._single_ticker(ticker))

        if len(rows) == 0:
            # ``pd.DataFrame([])`` has no 'ticker'/'date' columns, so
            # ``set_index`` would raise KeyError; build the empty result
            # explicitly instead.
            empty_index = pd.MultiIndex.from_arrays([[], []],
                                                    names=['ticker', 'date'])
            return pd.DataFrame(index=empty_index)

        return pd.DataFrame(rows).set_index(['ticker', 'date'])

In [28]:
# Registry of data loaders; the keys are the ``data_key`` values the
# feature calculators are configured with.
data = {}
data['quarterly'] = SF1QuarterlyData('report data')
data['base'] = SF1BaseData('base data')

In [29]:
# Quarterly feature calculator over the 'Revenue' and 'Net Debt' columns,
# with statistics windows of 2, 4 and 10 quarters and up to 40 slices back.
fc1 = QuarterlyFeatures(data_key='quarterly',
                            columns=['Revenue','Net Debt'],
                            quarter_counts=[2,4,10],
                            max_back_quarter=40,
                            min_back_quarter=0,
                            verbose=True)

# Base Feature

In [30]:
class HashingEncoder:
    '''
    Encoder turning arbitrary values into integers by hashing
    their string representation with ``int_hash_of_str``.
    '''
    def transform(self, vals):
        '''
        Parameters
        ----------
        vals:
            iterable of values to encode

        Returns
        -------
        ``List``
            one integer hash per element of ``vals``, in the same order
        '''
        encoded = []
        for item in vals:
            encoded.append(int_hash_of_str(str(item)))
        return encoded

In [31]:
def int_hash_of_str(text:str):
    '''
    Map a string to an integer derived from its MD5 digest.

    The result is the first 4 digest bytes (the first 8 hex characters)
    of the UTF-8 encoded text, read as a big-endian unsigned integer.
    '''
    digest = hashlib.md5(text.encode('utf-8')).digest()
    leading_bytes = digest[:4]
    return int.from_bytes(leading_bytes, 'big')

In [32]:
class BaseCompanyFeatures:
    '''
    Feature calculator for getting base
    company information(sector, industry etc).
    Encode categorical columns via hashing label encoding.
    Return features for current company state.
    '''
    def __init__(self,
                 data_key:str,
                 cat_columns:List[str],
                 verbose: bool=False):
        '''
        Parameters
        ----------
        data_key:
            key of dataloader in ``data`` argument during
            :func:`~ml_investment.features.BaseCompanyFeatures.calculate`
        cat_columns:
            column names of categorical features for encoding
        verbose:
            print a message when calculation starts or not
        '''
        self.data_key = data_key
        self.cat_columns = cat_columns
        self.verbose = verbose
        self.he = HashingEncoder()

    def calculate(self, data: Dict, index: List[str]) -> pd.DataFrame:
        '''
        Interface to calculate features for tickers
        based on data

        Parameters
        ----------
        data:
            dict having field named as value in ``data_key`` param of
            :func:`~ml_investment.features.BaseCompanyFeatures.__init__`
            This field should contain class implementing
            ``load(index) -> pd.DataFrame`` interface
        index:
            list of tickers to calculate features for, i.e. ``['AAPL', 'TSLA']``

        Returns
        -------
        ``pd.DataFrame``
            resulted features with index ``['ticker']``.
            Each row contains features for ``ticker`` company.
            Tickers the loader has no row for get NaN in every
            categorical column.
        '''
        if self.verbose:
            print("Base features calculation")

        base_df = data[self.data_key].load(index)

        if base_df is None:
            # The loader had nothing to return (e.g. an empty ``index``):
            # answer with one all-NaN row per requested ticker instead of
            # failing on ``None[col]``.
            missing = pd.DataFrame({'ticker': list(index)})
            for col in self.cat_columns:
                missing[col] = np.nan
            return missing.set_index(['ticker'])

        # Work on a copy so the frame owned by the loader is not modified.
        base_df = base_df.copy()
        for col in self.cat_columns:
            # Missing categories are hashed as the literal string 'None'.
            base_df[col] = base_df[col].fillna('None')
            base_df[col] = self.he.transform(base_df[col])

        result = pd.DataFrame()
        result['ticker'] = index

        # Left merge keeps exactly the requested tickers, in request order.
        result = pd.merge(result, base_df[['ticker'] + self.cat_columns],
                          on='ticker', how='left')

        return result.set_index(['ticker'])

In [33]:
# Base feature calculator hashing the three categorical company columns.
fc2 = BaseCompanyFeatures(data_key='base',
                              cat_columns=['sector', 'industry', 'sicindustry'],
                              verbose=True)

# Quarterly target

In [34]:
class QuarterlyTarget:
    '''
    Calculator of target represented as column in quarter-based data.
    Work with quarterly slices of company.
    '''
    def __init__(self,
                 data_key: str,
                 col: str,
                 quarter_shift: int=0,
                 n_jobs: int=cpu_count()):
        '''
        Parameters
        ----------
        data_key:
            key of dataloader in ``data`` argument during
            :func:`~ml_investment.targets.QuarterlyTarget.calculate`
        col:
            column name for target calculation(like marketcap, revenue)
        quarter_shift:
            number of quarters to shift.
            e.g. if ``quarter_shift = 0`` then value for current quarter
            will be returned.
            If ``quarter_shift = 1`` then value for next quarter
            will be returned.
            If ``quarter_shift = -1`` then value for previous quarter
            will be returned.
        n_jobs:
            stored on the instance; the calculation in this class is
            sequential and does not read it
        '''
        self.data_key = data_key
        self.col = col
        self.quarter_shift = quarter_shift
        self.n_jobs = n_jobs
        self._data_loader = None


    def _single_ticker_target(self,
                              ticker_and_dates: Tuple[str,
                                                      List]) -> pd.DataFrame:
        '''
        Look up the (shifted) target value for every requested date of one
        ticker.

        Returns a frame with columns ``['y', 'date', 'ticker']``. ``y`` is
        NaN when the shifted position falls outside the loaded history or
        when the loader has no data for the ticker at all.
        Raises ``AssertionError`` if a requested date is not present in the
        loaded quarterly data.
        '''
        ticker, dates = ticker_and_dates
        quarterly_data = self._data_loader.load([ticker])

        if quarterly_data is None:
            vals = [np.nan] * len(dates)
        else:
            # Reverse the loader's row order so that a positive
            # ``quarter_shift`` moves towards the rows the loader listed
            # first (the more recent quarters, assuming the loader returns
            # newest-first data -- TODO confirm for custom loaders).
            quarterly_data = quarterly_data[::-1]
            quarter_dates = pd.to_datetime(quarterly_data['date']).values
            col_values = quarterly_data[self.col].values

            vals = []
            for date in dates:
                date64 = pd.Timestamp(date).to_datetime64()
                positions = np.where(quarter_dates == date64)[0]
                assert len(positions) > 0, \
                    'date {} not found for ticker {}'.format(date, ticker)

                idx = positions[0] + self.quarter_shift
                if 0 <= idx < len(col_values):
                    vals.append(col_values[idx])
                else:
                    vals.append(np.nan)

        return pd.DataFrame({'y': vals, 'date': dates, 'ticker': ticker})


    def calculate(self, data: Dict, index: pd.DataFrame) -> pd.DataFrame:
        '''
        Interface to calculate targets for dates and tickers
        in index parameter based on data

        Parameters
        ----------
        data:
            dict having field named as value in ``data_key`` param of
            :func:`~ml_investment.targets.QuarterlyTarget.__init__`
            This field should contain class implementing
            ``load(index) -> pd.DataFrame`` interface
        index:
            ``pd.DataFrame`` containing information of tickers and dates
            to calculate targets for.
            Should have columns: ``["ticker", "date"]``

        Returns
        -------
        ``pd.DataFrame``
            targets having 'y' column. Index of this dataframe has the same
            values as ``index`` param.
            Each row contains target for ``ticker`` company
            at ``date`` quarter
        '''
        self._data_loader = data[self.data_key]

        if len(index) == 0:
            # Nothing to group or concatenate: return the (empty) request
            # with an empty target column.
            empty = index.copy()
            empty['y'] = np.nan
            return empty.set_index(['ticker', 'date'])

        per_ticker = []
        for ticker, group in index.groupby('ticker'):
            request = (ticker, group['date'].tolist())
            per_ticker.append(self._single_ticker_target(request))

        targets = pd.concat(per_ticker, axis=0)
        targets = targets.drop_duplicates(['ticker', 'date'])
        # Left merge restores the row order (and any duplicates) of the
        # request.
        result = pd.merge(index, targets, on=['ticker', 'date'], how='left')
        result = result.set_index(['ticker', 'date'])

        return result.infer_objects()