# imports

In [15]:
# imports
import pandas as pd

import numpy as np
from sklearn.pipeline import Pipeline

from src.utils import utils
from src.data_processing import preprocessing as prep
import logging

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read Data

In [16]:
root_dir = utils.get_proj_root()
raw_data =  pd.read_csv(root_dir.joinpath('data/raw/Stock_data.csv'))
raw_data.head()

Unnamed: 0,year,industry,symbol,currentRatio,currentRatio_percentage_change,quickRatio,quickRatio_percentage_change,cashRatio,cashRatio_percentage_change,daysOfSalesOutstanding,...,dividendYield_percentage_change,enterpriseValueMultiple,enterpriseValueMultiple_percentage_change,priceFairValue,priceFairValue_percentage_change,interestRate,interestRate_percentage_change,adjDividend,dps_growth,dps_change_next_year
0,2012,Conglomerates,MMM,2.198387,-2.275946,1.385806,-0.090461,0.465,14.018251,49.567449,...,-5.500411,8.692139,5.900833,3.571431,-2.168468,0.14,40.0,2.36,7.272727,2.0
1,2013,Conglomerates,MMM,1.698186,-22.753086,1.01227,-26.95445,0.344225,-25.973091,50.284895,...,-28.717392,12.326051,41.806877,5.328531,49.198768,0.11,-21.428571,2.54,7.627119,2.0
2,2014,Conglomerates,MMM,1.961487,15.504836,1.127209,11.354593,0.316272,-8.120568,48.611609,...,14.836058,13.065263,5.997151,8.117223,52.335098,0.09,-18.181818,3.42,34.645669,2.0
3,2015,Conglomerates,MMM,1.543411,-21.314241,0.852768,-24.346986,0.252599,-20.132363,50.082909,...,30.819267,12.318266,-5.717433,8.022507,-1.166858,0.13,44.444444,4.1,19.883041,2.0
4,2016,Conglomerates,MMM,1.885512,22.16526,1.136839,33.311664,0.385593,52.650039,53.242552,...,-8.738086,13.479738,9.428861,10.440035,30.134325,0.4,207.692308,4.44,8.292683,2.0


In [17]:
raw_data.describe()

Unnamed: 0,year,currentRatio,currentRatio_percentage_change,quickRatio,quickRatio_percentage_change,cashRatio,cashRatio_percentage_change,daysOfSalesOutstanding,daysOfSalesOutstanding_percentage_change,daysOfInventoryOutstanding,...,dividendYield_percentage_change,enterpriseValueMultiple,enterpriseValueMultiple_percentage_change,priceFairValue,priceFairValue_percentage_change,interestRate,interestRate_percentage_change,adjDividend,dps_growth,dps_change_next_year
count,4790.0,4790.0,4790.0,4790.0,4790.0,4790.0,4790.0,4790.0,4790.0,4790.0,...,4790.0,4790.0,4790.0,4790.0,4790.0,4790.0,4790.0,4625.0,4567.0,4625.0
mean,2016.5,115.975409,12033.82,47.01396,476.1163,20.834161,584.2458,82.152941,11.84691,19.926891,...,115.454267,16.115372,12.808659,4.049913,15.130181,0.632,34.220437,1.507595,22.613096,1.414486
std,2.872581,4015.352605,621112.8,1271.940109,19058.13,670.792298,19544.7,197.655285,187.975607,8663.814781,...,2637.442157,236.081319,1104.11141,62.434934,805.82346,0.73409,88.625633,2.486802,191.1561,0.877244
min,2012.0,0.0,-100.0,-1.480985,-694.4514,0.0,-100.0,-79.409628,-424.232944,-584907.567568,...,-100.0,-8607.583336,-63039.175197,-1698.564205,-31192.181942,0.08,-82.407407,0.0,-100.0,0.0
25%,2014.0,0.995836,-13.73288,0.58489,-17.3694,0.1298,-29.38055,33.489167,-6.904695,3.152653,...,-14.721281,9.266166,-11.395042,1.761681,-10.050483,0.11,-21.428571,0.26,0.0,0.0
50%,2016.5,1.449722,-0.9821253,0.99741,-1.349011,0.341109,-0.6881043,53.137646,0.397062,40.316103,...,0.0,12.954092,4.433934,3.163905,7.590405,0.26,29.016393,1.08,6.361323,2.0
75%,2019.0,2.331802,13.64322,1.682155,17.09117,0.757941,36.31823,72.332846,8.991844,83.114873,...,14.658464,18.301095,22.565907,5.942402,25.866923,1.0,83.0,2.05,14.641914,2.0
max,2021.0,207015.259574,42460340.0,63793.868085,1257160.0,33633.564748,1257160.0,5417.763945,11106.159855,68880.951016,...,136183.666595,10737.077141,31215.695305,1300.629474,44949.670649,2.16,207.692308,104.48,9900.0,2.0


# Preprocessing

## steps
- remove rows with NA in label columns
- remove higly orrelated variables
- 

In [18]:
log_fmt = '%(asctime)s - %(levelname)s - %(name)s  - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)

label_col_name = 'dps_change_next_year'
collinear_thresh = 0.98
categorical_features = ['industry', 'symbol']

transform_pipeline = Pipeline([
    ('drop_rows_with_NA_in_label', prep.NARoWRemover(cols_to_check=label_col_name)),
    ('drop_rows_with_NA_in_col', prep.NARoWRemover(cols_to_check="dps_growth")),
    ('drop_collinear_columns', prep.CollinearColsRemover(thresh=collinear_thresh, label_col=label_col_name)),
    ('cat_to_ordinal_cols', prep.ColumnsOrdinalEncoder(col_names=categorical_features)),
    ('binarize label column', prep.BinarizeCol(col_name=label_col_name, true_val=1)),
    # ('Balance data', prep.SMOTEBalancer(label_col_name=label_col_name, random_state=None))
])

transform_pipeline.fit(raw_data)
result = transform_pipeline.transform(raw_data)
# print(result.shape), print(raw_data.shape)
result.head()

2023-11-10 14:41:40,681 - INFO - NARoWRemover  - dropped 165 rows with NA in columns: dps_change_next_year
2023-11-10 14:41:40,684 - INFO - NARoWRemover  - dropped 58 rows with NA in columns: dps_growth


2023-11-10 14:41:40,808 - INFO - CollinearColsRemover  - dropped 27 cols
2023-11-10 14:41:40,816 - INFO - NARoWRemover  - dropped 165 rows with NA in columns: dps_change_next_year
2023-11-10 14:41:40,820 - INFO - NARoWRemover  - dropped 58 rows with NA in columns: dps_growth
2023-11-10 14:41:40,823 - INFO - CollinearColsRemover  - dropped 27 cols
2023-11-10 14:41:40,830 - INFO - BinarizeCol  - bianrized dps_change_next_year


dps_change_next_year


Unnamed: 0,year,industry,symbol,currentRatio,currentRatio_percentage_change,quickRatio,quickRatio_percentage_change,daysOfSalesOutstanding,daysOfSalesOutstanding_percentage_change,daysOfInventoryOutstanding,...,priceEarningsToGrowthRatio_percentage_change,dividendYield,dividendYield_percentage_change,enterpriseValueMultiple,enterpriseValueMultiple_percentage_change,interestRate,interestRate_percentage_change,adjDividend,dps_growth,dps_change_next_year
0,2012,24,291,2.198387,-2.275946,1.385806,-0.090461,49.567449,3.987852,89.289449,...,6.939145,0.025377,-5.500411,8.692139,5.900833,0.14,40.0,2.36,7.272727,0
1,2013,24,291,1.698186,-22.753086,1.01227,-26.95445,50.284895,1.447414,87.567366,...,21.913077,0.018089,-28.717392,12.326051,41.806877,0.11,-21.428571,2.54,7.627119,0
2,2014,24,291,1.961487,15.504836,1.127209,11.354593,48.611609,-3.327613,82.245394,...,-39.851336,0.020773,14.836058,13.065263,5.997151,0.09,-18.181818,3.42,34.645669,0
3,2015,24,291,1.543411,-21.314241,0.852768,-24.346986,50.082909,3.026645,83.473315,...,799.567847,0.027175,30.819267,12.318266,-5.717433,0.13,44.444444,4.1,19.883041,0
4,2016,24,291,1.885512,22.16526,1.136839,33.311664,53.242552,6.308824,82.149269,...,-84.149916,0.024801,-8.738086,13.479738,9.428861,0.4,207.692308,4.44,8.292683,0


In [19]:
result.isna().sum()

year                              0
industry                          0
symbol                            0
currentRatio                      0
currentRatio_percentage_change    0
                                 ..
interestRate                      0
interestRate_percentage_change    0
adjDividend                       0
dps_growth                        0
dps_change_next_year              0
Length: 89, dtype: int64

In [20]:
transform_pipeline