In [307]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [308]:
OM_data = pd.read_csv("./OpenMargin_target.csv")
PM_data = pd.read_csv("ProfitMargin_target.csv")
CF_data = pd.read_csv("./CF_target.csv")

In [309]:
OM_data = OM_data.drop(['index' ,"Date"],axis=1)
PM_data = PM_data.drop(['index' ,"Date"],axis=1)
CF_data = CF_data.drop(['index' ,"Date"],axis=1)

Convert the target into a percentage and make it a binary calssification for now 

In [310]:
OM_data_bin = OM_data
PM_data_bin = PM_data
CF_data_bin = CF_data

OM_data_bin['Target'] = OM_data_bin.OM_Target
PM_data_bin['Target'] = PM_data_bin.PM_Target
CF_data_bin['Target'] = CF_data_bin.CF_Target

In [259]:
OM_data_bin = OM_data_bin.drop(['OM_Target','Company'],axis=1)
PM_data_bin = PM_data_bin.drop(['PM_Target','Company'],axis=1)
CF_data_bin = CF_data_bin.drop(['CF_Target','Company'],axis=1)

In [302]:
OM_data_bin['Target']<-100

0        False
1        False
2        False
3        False
4        False
         ...  
50937    False
50938    False
50939    False
50940    False
50941    False
Name: Target, Length: 50942, dtype: bool

Preprocessing our variables :

1. Numeric : Standardise
2. Company : One-hot encode

In [260]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

## Operating Margin - Binary Classification

In [321]:
PM_data_1 = PM_data_bin[PM_data_bin["PM_Target"] >= -100]
PM_data_2 = PM_data_bin[PM_data_bin["PM_Target"] <= 100]
PM_removed_outliers= pd.merge(PM_data_1, PM_data_2, how='inner')


In [322]:
y_data = PM_removed_outliers[['Target']]
X_data = PM_removed_outliers.drop(['Target'],axis = 1)

# X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
X_train = X_data[:40000]
X_test = X_data[40000:]
y_train = y_data[:40000]
y_test = y_data[40000:]

X_train.dtypes

PE                           float64
PCF                          float64
DIV                          float64
PB                           float64
DEBTEQ                       float64
MKTVAL                       float64
Unemployment_rate            float64
CPI                          float64
GDP CURY Index               float64
NFIB                         float64
Financial_Stress_Index       float64
Consumer_confidence_index    float64
Jobless_Claims               float64
ISM_Manufacturing_index      float64
Company                       object
PM_Target                    float64
dtype: object

In [323]:
print(len(X_train))
print(len(X_test))

40000
9025


In [324]:
# scale target in training data

scaler = StandardScaler()
mean_train_y = np.mean(y_train)
std_train_y = np.std(y_train)
y_train = scaler.fit_transform(y_train)
# y_test = scaler.fit_transform(y_test)

In [326]:
numeric_features = ['PE', 'PCF' ,'DIV' , 'PB','DEBTEQ' , 'MKTVAL' ,'Unemployment_rate' , 'CPI','GDP CURY Index', 'NFIB','Financial_Stress_Index','Consumer_confidence_index','Jobless_Claims','ISM_Manufacturing_index']
#categorical_features = ['company']

In [327]:
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])
#categorical_transformer = Pipeline(steps=[
    #('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [328]:

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)])
        #('cat', categorical_transformer, categorical_features)])

Regression Models with scores:

1. LinearRegression 


In [334]:
linear_regression = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', LinearRegression())])
linear_regression.fit(X_train, y_train)
y_pred = linear_regression.predict(X_test)
y_pred_transformed = (y_pred.flatten() + mean_train_y.values[0] )*std_train_y.values[0]
print("mean_absolute_error : " )
print(mean_absolute_error(y_test, y_pred_transformed))


mean_absolute_error : 
113.76244786295597


Grading Boosting Regressor

In [333]:
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

gb = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', GradientBoostingRegressor(learning_rate=0.001,n_estimators = 25,max_depth=10,min_samples_split = 20))])
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
y_pred_transformed = (y_pred.flatten() + mean_train_y.values[0] )*std_train_y.values[0]
print("mean_absolute_error : " )
print(mean_absolute_error(y_test, y_pred))


mean_absolute_error : 
13.59689334319191


Decision Tree 

In [336]:
from sklearn.tree import DecisionTreeRegressor
decision_tree = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', DecisionTreeRegressor())])
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)
y_pred_transformed = (y_pred.flatten() + mean_train_y.values[0] )*std_train_y.values[0]
print("mean_absolute_error : " )
print(mean_absolute_error(y_test, y_pred))


mean_absolute_error : 
13.510468004396303


Ada Boost

In [341]:
from sklearn.ensemble import AdaBoostRegressor
nn = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', AdaBoostRegressor(random_state=0, n_estimators=100))])
nn.fit(X_train, y_train)
y_pred = nn.predict(X_test)
y_pred_transformed = (y_pred.flatten() + mean_train_y.values[0] )*std_train_y.values[0]
print("mean_absolute_error : " )
print(mean_absolute_error(y_test, y_pred))

mean_absolute_error : 
13.604009653759503


In [348]:
from sklearn.linear_model import Ridge
sgd = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', Ridge())])
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
y_pred_transformed = (y_pred.flatten() + mean_train_y.values[0] )*std_train_y.values[0]
print("mean_absolute_error : " )
print(mean_absolute_error(y_test, y_pred))

mean_absolute_error : 
13.609124719652282


In [354]:
regressors = [
    LinearRegression(),
    GradientBoostingRegressor(learning_rate=0.001,n_estimators = 25,max_depth=10,min_samples_split = 20),
    DecisionTreeRegressor(),
    AdaBoostRegressor(random_state=0, n_estimators=100),
    Ridge()]


for clf in regressors:
    nn= Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', clf)])
    nn.fit(X_train, y_train)
  
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = nn.predict(X_test)
    y_pred_transformed = (train_predictions.flatten() + mean_train_y.values[0] )*std_train_y.values[0]
    print("mean_absolute_error : " )
    print(mean_absolute_error(y_test, y_pred))
    
print("="*30)

LinearRegression
****Results****
mean_absolute_error : 
13.609124719652282
GradientBoostingRegressor
****Results****
mean_absolute_error : 
13.609124719652282
DecisionTreeRegressor
****Results****
mean_absolute_error : 
13.609124719652282
AdaBoostRegressor
****Results****
mean_absolute_error : 
13.609124719652282
Ridge
****Results****
mean_absolute_error : 
13.609124719652282
