In [91]:
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
import pandas as pd
import feature_engineering
import numpy as np

In [71]:
# Read the CSV and replace every missing value with 0 in one step.
news_and_finance_data = pd.read_csv('data/news_and_finance_date.csv').fillna(0)

# Pull the two return columns out before they are dropped from the features.
# NOTE(review): MONTHLY_RETURN_F1 is presumably a forward-looking return -- confirm.
y_data = news_and_finance_data['MONTHLY_RETURN']
yf1_data = news_and_finance_data['MONTHLY_RETURN_F1']

# Columns excluded from the feature matrix (this includes both return columns).
to_drop = [
    'security',
    'date_x',
    'date_y',
    'month',
    'MONTHLY_RETURN',
    'MONTHLY_RETURN_F1',
    'RP_ENTITY_ID',
    'RP_STORY_EVENT_INDEX',
]
x_data = news_and_finance_data.drop(columns=to_drop)

# Overwrite each remaining column with the result of the project's
# data_normalization helper, one column at a time.
for column in x_data.columns:
    x_data[column] = feature_engineering.data_normalization(x_data, column)

In [72]:
# Display the feature matrix after the per-column normalization pass.
x_data

Unnamed: 0,LTM_REVENUE,LTM_EPS,LTM_OPERATING_INCOME,LTM_OPERATING_MARGIN,EV,NTM_PE,NTM_REVENUE,NTM_EPS,NTM_OPERATING_MARGIN,NTM_EV_EBITDA,...,NORMAL_RELEVANCE,NORMAL_EVENT_SENTIMENT_SCORE,NORMAL_EVENT_RELEVANCE,NORMAL_EVENT_SIMILARITY_DAYS,ENCODED_TOPIC,ENCODED_GROUP,ENCODED_TYPE,ENCODED_FACT_LEVEL,ENCODED_NEWS_TYPE,ENCODED_CATEGORY
0,-0.349826,-0.406300,-0.533473,-0.875438,-0.361772,0.198435,-0.296221,-0.386191,0.0,0.194241,...,-0.033981,-0.040016,-0.041283,-0.108656,0.135116,0.056121,-0.022631,-0.358892,0.294990,-0.116771
1,-0.349826,-0.406300,-0.533473,-0.875438,-0.361772,0.284612,-0.295957,-0.385559,0.0,0.296214,...,-0.033981,-0.040016,-0.041283,-0.108656,0.135116,0.056121,-0.022631,-0.358892,0.294990,-0.116771
2,-0.333511,-0.401046,-0.520964,-0.868992,-0.269748,0.418224,-0.294799,-0.385980,0.0,0.461465,...,-0.033981,-0.040016,-0.041283,-0.108656,0.135116,0.056121,-0.022631,-0.358892,0.294990,-0.116771
3,-0.333511,-0.401046,-0.520964,-0.868992,-0.269748,0.382380,-0.271021,-0.366532,0.0,0.403480,...,-0.033981,-0.040016,-0.041283,-0.108656,0.135116,0.056121,-0.022631,-0.358892,0.294990,-0.116771
4,-0.333511,-0.401046,-0.520964,-0.868992,-0.269748,0.346410,-0.270402,-0.366181,0.0,0.342496,...,-0.033981,-0.040016,-0.041283,-0.108656,0.135116,0.056121,-0.022631,-0.358892,0.294990,-0.116771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3535,-0.645667,-0.288345,-0.333594,1.541500,-0.199113,0.162245,-0.620119,-0.284174,0.0,0.449939,...,-0.033981,-0.040016,-0.041283,-0.108656,0.135116,0.056121,-0.022631,-0.358892,0.294990,-0.116771
3536,-0.643309,-0.279773,-0.336042,1.470019,-0.190385,0.119849,-0.620005,-0.283613,0.0,0.397541,...,-0.033981,-0.040016,-0.041283,-0.108656,0.135116,0.056121,-0.022631,-0.358892,0.294990,-0.116771
3537,-0.643309,-0.279773,-0.336042,1.470019,-0.190385,0.080919,-0.617242,-0.265288,0.0,0.375782,...,0.891183,4.237260,1.723057,0.246883,0.041571,-2.276877,1.024424,3.043979,-1.496509,-0.941538
3538,-0.643309,-0.280252,-0.336042,1.470019,-0.190385,0.183537,-0.617337,-0.265077,0.0,0.467875,...,0.891183,3.423046,-0.477970,-1.317770,-0.571958,7.285802,6.085864,2.880228,-3.979897,-0.489622


In [92]:
# Candidate models as (display name, unfitted estimator) pairs.
# NOTE: despite the `classifiers` name, every estimator here is a regressor.
# random_state is pinned on the two estimators that use randomness so that a
# re-run reproduces the same fit; SVR and LinearRegression take no such argument.
classifiers = [
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('SVM', SVR()),
    ('Linear Regression', LinearRegression()),
    ('Stochastic Gradient Descent', SGDRegressor(random_state=42)),
]

# Two-level column header for the results table: the first list holds the
# metric name, the second the Train/Test split it refers to.
# NOTE(review): accuracy / precision / recall are classification metrics, while
# the estimators above are regressors -- confirm which metrics are intended.
clf_columns = [['Name',
                'Accuracy Mean', 'Accuracy Mean',
                'Accuracy Std', 'Accuracy Std',
                'Precision Mean', 'Precision Mean',
                'Recall Mean', 'Recall Mean'],
               ['', 'Train', 'Test', 'Train', 'Test', 'Train', 'Test', 'Train', 'Test']]

# Empty frame (headers only); rows are expected to be added later.
clf_table = pd.DataFrame(columns = clf_columns)

In [93]:
# Inspect the results table; it was created with column headers only,
# so no rows are expected at this point.
clf_table

Unnamed: 0_level_0,Name,Accuracy Mean,Accuracy Mean,Accuracy Std,Accuracy Std,Precision Mean,Precision Mean,Recall Mean,Recall Mean
Unnamed: 0_level_1,Unnamed: 1_level_1,Train,Test,Train,Test,Train,Test,Train,Test


In [94]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split -- and every score computed from it -- is identical on each run.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=42
)

In [95]:
# Check the size of the training split as (rows, columns).
x_train.shape

(2832, 68)

In [102]:
# Fit each candidate on the training split and report its mean squared error
# on the held-out split.
for clf_name, clf in classifiers:
    pipe = make_pipeline(clf)  # single-step pipeline wrapping the estimator
    pipe.fit(x_train, y_train)
    prediction = pipe.predict(x_test)

    # Mean of the squared residuals between predictions and true targets.
    residuals = prediction - y_test
    mse = np.square(residuals).mean()

    print('Classifer:', clf_name)
    print('Mean Square Error:', mse)

Classifer: Decision Tree
Mean Square Error: 0.012256303239506503
Classifer: SVM
Mean Square Error: 0.006825335849747837
Classifer: Linear Regression
Mean Square Error: 0.006867854243245472
Classifer: Stochastic Gradient Descent
Mean Square Error: 311.4709236409174


In [45]:
# Show the results ranked by mean test accuracy, best first.
# sort_values returns a new frame and leaves clf_table untouched, so the sorted
# frame itself must be the cell's final expression; previously the result was
# discarded and the unsorted table was displayed.
# NOTE(review): the saved output below lists model names that are not in the
# current `classifiers` list, so it appears to come from an earlier run.
clf_table.sort_values(by = [('Accuracy Mean', 'Test')], ascending = False)

Unnamed: 0_level_0,Name,Accuracy Mean,Accuracy Mean,Accuracy Std,Accuracy Std,Precision Mean,Precision Mean,Recall Mean,Recall Mean,Unnamed: 10_level_0
Unnamed: 0_level_1,Unnamed: 1_level_1,Train,Test,Train,Test,Train,Test,Train,Test,Name
0,,,,,,,,,,Logistic Regression
1,,,,,,,,,,Random Forest
2,,,,,,,,,,SVM
3,,,,,,,,,,Linear SVM
4,,,,,,,,,,Naive Bayes
5,,,,,,,,,,K Nearest
