In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the feature table, discard columns that hold no values at all, then
# discard every remaining row that still contains a missing value.
df = (
    pd.read_csv("features_df.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,actual,estimate,period,symbol,year,quarter,beat,f1_pup,f2_vup,f3_bothup,f4_average_price
0,1.0,1.13,2019-12-31,YUM,2019,4,0,0.390625,0.4375,0.1875,103.581563
1,0.8,0.94,2019-09-30,YUM,2019,3,0,0.46875,0.46875,0.171875,114.279531
2,0.93,0.87,2019-06-30,YUM,2019,2,1,0.619048,0.460317,0.253968,103.85381
3,0.82,0.81,2019-03-31,YUM,2019,1,1,0.57377,0.557377,0.327869,94.701148
4,0.4,0.9485,2018-12-31,YUM,2018,4,0,0.52381,0.444444,0.285714,89.395556


In [3]:
# Number of rows left after the missing-value cleanup.
df.shape[0]

3600

In [4]:
# Columns of interest: symbol/year/quarter, the four f* features, the `beat`
# column and the estimate.
selected_columns = [
    "symbol", "year", "quarter",
    "f1_pup", "f2_vup", "f3_bothup", "f4_average_price",
    "beat", "estimate",
]
newdf = df[selected_columns]

In [5]:
# Chronological split: rows with year <= 2016 form the training set and rows
# with year >= 2017 form the test set.
# The masks are built from newdf itself (not from df) so the selection does
# not depend on two separate frames sharing an identical index.
df_train = newdf.loc[newdf["year"] <= 2016]
df_test = newdf.loc[newdf["year"] >= 2017]

In [8]:
# Model inputs and the column to predict, declared once and reused for both splits.
feature_cols = ['f1_pup', 'f2_vup', 'f3_bothup', 'f4_average_price', 'estimate']
target_col = 'beat'

# Feature matrices
X_train = df_train[feature_cols]
X_test = df_test[feature_cols]

# Target vectors
y_train = df_train[target_col]
y_test = df_test[target_col]

In [9]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training features only, then apply that same
# fitted transform to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [10]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel; every other setting is left
# at the library default.
svc_settings = dict(kernel='linear')
model = SVC(**svc_settings)

In [11]:
# Fit the classifier on the scaled training features and the `beat` labels.
model.fit(X=X_train_scaled, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [12]:
# Accuracy on each split.
# The model was fitted on the min-max scaled features, so it has to be scored
# on the scaled matrices as well; passing the raw X_train / X_test would feed
# the classifier inputs on a different scale than the one it was trained on.
print(f"Training Data Score: {model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8003333333333333
Testing Data Score: 0.87


In [13]:
# Predicted labels for the scaled test features.
predictions = model.predict(X=X_test_scaled)

In [14]:
# Side-by-side table of predicted and true test labels.
comparison_columns = {"Prediction": predictions, "Actual": y_test}
dd = pd.DataFrame(comparison_columns)
dd.head()

Unnamed: 0,Prediction,Actual
0,1,0
1,1,0
2,1,1
3,1,1
4,1,0


In [15]:
# To confirm the accuracy on the testing data:
# fraction of rows where the prediction equals the actual label, computed with
# a vectorised comparison instead of a row-by-row Python loop over .iloc.
(dd["Prediction"] == dd["Actual"]).mean()

0.87

In [16]:
# Rows of the comparison table where the model predicted class 0.
predicted_zero = dd["Prediction"].eq(0)
df_check = dd.loc[predicted_zero]
df_check

Unnamed: 0,Prediction,Actual


# Hyperparameter Tuning

In [17]:
from sklearn.model_selection import GridSearchCV

# Search space: two values of C, three kernels and two gamma settings.
param_grid = {
    'C': [1, 10],
    'kernel': ['rbf', 'linear', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

# Wrap the existing classifier in a grid search; verbose=3 logs every fit.
grid = GridSearchCV(estimator=model, param_grid=param_grid, verbose=3)

In [18]:
# Train the model with GridSearch: each parameter combination is evaluated on
# the scaled training features.
grid.fit(X=X_train_scaled, y=y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] C=1, gamma=scale, kernel=rbf ....................................
[CV] ........ C=1, gamma=scale, kernel=rbf, score=0.800, total=   0.2s
[CV] C=1, gamma=scale, kernel=rbf ....................................
[CV] ........ C=1, gamma=scale, kernel=rbf, score=0.800, total=   0.2s
[CV] C=1, gamma=scale, kernel=rbf ....................................
[CV] ........ C=1, gamma=scale, kernel=rbf, score=0.801, total=   0.2s
[CV] C=1, gamma=scale, kernel=linear .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] ..... C=1, gamma=scale, kernel=linear, score=0.800, total=   0.0s
[CV] C=1, gamma=scale, kernel=linear .................................
[CV] ..... C=1, gamma=scale, kernel=linear, score=0.800, total=   0.0s
[CV] C=1, gamma=scale, kernel=linear .................................
[CV] ..... C=1, gamma=scale, kernel=linear, score=0.801, total=   0.0s
[CV] C=1, gamma=scale, kernel=sigmoid ................................
[CV] .... C=1, gamma=scale, kernel=sigmoid, score=0.800, total=   0.1s
[CV] C=1, gamma=scale, kernel=sigmoid ................................
[CV] .... C=1, gamma=scale, kernel=sigmoid, score=0.800, total=   0.1s
[CV] C=1, gamma=scale, kernel=sigmoid ................................
[CV] .... C=1, gamma=scale, kernel=sigmoid, score=0.801, total=   0.1s
[CV] C=1, gamma=auto, kernel=rbf .....................................
[CV] ......... C=1, gamma=auto, kernel=rbf, score=0.800, total=   0.1s
[CV] C=1, gamma=auto, kernel=rbf .....................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    4.4s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10], 'gamma': ['scale', 'auto'],
                         'kernel': ['rbf', 'linear', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [19]:
# Best parameter combination found by the search, followed by its score,
# one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
0.8003333333333333


In [20]:
# Predicted labels for the scaled test features, this time from the grid search object.
predictions = grid.predict(X=X_test_scaled)

In [21]:
from sklearn.metrics import classification_report

# Display names for the report rows.
# NOTE(review): assumes the labels are 0 and 1 so that 0 -> "Not Beat" and
# 1 -> "Beat" — confirm against the values in the `beat` column.
target_names = ["Not Beat", "Beat"]

# Per-class precision / recall / F1 of the grid-search predictions on the test split.
report = classification_report(y_test, predictions, target_names=target_names)
print(report)

              precision    recall  f1-score   support

    Not Beat       0.00      0.00      0.00        78
        Beat       0.87      1.00      0.93       522

    accuracy                           0.87       600
   macro avg       0.43      0.50      0.47       600
weighted avg       0.76      0.87      0.81       600



  'precision', 'predicted', average, warn_for)


In [22]:
# Accuracy of the grid-search estimator on each split.
# The grid search was fitted on the min-max scaled features, so it has to be
# scored on the scaled matrices as well; passing the raw X_train / X_test
# would feed it inputs on a different scale than the one it was trained on.
print(f"Training Data Score: {grid.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {grid.score(X_test_scaled, y_test)}")

Training Data Score: 0.8003333333333333
Testing Data Score: 0.87


In [23]:
# Side-by-side table of the grid-search predictions and the true test labels.
grid_comparison_columns = {"Prediction": predictions, "Actual": y_test}
gg = pd.DataFrame(grid_comparison_columns)
gg.head()

Unnamed: 0,Prediction,Actual
0,1,0
1,1,0
2,1,1
3,1,1
4,1,0


In [24]:
# Cross-check of the test accuracy for the grid-search predictions:
# fraction of rows where the prediction equals the actual label, computed with
# a vectorised comparison instead of a row-by-row Python loop over .iloc.
(gg["Prediction"] == gg["Actual"]).mean()

0.87