

# SVC + Grid Search

In [None]:
# !pip install imblearn

In [None]:
# !pip install -U threadpoolctl

In [None]:
# !pip install imbalanced-learn

In [None]:
# !pip install scikit-learn --user

In [1]:
# Import dependencies
import gc
import time
import numpy as np
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

### Model training and hyperparameter tuning

In [2]:
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

In [3]:
# Read Data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train_labels = pd.read_csv("train_labels.csv")
test_labels = pd.read_csv("test_labels.csv")

In [19]:
train.shape, train_labels.shape

((783, 30), (783, 1))

In [20]:
train

Unnamed: 0,CoefficientVariationofStockPrice,PercentageChangefromprevreportedEPS,PriceChange,QuestionsSentiment,ConferenceSentiment,Has_QnA,Reddit_Sentiment,RedditSD,TwitterSentiment,TwitterSD,...,Energy,Financial Services,Healthcare,Industrials,Real Estate,Technology,quarter_1,quarter_2,quarter_3,quarter_4
0,0.307267,-0.557576,1.409556,-0.714286,0.578947,1,0.068178,0.423146,0.219354,0.441965,...,0,0,0,0,0,0,1.0,0.0,0.0,0.0
1,0.242076,0.306452,1.378303,-0.666667,0.466667,1,0.062141,0.433917,0.082585,0.451483,...,1,0,0,0,0,0,1.0,0.0,0.0,0.0
2,0.043960,0.057471,-0.009751,-0.750000,0.652174,1,0.000000,0.000000,0.415066,0.440203,...,0,0,0,0,0,0,1.0,0.0,0.0,0.0
3,0.106001,0.000000,-0.025346,-0.714286,0.566667,1,0.219251,0.441184,0.145072,0.412363,...,0,0,0,0,0,1,0.0,0.0,0.0,1.0
4,0.036320,-0.256881,-0.100652,-0.272727,0.761905,1,0.203557,0.318592,0.213863,0.434037,...,0,0,0,0,0,1,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,0.078014,-0.906250,0.114644,-0.333333,0.555556,1,0.089879,0.420355,0.032813,0.309138,...,0,0,0,0,0,0,0.0,1.0,0.0,0.0
779,0.105664,0.105263,-0.402886,-0.272727,0.696970,1,0.141945,0.422794,0.138581,0.485417,...,0,0,0,0,0,1,0.0,0.0,1.0,0.0
780,0.218293,-0.257143,0.484055,0.200000,0.673469,1,0.345041,0.405775,0.127045,0.399143,...,0,0,0,0,0,1,1.0,0.0,0.0,0.0
781,0.096284,-0.536232,0.192951,-0.500000,0.777778,1,0.000000,0.177206,0.264362,0.402587,...,0,0,0,0,0,0,1.0,0.0,0.0,0.0


In [21]:
train_labels

Unnamed: 0,EPSBeats
0,0
1,1
2,1
3,0
4,1
...,...
778,0
779,1
780,1
781,1


In [22]:
test.shape, test_labels.shape

((196, 30), (196, 1))

In [23]:
train.columns #300 features selected, customer_ID and target

Index(['CoefficientVariationofStockPrice',
       'PercentageChangefromprevreportedEPS', 'PriceChange',
       'QuestionsSentiment', 'ConferenceSentiment', 'Has_QnA',
       'Reddit_Sentiment', 'RedditSD', 'TwitterSentiment', 'TwitterSD',
       'SocialMediaCovariance', 'TwitterSkew', 'TwitterKurtosis', 'RedditSkew',
       'RedditKurtosis', 'EPSEstimate', 'Has_Reddit', 'Communication Services',
       'Consumer Cyclical', 'Consumer Defensive', 'Energy',
       'Financial Services', 'Healthcare', 'Industrials', 'Real Estate',
       'Technology', 'quarter_1', 'quarter_2', 'quarter_3', 'quarter_4'],
      dtype='object')

In [24]:
test.columns #300 features selected, customer_ID

Index(['CoefficientVariationofStockPrice',
       'PercentageChangefromprevreportedEPS', 'PriceChange',
       'QuestionsSentiment', 'ConferenceSentiment', 'Has_QnA',
       'Reddit_Sentiment', 'RedditSD', 'TwitterSentiment', 'TwitterSD',
       'SocialMediaCovariance', 'TwitterSkew', 'TwitterKurtosis', 'RedditSkew',
       'RedditKurtosis', 'EPSEstimate', 'Has_Reddit', 'Communication Services',
       'Consumer Cyclical', 'Consumer Defensive', 'Energy',
       'Financial Services', 'Healthcare', 'Industrials', 'Real Estate',
       'Technology', 'quarter_1', 'quarter_2', 'quarter_3', 'quarter_4'],
      dtype='object')

In [5]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.combine import SMOTEENN

In [6]:
from sklearn.svm import SVC

In [7]:
smote = SMOTE(random_state = 1, sampling_strategy=1.0)
enn = EditedNearestNeighbours(n_neighbors=5, sampling_strategy = 'majority')

In [8]:
# Pipeline to deal with imbalanced dataset
model_svc = Pipeline([('smt', smote),('enn', enn),('SVC',SVC(max_iter = 1000))])

In [17]:
model_svc_without_balancing = Pipeline([('SVC',SVC(max_iter = 1000, random_state = 42, kernel= 'poly'))])

In [10]:
model_svc

In [34]:
train.columns

Index(['CoefficientVariationofStockPrice',
       'PercentageChangefromprevreportedEPS', 'PriceChange',
       'QuestionsSentiment', 'ConferenceSentiment', 'Has_QnA',
       'Reddit_Sentiment', 'RedditSD', 'TwitterSentiment', 'TwitterSD',
       'SocialMediaCovariance', 'TwitterSkew', 'TwitterKurtosis', 'RedditSkew',
       'RedditKurtosis', 'EPSEstimate', 'Has_Reddit', 'Communication Services',
       'Consumer Cyclical', 'Consumer Defensive', 'Energy',
       'Financial Services', 'Healthcare', 'Industrials', 'Real Estate',
       'Technology', 'quarter_1', 'quarter_2', 'quarter_3', 'quarter_4'],
      dtype='object')

In [11]:
def param_grid_search(train, model, train_labels = train_labels):

    # Step 1.Create parameter space
    print('param_grid_search')
    features = train.columns.tolist() # obtain features
    
    # add more possible parameters and values
    parameter_space = {'SVC__C': [int(x) for x in np.linspace(start = 0.01, stop = 1000, num = 50)]}
    
    # Step 2: Hyperparameter Tuning
    print("Tuning hyper-parameters for mse")
    grid = GridSearchCV(model, parameter_space, cv=10, scoring="accuracy",refit = True)
    print(train[features].shape, train_labels.shape)
    grid.fit(train[features].values, train_labels.values)
    
    # Step 3.Show results
    print("best_params_:")
    print(grid.best_params_)
    means = grid.cv_results_["mean_test_score"]
    stds = grid.cv_results_["std_test_score"]
    
    for mean, std, params in zip(means, stds, grid.cv_results_["params"]):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    return grid

In [55]:
grid = param_grid_search(train, model = model_svc)

param_grid_search
Tuning hyper-parameters for mse
(783, 30) (783, 1)
best_params_:
{'SVC__C': 632}
nan (+/-nan) for {'SVC__C': 0}
0.691 (+/-0.053) for {'SVC__C': 20}
0.684 (+/-0.049) for {'SVC__C': 40}
0.686 (+/-0.052) for {'SVC__C': 61}
0.686 (+/-0.051) for {'SVC__C': 81}
0.683 (+/-0.049) for {'SVC__C': 102}
0.681 (+/-0.053) for {'SVC__C': 122}
0.678 (+/-0.044) for {'SVC__C': 142}
0.678 (+/-0.055) for {'SVC__C': 163}
0.683 (+/-0.053) for {'SVC__C': 183}
0.681 (+/-0.048) for {'SVC__C': 204}
0.677 (+/-0.044) for {'SVC__C': 224}
0.674 (+/-0.051) for {'SVC__C': 244}
0.677 (+/-0.053) for {'SVC__C': 265}
0.681 (+/-0.053) for {'SVC__C': 285}
0.676 (+/-0.059) for {'SVC__C': 306}
0.690 (+/-0.043) for {'SVC__C': 326}
0.682 (+/-0.048) for {'SVC__C': 346}
0.679 (+/-0.059) for {'SVC__C': 367}
0.685 (+/-0.045) for {'SVC__C': 387}
0.686 (+/-0.054) for {'SVC__C': 408}
0.686 (+/-0.057) for {'SVC__C': 428}
0.687 (+/-0.062) for {'SVC__C': 448}
0.693 (+/-0.038) for {'SVC__C': 469}
0.690 (+/-0.044) for {'

In [18]:
grid_without_balancing = param_grid_search(train, model = model_svc_without_balancing)

param_grid_search
Tuning hyper-parameters for mse
(783, 30) (783, 1)
best_params_:
{'SVC__C': 20}
nan (+/-nan) for {'SVC__C': 0}
0.676 (+/-0.065) for {'SVC__C': 20}
0.667 (+/-0.049) for {'SVC__C': 40}
0.667 (+/-0.044) for {'SVC__C': 61}
0.632 (+/-0.192) for {'SVC__C': 81}
0.670 (+/-0.052) for {'SVC__C': 102}
0.646 (+/-0.110) for {'SVC__C': 122}
0.659 (+/-0.063) for {'SVC__C': 142}
0.664 (+/-0.052) for {'SVC__C': 163}
0.670 (+/-0.041) for {'SVC__C': 183}
0.615 (+/-0.211) for {'SVC__C': 204}
0.626 (+/-0.215) for {'SVC__C': 224}
0.630 (+/-0.206) for {'SVC__C': 244}
0.641 (+/-0.112) for {'SVC__C': 265}
0.665 (+/-0.048) for {'SVC__C': 285}
0.669 (+/-0.055) for {'SVC__C': 306}
0.623 (+/-0.214) for {'SVC__C': 326}
0.609 (+/-0.215) for {'SVC__C': 346}
0.663 (+/-0.057) for {'SVC__C': 367}
0.628 (+/-0.219) for {'SVC__C': 387}
0.595 (+/-0.225) for {'SVC__C': 408}
0.557 (+/-0.298) for {'SVC__C': 428}
0.623 (+/-0.209) for {'SVC__C': 448}
0.662 (+/-0.052) for {'SVC__C': 469}
0.653 (+/-0.056) for {'S

In [19]:
grid_without_balancing.best_estimator_

In [20]:
grid_without_balancing.best_score_

0.6756410256410257

In [21]:
# Train score for accuracy, f1, roc_auc 
predictions = grid_without_balancing.best_estimator_.predict(train)
print(accuracy_score(train_labels, predictions))
print(roc_auc_score(train_labels, predictions))
print(f1_score(train_labels, predictions))

0.7662835249042146
0.6523284313725489
0.8496302382908791


In [22]:
predictions = grid_without_balancing.best_estimator_.predict(test)
print(accuracy_score(test_labels, predictions))
print(roc_auc_score(test_labels, predictions))
print(f1_score(test_labels, predictions))

0.7397959183673469
0.6055871212121212
0.8370607028753994


In [None]:
svc_pred = pd.DataFrame(predictions, columns = ['predictions'])

In [None]:
svc_pred.to_csv('model_svc_predictions.csv', index = False)