# **Stock market news feed semantic analysis** *(Baseline Classifiers)*

https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
https://colab.research.google.com/drive/1RkME2dS0imkikmKsO_lgnuozuriA65V9#scrollTo=Qk-qyHFInIOl&uniqifier=2

* https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier

* https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier

* https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC

* https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessClassifier.html#sklearn.gaussian_processGaussianProcessClassifier

* https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.kernels.RBF.html#sklearn.gaussian_process.kernels.RBF

* https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier

* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier

* https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB

* https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html#sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# classfiers import
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# others
import numpy as np
import pandas as pd
import pandas_datareader as web
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('punkt')
from nltk.tokenize import word_tokenize  
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Shuffle cycle number for the dataframe
SHUFFLE_CYCLE = 500

# Random seed
RANDOM_SEED = 1234

# Numpy random seed
NP_SEED = 1234

# Max iteration for training
MAX_ITER = 1000

# Train size
TRAIN_SPLIT = 0.75

# Test size
TEST_SPLIT = 0.25

np.random.seed(NP_SEED)

## **Preproces**

In [4]:
# Load and prerpocess just like in logistic regression baseline

# Copy the dataset to the local environment
!cp "/content/drive/MyDrive/Combined_News_DJIA.csv" "Combined_News_DJIA.csv"

# Number of merged news into one string
ROWS = 8

# Load the dataset 
df_combined = pd.read_csv('Combined_News_DJIA.csv', index_col = "Date")

# Load the stock data
df_stock = web.DataReader("DJIA", data_source="yahoo", start="2008-08-08", 
                          end="2016-07-01")

temp_day = []

for day in range(len(df_stock)):
    temp_day.append(df_stock.index[day].date())

df_stock.index = temp_day

difference = []

if len(df_combined) == len(df_stock):
    print("The lengths are the same!")

for day in range(max(len(df_combined), len(df_stock))):
    if str(df_combined.index[day]) != str(df_stock.index[day]):
        print("There is difference at: " + str(day) + " index")
        print("News: " + str(df_combined.index[day]) + "\tStock: " + str(df_stock.index[day]))
        difference.append(day)

if len(difference) is 0:
    print("The dates matched!")

difference = []

for day in range(len(df_stock)):
    # label should be 1 -> rise
    if int(df_stock["Adj Close"][day]) >= int(df_stock["Adj Close"][day - 1]):
        if df_combined["Label"][day] != 1:
            difference.append(str(df_stock.index[day]))
            print("Problem at day " + str(df_stock.index[day]))
            print("Today: " + str(df_stock["Adj Close"][day]) +"\t\tYesterday: " + str(df_stock["Adj Close"][day - 1]) + "\t\tLabel: " + str(df_combined["Label"][day]) + "\n")

    # label should be 0 -> fall
    if int(df_stock["Adj Close"][day]) < int(df_stock["Adj Close"][day - 1]):
        if df_combined["Label"][day] != 0:
            difference.append(str(df_stock.index[day]))
            print("Problem at day " + str(df_stock.index[day]))
            print("Today: " + str(df_stock["Adj Close"][day]) +"\t\tYesterday: " + str(df_stock["Adj Close"][day - 1]) + "\t\tLabel: " + str(df_combined["Label"][day]) + "\n")

print("All differences: " + str(len(difference)))  

# correct the wrong labels
for row in difference:
    if df_combined.loc[row, "Label"] == 0:
        df_combined.loc[row, "Label"] = 1
    else:
        df_combined.loc[row, "Label"] = 0

# check them
for row in difference:
    print(str(row) + "\t\t" + str(df_combined.loc[row, "Label"]))

# Find the cells with NaN and after the rows for them
is_NaN = df_combined.isnull()
row_has_NaN = is_NaN.any(axis = 1)
rows_with_NaN = df_combined[row_has_NaN]

# Replace them
df_combined = df_combined.replace(np.nan, " ")

# Check the process
is_NaN = df_combined.isnull()
row_has_NaN = is_NaN.any(axis = 1)
rows_with_NaN = df_combined[row_has_NaN]

assert len(rows_with_NaN) is 0

# Get column names
combined_column_names = []
for column in df_combined.columns:
  combined_column_names.append(column)

# 2D array creation for the news based on macros
COLUMNS = len(df_combined)
news_sum = [[0 for i in range(COLUMNS)] for j in range(int((len(combined_column_names) - 1) / ROWS))]  

# Show the column names
print("Column names of the dataset:") 
print(combined_column_names)

# Merge the news
for row in range(len(df_combined)):
  for column in range(int((len(combined_column_names) - 1) / ROWS)):
    temp = ""
    news = ""
    for word in range(ROWS):
      news = df_combined[combined_column_names[(column * ROWS) + (word + 1)]][row]
      # Remove the b character at the begining of the string
      if news[0] is "b":
        news = " " + news[1:]
      temp = temp + " " + news
    news_sum[column][row] = temp

# Show the first day second package of the news
print("\nThe first day second package of the news:")
print(news_sum[0][0])

# Drop the old columns
for column in range(len(combined_column_names) - 1):
  df_combined.drop(combined_column_names[column + 1], axis = 1, inplace = True)

# Create the new columns with the merged news
for column in range(int((len(combined_column_names) - 1) / ROWS)):
  colum_name = "News_" + str(column + 1)
  df_combined[colum_name] = news_sum[column]

# The label column 
LABEL_COLUMN = 0

news_sum = []
label_sum = []

# Get the column names
combined_column_names = []
for column in df_combined.columns:
  combined_column_names.append(column)

# Write out the column names 
print(combined_column_names)
print("\n")

# Connect the merged news with the labels
for column in range(len(df_combined)):
  for row in range(len(combined_column_names) - 1):
    news_sum.append(df_combined[combined_column_names[row + 1]][column])
    label_sum.append(df_combined[combined_column_names[LABEL_COLUMN]][column])

# Create the new DataFrame
df_sum_news_labels = pd.DataFrame(data = label_sum, index = None, columns = ["Label"])
df_sum_news_labels["News"] = news_sum  

# Removing punctuations
temp_news = []
for line in news_sum:
  temp_attach = ""
  for word in line:
    temp = " "
    if word not in string.punctuation:
      temp = word
    temp_attach = temp_attach + "".join(temp)
  temp_news.append(temp_attach)

news_sum = temp_news
temp_news = []

# Remove numbers
for line in news_sum:
  temp_attach = ""
  for word in line:
    temp = " "
    if not word.isdigit():
      temp = word
    temp_attach = temp_attach + "".join(temp)
  temp_news.append(temp_attach)

# Remove space
for line in range(len(temp_news)):    
  temp_news[line] = " ".join(temp_news[line].split())

# Converting headlines to lower case
for line in range(len(temp_news)): 
    temp_news[line] = temp_news[line].lower()

# Update the data frame
df_sum_news_labels["News"] = temp_news

# Load the stop words
stop_words = set(stopwords.words('english'))

filtered_sentence = []
news_sum = df_sum_news_labels["News"]

# Remove stop words
for line in news_sum:
  word_tokens = word_tokenize(line)
  temp_attach = ""
  for word in word_tokens:
    temp = " "
    if not word in stop_words:
      temp = temp + word
    temp_attach = temp_attach + "".join(temp)
  filtered_sentence.append(temp_attach)

# Remove space
for line in range(len(filtered_sentence)):    
  filtered_sentence[line] = " ".join(filtered_sentence[line].split())

# Update the data frame
df_sum_news_labels["News"] = filtered_sentence

news_sum = df_sum_news_labels["News"]
null_indexes = []
index = 0

for line in news_sum:
  if line is "":
    null_indexes.append(index)
  index = index + 1

print(null_indexes)

for row in null_indexes:
  df_sum_news_labels = df_sum_news_labels.drop(row)

news_sum = df_sum_news_labels["News"]
null_indexes = []
index = 0

for line in news_sum:
  if line is "":
    null_indexes.append(index)
  index = index + 1
  
assert len(null_indexes) is 0

# Do the shuffle
for i in range(SHUFFLE_CYCLE):
  df_sum_news_labels = shuffle(df_sum_news_labels, random_state = RANDOM_SEED)

# Reset the index
df_sum_news_labels.reset_index(inplace=True, drop=True)

# Show the data frame
print(df_sum_news_labels.head())

INPUT_SIZE = len(df_sum_news_labels)
TRAIN_SIZE = int(TRAIN_SPLIT * INPUT_SIZE) 
TEST_SIZE = int(TEST_SPLIT * INPUT_SIZE)

# Split the dataset
train = df_sum_news_labels[:TRAIN_SIZE] 
test = df_sum_news_labels[TRAIN_SIZE:]

# Print out the length
print("Train data set length: " + str(len(train)))
print("Test data set length: " + str(len(test)))
print("Split summa: " + str(len(train) + len(test)))
print("Dataset summa before split: " + str(len(df_sum_news_labels)))

# check
split_sum = len(train) + len(test)
sum = len(df_sum_news_labels)
assert split_sum == sum

print(train.tail(1))

print(test.head(1))

The lengths are the same!
The dates matched!
Problem at day 2010-10-14
Today: 11096.919921875		Yesterday: 11096.080078125		Label: 0

Problem at day 2012-11-12
Today: 12815.080078125		Yesterday: 12815.3896484375		Label: 0

Problem at day 2012-11-15
Today: 12570.9501953125		Yesterday: 12570.9501953125		Label: 0

Problem at day 2013-04-12
Today: 14865.0595703125		Yesterday: 14865.1396484375		Label: 0

Problem at day 2014-04-24
Today: 16501.650390625		Yesterday: 16501.650390625		Label: 0

Problem at day 2015-08-12
Today: 17402.509765625		Yesterday: 17402.83984375		Label: 0

Problem at day 2015-11-27
Today: 17813.390625		Yesterday: 17813.390625		Label: 0

All differences: 7
2010-10-14		1
2012-11-12		1
2012-11-15		1
2013-04-12		1
2014-04-24		1
2015-08-12		1
2015-11-27		1
Column names of the dataset:
['Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top2

In [5]:
train["News"][0]

'air france aircraft carrying people disappeared radar atlantic ocean brazil tell difference israeli palestinian former german mp judge offers reward prosecution bush cheney rumsfeld blair north korea starts landing exercises using amphibious vessels may planning attack south korean island president el salvador sends son france escape violence native el salvador son stabbed death awl parisian bridge random act violence apparent motive indonesian model married malaysian prince says kidnapped drugged sexually abused royal family escapes help singaporean police attack liberty untold story israel deadly assault u spy ship book review el salvador first leftist president takes power hillary clinton attended inauguration'

In [6]:
train_X = train["News"]
train_X = train_X.to_numpy()
train_Y = train["Label"]
train_Y = train_Y.to_numpy()

test_X = test["News"]
test_X = test_X.to_numpy()
test_Y = test["Label"]
test_Y = test_Y.to_numpy()

In [7]:
train_X[0]

'air france aircraft carrying people disappeared radar atlantic ocean brazil tell difference israeli palestinian former german mp judge offers reward prosecution bush cheney rumsfeld blair north korea starts landing exercises using amphibious vessels may planning attack south korean island president el salvador sends son france escape violence native el salvador son stabbed death awl parisian bridge random act violence apparent motive indonesian model married malaysian prince says kidnapped drugged sexually abused royal family escapes help singaporean police attack liberty untold story israel deadly assault u spy ship book review el salvador first leftist president takes power hillary clinton attended inauguration'

In [8]:
train_Y[0]

1

In [9]:
test_X[0]

'friend honduras sent email videos pictures military firing peaceful protesters uploaded eyewitness account everyone see please let disappear one else reporting watch israeli tv footage settlers attacking r npeace activists tv crew iran people hanged less week girls tricked us sex tarot card iranian reformists planning strike please let know watching fewer us killed america stops paying attention regime loses restraint mir hossein moussavi mehdi karroubi amp mohammad khatami joint statement iranian opposition leaders criticized describe security state asking arrested released official internet cut xinjiang prevent riot spreading'

In [10]:
test_Y[0]

0

## **Classifiers try out**

In [11]:
names = ["Logistic Regression", "Nearest Neighbors", "Linear SVC", 
         "Linear SVC 2", "SVC", "Decision Tree", "Random Forest", "Neural Net"]

classifiers = [
    LogisticRegression(random_state=RANDOM_SEED, max_iter=MAX_ITER),
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=RANDOM_SEED),
    LinearSVC(random_state=RANDOM_SEED),
    SVC(gamma=2, C=1, random_state=RANDOM_SEED),
    DecisionTreeClassifier(max_depth=10, random_state=RANDOM_SEED),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=RANDOM_SEED),
    MLPClassifier(alpha=1, max_iter=10, random_state=RANDOM_SEED)]

In [12]:
text_clf_all = []

for classifier in classifiers:
    text_clf_temp = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                        ('tfidf', TfidfTransformer()),
                        ('clf', classifier),
    ])
    text_clf_all.append(text_clf_temp)

In [13]:
score_array = []

# iterate over classifiers
for name, text_clf in zip(names, text_clf_all):
    print(name)
    text_clf = text_clf.fit(train_X, train_Y)

    prediction = text_clf.predict(test_X)

    print(classification_report(test_Y, prediction))
    print(accuracy_score(test_Y, prediction))

    print("------------------------------------------------------------------")

    score = text_clf.score(test_X, test_Y)
    score_array.append(score)

Logistic Regression
              precision    recall  f1-score   support

           0       0.45      0.28      0.34       632
           1       0.59      0.75      0.66       860

    accuracy                           0.55      1492
   macro avg       0.52      0.51      0.50      1492
weighted avg       0.53      0.55      0.52      1492

0.5495978552278821
------------------------------------------------------------------
Nearest Neighbors
              precision    recall  f1-score   support

           0       0.48      0.53      0.50       632
           1       0.62      0.58      0.60       860

    accuracy                           0.56      1492
   macro avg       0.55      0.55      0.55      1492
weighted avg       0.56      0.56      0.56      1492

0.5563002680965148
------------------------------------------------------------------
Linear SVC


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       632
           1       0.58      1.00      0.73       860

    accuracy                           0.58      1492
   macro avg       0.29      0.50      0.37      1492
weighted avg       0.33      0.58      0.42      1492

0.5764075067024129
------------------------------------------------------------------
Linear SVC 2
              precision    recall  f1-score   support

           0       0.44      0.43      0.43       632
           1       0.59      0.59      0.59       860

    accuracy                           0.52      1492
   macro avg       0.51      0.51      0.51      1492
weighted avg       0.52      0.52      0.52      1492

0.5227882037533512
------------------------------------------------------------------
SVC
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       632
           1       0.58      1.00      0.73     

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       632
           1       0.58      1.00      0.73       860

    accuracy                           0.58      1492
   macro avg       0.29      0.50      0.37      1492
weighted avg       0.33      0.58      0.42      1492

0.5764075067024129
------------------------------------------------------------------


## **Classifiers grid search** (commented out, saved to onenote)

In [14]:
#text_clf_all[0]

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
  

In [None]:
# try out with logistic regression
#parameters_svm = {
#                  'vect__ngram_range': [(1,1), (1,2), (1,3), (2,3), (3,3)],
#                  'tfidf__use_idf': (True, False),
#                  'tfidf__norm': ('l1', 'l2'),     
#                  'clf__penalty': ('l1', 'l2', 'elasticnet', 'none'),   
#                  'clf__C': np.linspace(start=0, stop=10, num=101), 
#                  'clf__solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
#                  'clf__l1_ratio': np.linspace(0, 1, 101)
#}

#keys = parameters_svm.keys()

#parameters_array = []

#for key in keys:
#      dict_temp = {key:parameters_svm[key]}
#      parameters_array.append(dict_temp)

#best_params_array = []
#df_cv_results_array = []
#score_array = []

#for parameter_svm in parameters_array:
#    gs_clf = GridSearchCV(text_clf_all[0], parameter_svm, n_jobs=-1, verbose=4, cv=3)
#    gs_clf = gs_clf.fit(train_X, train_Y)

#    print("")
#    print(gs_clf.best_score_)
#    print("")
#    print(gs_clf.best_params_)
#    best_params_array.append(gs_clf.best_params_)

#    df_cv_results_array.append(pd.DataFrame(gs_clf.cv_results_))

#    prediction = gs_clf.predict(test_X)

#    print(classification_report(test_Y, prediction))
#    print(accuracy_score(test_Y, prediction))
#    score_array.append(accuracy_score(test_Y, prediction))

#    print("------------------------------------------------------------------")

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   36.5s finished



0.5356440100238008

{'vect__ngram_range': (2, 3)}
              precision    recall  f1-score   support

           0       0.38      0.06      0.10       632
           1       0.57      0.93      0.71       860

    accuracy                           0.56      1492
   macro avg       0.47      0.49      0.40      1492
weighted avg       0.49      0.56      0.45      1492

0.5603217158176944
------------------------------------------------------------------
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   16.0s finished



0.5325162173517722

{'tfidf__use_idf': True}
              precision    recall  f1-score   support

           0       0.45      0.28      0.34       632
           1       0.59      0.75      0.66       860

    accuracy                           0.55      1492
   macro avg       0.52      0.51      0.50      1492
weighted avg       0.53      0.55      0.52      1492

0.5495978552278821
------------------------------------------------------------------
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   13.4s finished



0.5325162173517722

{'tfidf__norm': 'l2'}
              precision    recall  f1-score   support

           0       0.45      0.28      0.34       632
           1       0.59      0.75      0.66       860

    accuracy                           0.55      1492
   macro avg       0.52      0.51      0.50      1492
weighted avg       0.53      0.55      0.52      1492

0.5495978552278821
------------------------------------------------------------------
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   23.0s finished



0.5325162173517722

{'clf__penalty': 'l2'}
              precision    recall  f1-score   support

           0       0.45      0.28      0.34       632
           1       0.59      0.75      0.66       860

    accuracy                           0.55      1492
   macro avg       0.52      0.51      0.50      1492
weighted avg       0.53      0.55      0.52      1492

0.5495978552278821
------------------------------------------------------------------
Fitting 3 folds for each of 101 candidates, totalling 303 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   42.9s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 303 out of 303 | elapsed: 13.6min finished



0.5363144011522217

{'clf__C': 0.8}
              precision    recall  f1-score   support

           0       0.44      0.22      0.29       632
           1       0.58      0.79      0.67       860

    accuracy                           0.55      1492
   macro avg       0.51      0.51      0.48      1492
weighted avg       0.52      0.55      0.51      1492

0.5495978552278821
------------------------------------------------------------------
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   29.6s finished



0.5325162173517722

{'clf__solver': 'newton-cg'}
              precision    recall  f1-score   support

           0       0.45      0.28      0.34       632
           1       0.59      0.75      0.66       860

    accuracy                           0.55      1492
   macro avg       0.52      0.51      0.50      1492
weighted avg       0.53      0.55      0.52      1492

0.5495978552278821
------------------------------------------------------------------
Fitting 3 folds for each of 101 candidates, totalling 303 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   53.0s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 303 out of 303 | elapsed: 12.2min finished
  "(penalty={})".format(self.penalty))



0.5325162173517722

{'clf__l1_ratio': 0.0}
              precision    recall  f1-score   support

           0       0.45      0.28      0.34       632
           1       0.59      0.75      0.66       860

    accuracy                           0.55      1492
   macro avg       0.52      0.51      0.50      1492
weighted avg       0.53      0.55      0.52      1492

0.5495978552278821
------------------------------------------------------------------


In [None]:
#best_params_array

[{'vect__ngram_range': (2, 3)},
 {'tfidf__use_idf': True},
 {'tfidf__norm': 'l2'},
 {'clf__penalty': 'l2'},
 {'clf__C': 0.8},
 {'clf__solver': 'newton-cg'},
 {'clf__l1_ratio': 0.0}]

In [None]:
#test_clf = Pipeline([('vect', CountVectorizer(ngram_range=(2,3))),
#                    ('tfidf', TfidfTransformer(use_idf=True, norm='l2')),
#                    ('clf', LogisticRegression(penalty='l2', C=0.8, solver='newton-cg',
#                                               l1_ratio=0, random_state=RANDOM_SEED, max_iter=MAX_ITER),),
#])

#test_clf = test_clf.fit(train_X, train_Y)

#prediction = test_clf.predict(test_X)

#print(classification_report(test_Y, prediction))
#print(accuracy_score(test_Y, prediction))

  "(penalty={})".format(self.penalty))


              precision    recall  f1-score   support

           0       0.38      0.03      0.06       632
           1       0.57      0.96      0.72       860

    accuracy                           0.57      1492
   macro avg       0.48      0.50      0.39      1492
weighted avg       0.49      0.57      0.44      1492

0.5670241286863271


## **Vote classifier**

In [22]:
names = ["Logistic Regression", "Nearest Neighbors", 
         "Linear SVC 2"]

classifiers = [
    LogisticRegression(random_state=RANDOM_SEED, max_iter=MAX_ITER),
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=10, random_state=RANDOM_SEED)
    ]

In [23]:
# soft vs hard
estimators_tuple = []
for name, classifier in zip(names, classifiers):
    estimators_tuple.append((name, classifier))

estimators_tuple = tuple(estimators_tuple)  

voting_array = ['hard', 'soft']
eclf_array = []
text_eclf_array = []

for index in range(2):
    eclf = VotingClassifier(estimators=estimators_tuple, voting=voting_array[index])
    eclf_array.append(eclf)
    text_eclf_array.append(Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                              ('tfidf', TfidfTransformer()),
                              ('clf', eclf),
    ]))

score_vote_array = []

for index in range(2):
    text_eclf_array[index] = text_eclf_array[index].fit(train_X, train_Y)

    prediction = text_eclf_array[index].predict(test_X)

    print(classification_report(test_Y, prediction))
    print(accuracy_score(test_Y, prediction))
    print("------------------------------------------------------------------")   

    score_vote_array.append(accuracy_score(test_Y, prediction))

              precision    recall  f1-score   support

           0       0.45      0.23      0.30       632
           1       0.58      0.79      0.67       860

    accuracy                           0.55      1492
   macro avg       0.51      0.51      0.49      1492
weighted avg       0.53      0.55      0.52      1492

0.5529490616621984
------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.47      0.49      0.48       632
           1       0.61      0.59      0.60       860

    accuracy                           0.55      1492
   macro avg       0.54      0.54      0.54      1492
weighted avg       0.55      0.55      0.55      1492

0.546916890080429
------------------------------------------------------------------


In [29]:
# play with weights
# soft
text_eclf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                          ('tfidf', TfidfTransformer()),
                          ('clf', VotingClassifier(estimators=estimators_tuple, voting='soft', weights=None)),
])

# 0, 0.5, 1.0, 1.5, 2.0 -> 5*5*5 = 125
weight_element = [0, 0.5, 1.0, 1.5, 2.0]

weight_element = [0, 0.5, 1.0, 1.5, 2.0]
weight_elemet_array = []

for i in range(len(weight_element)):
    for j in range(len(weight_element)):
        for k in range(len(weight_element)):
            if weight_element[i] == weight_element[j] == 0:
                pass
            elif weight_element[i] == weight_element[k] == 0:
                pass
            elif weight_element[j] == weight_element[k] == 0:
                pass
            else:             
                weight_elemet_array.append((weight_element[i],weight_element[j],weight_element[k]),)

weight_elemet_array

parameters_svm = {
                  'clf__weights': weight_elemet_array,
}

best_params = []
df_cv_results = []
score = []

gs_clf = GridSearchCV(text_eclf, parameters_svm, n_jobs=-1, verbose=4, cv=3)
gs_clf = gs_clf.fit(train_X, train_Y)

print("")
print(gs_clf.best_score_)
print("")
print(gs_clf.best_params_)
best_params = gs_clf.best_params_

df_cv_results = pd.DataFrame(gs_clf.cv_results_)

prediction = gs_clf.predict(test_X)

print(classification_report(test_Y, prediction))
print(accuracy_score(test_Y, prediction))
score = accuracy_score(test_Y, prediction)

Fitting 3 folds for each of 112 candidates, totalling 336 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 336 out of 336 | elapsed: 22.0min finished



0.5472634736030121

{'clf__weights': (2.0, 0.5, 0)}
              precision    recall  f1-score   support

           0       0.47      0.41      0.44       632
           1       0.60      0.66      0.63       860

    accuracy                           0.55      1492
   macro avg       0.54      0.53      0.53      1492
weighted avg       0.55      0.55      0.55      1492

0.5536193029490617


In [30]:
# play with weights
# hard
text_eclf= Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                          ('tfidf', TfidfTransformer()),
                          ('clf', VotingClassifier(estimators=estimators_tuple, voting='hard', weights=None)),
])

# 0, 0.5, 1.0, 1.5, 2.0 -> 5*5*5 = 125
weight_element = [0, 0.5, 1.0, 1.5, 2.0]

weight_element = [0, 0.5, 1.0, 1.5, 2.0]
weight_elemet_array = []

for i in range(len(weight_element)):
    for j in range(len(weight_element)):
        for k in range(len(weight_element)):
            if weight_element[i] == weight_element[j] == 0:
                pass
            elif weight_element[i] == weight_element[k] == 0:
                pass
            elif weight_element[j] == weight_element[k] == 0:
                pass
            else:             
                weight_elemet_array.append((weight_element[i],weight_element[j],weight_element[k]),)

weight_elemet_array

parameters_svm = {
                  'clf__weights': weight_elemet_array,
}

best_params = []
df_cv_results = []
score = []

gs_clf = GridSearchCV(text_eclf, parameters_svm, n_jobs=-1, verbose=4, cv=3)
gs_clf = gs_clf.fit(train_X, train_Y)

print("")
print(gs_clf.best_score_)
print("")
print(gs_clf.best_params_)
best_params = gs_clf.best_params_

df_cv_results = pd.DataFrame(gs_clf.cv_results_)

prediction = gs_clf.predict(test_X)

print(classification_report(test_Y, prediction))
print(accuracy_score(test_Y, prediction))
score = accuracy_score(test_Y, prediction)

Fitting 3 folds for each of 112 candidates, totalling 336 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 336 out of 336 | elapsed: 22.1min finished



0.5436873549908326

{'clf__weights': (0, 1.0, 0.5)}
              precision    recall  f1-score   support

           0       0.48      0.53      0.50       632
           1       0.62      0.58      0.60       860

    accuracy                           0.56      1492
   macro avg       0.55      0.55      0.55      1492
weighted avg       0.56      0.56      0.56      1492

0.5563002680965148
