# Install and Import Required Libraries

## Install

In [None]:
!pip install hazm
!pip install stanfordnlp
!pip install -U nltk
!pip install pandas
!pip install sklearn
!pip install numpy

Collecting hazm
[?25l  Downloading https://files.pythonhosted.org/packages/22/13/5a7074bc11d20dbbb46239349ac3f85f7edc148b4cf68e9b8c2f8263830c/hazm-0.7.0-py3-none-any.whl (316kB)
[K     |████████████████████████████████| 317kB 2.8MB/s 
[?25hCollecting libwapiti>=0.2.1; platform_system != "Windows"
[?25l  Downloading https://files.pythonhosted.org/packages/bc/0f/1c9b49bb49821b5856a64ea6fac8d96a619b9f291d1f06999ea98a32c89c/libwapiti-0.2.1.tar.gz (233kB)
[K     |████████████████████████████████| 235kB 8.7MB/s 
[?25hCollecting nltk==3.3
[?25l  Downloading https://files.pythonhosted.org/packages/50/09/3b1755d528ad9156ee7243d52aa5cd2b809ef053a0f31b53d92853dd653a/nltk-3.3.0.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 12.5MB/s 
Building wheels for collected packages: libwapiti, nltk
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp36-cp36m-linux_x86_64.whl size=154256 sha256=0ccfa12f99ccb005d08483680

## Import

In [None]:
import pandas as pd
from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
import stanfordnlp
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest

  import pandas.util.testing as tm


# Import the Feature Extractor Class from Google Drive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build a PyDrive client from the
# application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download the feature-extractor module next to the notebook so it can be imported.
# The Drive file handle gets its own name: previously it was stored in
# `FeatureExtractor`, the same name that the class import below rebinds.
feature_extractor_file = drive.CreateFile({'id':'1IwEfIW-lYHvGSSr6TwMsGX3xX1Z829cP'})
feature_extractor_file.GetContentFile('psfeatureextractor.py')
from psfeatureextractor import PSFeatureExtractor as FeatureExtractor

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Common Functions and Variables

In [None]:
# File-system locations under the mounted Google Drive (/content/drive).
# The first four are passed to the FeatureExtractor constructor; the last two
# are passed to generate_Features() as the save/load directory and the
# word2vec model path.
stanford_models_path  = '/content/drive/My Drive/Stance Detection Paper/persian_stance_baseline_data/' 
dataset_path = '/content/drive/My Drive/Stance Detection Paper/HeadlineToClaim.csv'
stopWord_path = '/content/drive/My Drive/ImportantNLPFiles/StopWords_fa.txt'
polarity_dataset_path = '/content/drive/My Drive/Stance Detection Paper/PolarityDataset.xlsx'
save_load_path = "/content/drive/My Drive/Stance Detection Paper/persian_stance_baseline_data/vectors"
w2v_model_path = "/content/drive/My Drive/Stance Detection Paper/persian_stance_baseline_data/vectors/w2v_persian.pkl"

In [None]:
def k_fold_train_test(X, Y, k_fold, model, scoring = 'accuracy' , additional_description = ''):
  """Cross-validate ``model`` on (X, Y), plot the per-fold scores and print their mean.

  Args:
    X: feature matrix handed to ``cross_val_score``.
    Y: target labels handed to ``cross_val_score``.
    k_fold: value forwarded as ``cv`` to ``cross_val_score`` (a fold count or a
      cross-validation splitter).
    model: estimator to evaluate; its class name labels the plot and the report.
    scoring: scikit-learn scoring name; also used as the score column name.
    additional_description: optional text printed after the mean score.

  Returns:
    pandas.DataFrame with one row per fold and the columns
    ``model_name``, ``fold_index`` and ``<scoring>``.
  """
  model_name = model.__class__.__name__
  scores = cross_val_score(model, X, Y, scoring = scoring, cv = k_fold)

  # Build the frame from the scores that were actually returned. The previous
  # throw-away ``pd.DataFrame(index=range(k_fold))`` was never used and raised a
  # TypeError whenever ``k_fold`` was a splitter object instead of an int.
  rows = [(model_name, fold_index, score) for fold_index, score in enumerate(scores)]
  df_result = pd.DataFrame(rows, columns=['model_name', 'fold_index', scoring])

  sns.boxplot(x = 'model_name', y = scoring, data = df_result)
  sns.stripplot(x='model_name', y=scoring, data = df_result,
                size=8, jitter=True, edgecolor="gray", linewidth=2)
  plt.show()

  # Report the number of folds that were evaluated (equals k_fold when it is an int).
  print('Mean '+scoring + ' of ' + model_name + ' in ' + str(len(scores)) + ' fold is: ', np.average(scores, axis=0))
  if len(additional_description)>0:
    print(additional_description)
  return df_result

In [None]:
def common_train_test(model, X, Y, test_size= 0.2, additional_description = ''):
  """Fit ``model`` on a seeded train/test split and report hold-out performance.

  Shows a confusion-matrix heatmap and prints the classification report, the
  accuracy and the weighted F1 score computed on the test part of the split.

  Args:
    model: estimator exposing ``fit`` and ``predict``.
    X: feature matrix.
    Y: target labels; its unique values define the class order in the reports.
    test_size: fraction of the samples held out for testing.
    additional_description: optional text printed after the metrics.

  Returns:
    The predictions of ``model`` for the held-out test samples.
  """
  model_name = model.__class__.__name__
  # TODO: fix the seed everywhere one is used, and persist the train/test split
  # so that an already saved split is loaded instead of being regenerated.
  X_train, X_test, y_train, y_test = train_test_split(X, Y, shuffle= True , test_size = test_size, random_state = 0)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  labels_name = np.unique(Y)
  # Pass the label set explicitly so the matrix always has one row/column per
  # class in ``labels_name`` order, matching the tick labels of the heatmap even
  # when a class is missing from the test split.
  conf_mat = confusion_matrix(y_test, y_pred, labels = labels_name)
  fig, ax = plt.subplots(figsize=(4,4))
  sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels = labels_name, yticklabels = labels_name, ax = ax)
  ax.set_ylabel('Actual')
  ax.set_xlabel('Predicted')
  print(model_name)
  plt.show()

  # ``labels`` must be given by keyword: recent scikit-learn releases accept only
  # y_true and y_pred positionally.
  print(metrics.classification_report(y_test, y_pred, labels = labels_name))
  print('accuracy : ', accuracy_score(y_test, y_pred))
  print('weighted f1 score : ', f1_score(y_test, y_pred, average='weighted'))
  if len(additional_description)>0:
    print(additional_description)
  return y_pred

In [None]:
def grid_search_train_test(all_grid_params, X, Y, test_size= 0.2, additional_description = '', randomize = False
                           , n_iter_search = 100):
  """Run a hyper-parameter search on the training part of a seeded split.

  Args:
    all_grid_params: dict with the keys ``estimator``, ``param_grid``,
      ``scoring``, ``cv``, ``verbose`` and ``n_jobs``; these are forwarded to
      ``GridSearchCV`` / ``RandomizedSearchCV``.
    X: feature matrix.
    Y: target labels.
    test_size: fraction of the samples excluded from the search.
    additional_description: optional text printed after the best estimator.
    randomize: when True use ``RandomizedSearchCV`` instead of ``GridSearchCV``.
    n_iter_search: number of sampled candidates for the randomized search.

  Returns:
    An empty string (kept for backward compatibility); the best estimator is
    reported by printing it.
  """
  estimator = all_grid_params["estimator"]
  model_name = estimator.__class__.__name__
  # Only the training part takes part in the search; the held-out part is not used here.
  X_train, _, y_train, _ = train_test_split(X, Y, shuffle= True , test_size = test_size, random_state = 0)

  # Options shared by both search flavours.
  shared_options = {
    "scoring": all_grid_params["scoring"],
    "cv": all_grid_params["cv"],
    "verbose": all_grid_params["verbose"],
    "n_jobs": all_grid_params["n_jobs"],
  }

  if randomize:
    print("Randomized grid search started.")
    # Seed the sampler (same seed as the split above) so that repeated runs
    # evaluate the same candidates.
    clf = RandomizedSearchCV(estimator = estimator, param_distributions = all_grid_params["param_grid"]
                             , n_iter = n_iter_search, random_state = 0, **shared_options)
  else:
    print("Full grid search started.")
    clf = GridSearchCV(estimator = estimator, param_grid = all_grid_params["param_grid"], **shared_options)

  grid_result = clf.fit(X_train, y_train)

  print('The best parameters for ' + model_name + ' is: ')
  print(grid_result.best_estimator_)

  if len(additional_description)>0:
    print(additional_description)
  return ''

In [None]:
def get_best_models(all_features, all_labels, features_name, use_svm= True, use_random_forest= True
                    , use_linear_svc= True, use_logistic_regression= True, use_GussianNB= True, k_fold = 10
                    , randomize = False, n_iter_search = 100):
  """Run a hyper-parameter search for every enabled model family.

  Each enabled model is searched with ``grid_search_train_test`` using the
  ``f1_weighted`` scorer, ``k_fold`` as ``cv``, ``verbose=1`` and ``n_jobs=-1``.
  A failing search is reported (message plus exception arguments) and the
  remaining models are still processed.

  Args:
    all_features: feature matrix.
    all_labels: target labels.
    features_name: description of the feature set, printed with each result.
    use_svm, use_random_forest, use_linear_svc, use_logistic_regression,
      use_GussianNB: switches enabling the search for SVC, RandomForestClassifier,
      LinearSVC, LogisticRegression and GaussianNB respectively.
    k_fold: value used as ``cv`` for each search.
    randomize: use a randomized search instead of an exhaustive grid search.
    n_iter_search: number of sampled candidates for the randomized search.

  Returns:
    None. Results are printed by ``grid_search_train_test``.
  """
  additional_description = 'With the features : '+ features_name

  # (enabled, estimator class, parameter grid) in the order the searches run.
  # ``class_weight`` uses the None object for "no weighting": the strings
  # 'dict' and 'None' used before are not valid values, so every candidate
  # containing them failed to fit and was never really evaluated.
  # NOTE(review): some penalty/loss (LinearSVC) and penalty/solver
  # (LogisticRegression) combinations below are not supported by scikit-learn;
  # such candidates fail to fit and are excluded by the search.
  searches = [
    (use_svm, SVC,
     {'kernel':('poly', 'rbf', 'sigmoid'), 'C':[1, 10], "degree":[3,5]
      , "class_weight":(None,'balanced'), "decision_function_shape": ('ovo', 'ovr') }),
    (use_random_forest, RandomForestClassifier,
     {'criterion':('gini','entropy'), 'n_estimators':[50, 75, 100, 125, 150, 175, 200]
      , "class_weight":(None,'balanced','balanced_subsample')}),
    (use_linear_svc, LinearSVC,
     {'penalty':('l1','l2'), 'C':[0.5, 1, 1.5, 2, 2.5], "multi_class":("ovr","crammer_singer")
      , "loss":('hinge','squared_hinge'), "class_weight":(None, "balanced")
      ,"max_iter":[1000, 1200]}),
    (use_logistic_regression, LogisticRegression,
     {'penalty':('l1','l2'), 'C':[0.5, 0.75 ,1]
      , "solver":('newton-cg', 'sag', 'saga')
      , "class_weight":["balanced"]
      ,"max_iter":[1000, 1200], "multi_class":('ovr', 'multinomial')
      }),
    (use_GussianNB, GaussianNB, {}),
  ]

  for enabled, estimator_class, param_grid in searches:
    if not enabled:
      continue

    all_grid_params = {
      "estimator": estimator_class(),   # estimator object to tune
      "param_grid": param_grid,         # hyper-parameter values to try
      "scoring": "f1_weighted",         # evaluation metric
      "cv" : k_fold,                    # cross-validation setting for each candidate
      "verbose": 1,                     # print progress while fitting
      "n_jobs" : -1                     # -1 uses all available processors
    }

    try:
      grid_search_train_test(all_grid_params = all_grid_params, X= all_features, Y= all_labels, test_size = 0.2
                             , additional_description = additional_description, randomize = randomize
                             , n_iter_search = n_iter_search)
    except Exception as error:
      # Keep going with the next model, but show what went wrong instead of
      # hiding it behind a bare ``except``.
      print('An error accured when using grid_search_train_test for ' + estimator_class.__name__ + ' model.')
      print(error.args)

## Create Feature Extractor Object

In [None]:
# Build the feature extractor from the configured paths and tokenize the
# claims and headlines with its NLTK-based tokenizer.
psf_extractor = FeatureExtractor(
    dataset_path = dataset_path,
    stopWord_path = stopWord_path,
    polarity_dataset_path = polarity_dataset_path,
    stanford_models_path = stanford_models_path,
    use_google_drive = True,
    important_words = [
        '؟',
        'تکذیب',
        'تکذیب شد',
        ':',
    ],
)

tokens_claims , tokens_headlines = psf_extractor.nltk_tokenize()

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/ImportantNLPFiles/StopWords_fa.txt
(2029,) (2029,) (2029,) (2029,) (2029,)


In [None]:
# Reshape the extractor's labels into a column array of shape (n_samples, 1)
# and display that shape.
labels = np.reshape(psf_extractor.labels,(len(psf_extractor.labels),1))
labels.shape

(2029, 1)

In [None]:
# Show the scorer names registered in scikit-learn (candidates for the `scoring` argument).
# NOTE(review): newer scikit-learn releases replace `metrics.SCORERS` with
# `metrics.get_scorer_names()` — confirm against the installed version.
metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

# TF-IDF With Other Features

## Extract Feature

In [None]:
# Feature set for this section: the bow, w2v and polarity options are switched
# off; cached vectors are loaded from `save_load_path` when they exist.
features, features_name = psf_extractor.generate_Features(
    w2v_model_path = w2v_model_path,
    save_path = save_load_path,
    save_feature = True,
    load_path = save_load_path,
    load_if_exist = True,
    bow = False,
    w2v = False,
    polarity = False,
)

features.shape

Features loaded successfully.


(2029, 728)

## Grid Search

In [None]:
# Search SVC, RandomForestClassifier and LinearSVC (LogisticRegression and
# GaussianNB are switched off) with the default 10-fold setting.
get_best_models(
    features,
    labels,
    features_name,
    use_logistic_regression = False,
    use_GussianNB = False,
)

The best parameters for SVC is: 
SVC(C=10, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
With the features : tfidf_similarity_important_words_more_than2_parts_root_distance_
The best parameters for RandomForestClassifier is: 
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=75, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)
With the f

In [None]:
# Search LogisticRegression only (every other model is switched off), 5 folds.
get_best_models(
    features,
    labels,
    features_name,
    use_svm = False,
    use_random_forest = False,
    use_linear_svc = False,
    use_GussianNB = False,
    k_fold = 5,
)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 200 tasks      | elapsed: 18.0min
[Parallel(n_jobs=-1)]: Done 450 tasks      | elapsed: 43.0min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 46.9min finished


The best parameters for LogisticRegression is: 
LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='ovr', n_jobs=None, penalty='l1',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)
With the features : tfidf_similarity_important_words_more_than2_parts_root_distance_


In [None]:
# Search GaussianNB only (every other model is switched off).
get_best_models(
    features,
    labels,
    features_name,
    use_svm = False,
    use_random_forest = False,
    use_linear_svc = False,
    use_logistic_regression = False,
)

The best parameters for GaussianNB is: 
GaussianNB(priors=None, var_smoothing=1e-09)
With the features : tfidf_similarity_important_words_more_than2_parts_root_distance_


# BOW With Other Features

## Extract Feature

In [None]:
# Feature set for this section: the tfidf, w2v and polarity options are switched
# off; cached vectors are loaded from `save_load_path` when they exist.
features, features_name = psf_extractor.generate_Features(
    w2v_model_path = w2v_model_path,
    save_path = save_load_path,
    save_feature = True,
    load_path = save_load_path,
    load_if_exist = True,
    tfidf = False,
    w2v = False,
    polarity = False,
)

features.shape

Features loaded successfully.


(2029, 20360)

## Grid Search

In [None]:
# Search SVC, RandomForestClassifier and LinearSVC (LogisticRegression and
# GaussianNB are switched off) with the default 10-fold setting.
get_best_models(
    features,
    labels,
    features_name,
    use_logistic_regression = False,
    use_GussianNB = False,
)

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 67.0min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 105.2min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 207.9min finished


The best parameters for SVC is: 
SVC(C=10, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
With the features : similarity_important_words_more_than2_parts_root_distance_bow_
Fitting 10 folds for each of 56 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 44.7min
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed: 44.9min finished


The best parameters for RandomForestClassifier is: 
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=75, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)
With the features : similarity_important_words_more_than2_parts_root_distance_bow_
Fitting 10 folds for each of 240 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   52.0s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed: 19.3min finished


The best parameters for LinearSVC is: 
LinearSVC(C=1, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1200,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
With the features : similarity_important_words_more_than2_parts_root_distance_bow_


In [None]:
# Randomized search (10 sampled candidates, 5 folds) for LogisticRegression
# only; every other model is switched off.
get_best_models(
    features,
    labels,
    features_name,
    use_svm = False,
    use_random_forest = False,
    use_linear_svc = False,
    use_GussianNB = False,
    k_fold = 5,
    randomize = True,
    n_iter_search = 10,
)

Randomized grid search started.
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 382.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 398.3min finished


The best parameters for LogisticRegression is: 
LogisticRegression(C=0.75, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=1000, multi_class='multinomial', n_jobs=None,
                   penalty='l1', random_state=None, solver='saga', tol=0.0001,
                   verbose=0, warm_start=False)
With the features : similarity_important_words_is_question_more_than2_parts_root_distance_bow_


In [None]:


# Search GaussianNB only (every other model is switched off).
get_best_models(
    features,
    labels,
    features_name,
    use_svm = False,
    use_random_forest = False,
    use_linear_svc = False,
    use_logistic_regression = False,
)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.9s finished


The best parameters for GaussianNB is: 
GaussianNB(priors=None, var_smoothing=1e-09)
With the features : similarity_important_words_more_than2_parts_root_distance_bow_


# W2V With Other Features

## Extract Feature

In [None]:
# Feature set for this section: the tfidf, bow and polarity options are switched
# off; cached vectors are loaded from `save_load_path` when they exist.
features, features_name = psf_extractor.generate_Features(
    w2v_model_path = w2v_model_path,
    save_path = save_load_path,
    save_feature = True,
    load_path = save_load_path,
    load_if_exist = True,
    tfidf = False,
    bow = False,
    polarity = False,
)

features.shape

Features vector file is not exist.
Start to generate similarity feature
End of similarity feature
Start to generate important words feature
End of important words feature
"is question" feature was added.
"more than tow parts" feature was added.
Start to generate root distance feature
Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': '/content/drive/My Drive/Stance Detection Paper/persian_stance_baseline_data/fa_seraji_models/fa_seraji_tokenizer.pt', 'lang': 'fa', 'shorthand': 'fa_seraji', 'mode': 'predict'}
---
Loading: mwt
With settings: 
{'model_path': '/content/drive/My Drive/Stance Detection Paper/persian_stance_baseline_data/fa_seraji_models/fa_seraji_mwt_expander.pt', 'lang': 'fa', 'shorthand': 'fa_seraji', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
---
Loading: pos
With settings: 
{'model_path': '/content/drive/My Drive/Stance Detection Paper/persian_stance_baselin

(2029, 310)

## Grid Search

In [None]:
# Search SVC, RandomForestClassifier and LinearSVC (LogisticRegression and
# GaussianNB are switched off) with the default 10-fold setting.
get_best_models(
    features,
    labels,
    features_name,
    use_logistic_regression = False,
    use_GussianNB = False,
)

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 243 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  5.2min finished


The best parameters for SVC is: 
SVC(C=10, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
With the features : similarity_important_words_is_question_more_than2_parts_root_distance_w2v_
Fitting 10 folds for each of 56 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 424 tasks      | elapsed: 15.2min
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed: 15.3min finished


The best parameters for RandomForestClassifier is: 
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=175, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)
With the features : similarity_important_words_is_question_more_than2_parts_root_distance_w2v_
Fitting 10 folds for each of 240 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 305 tasks      | elapsed: 29.6min
[Parallel(n_jobs=-1)]: Done 851 tasks      | elapsed: 68.6min
[Parallel(n_jobs=-1)]: Done 2230 tasks      | elapsed: 184.9min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed: 190.1min finished


The best parameters for LinearSVC is: 
LinearSVC(C=1, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1200,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
With the features : similarity_important_words_is_question_more_than2_parts_root_distance_w2v_


In [None]:
# Search LogisticRegression and GaussianNB (the SVM and random-forest models
# are switched off) with 5 folds.
get_best_models(
    features,
    labels,
    features_name,
    use_svm = False,
    use_random_forest = False,
    use_linear_svc = False,
    use_GussianNB = True,
    k_fold = 5,
)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 212 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 462 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 20.2min finished


The best parameters for LogisticRegression is: 
LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)
With the features : similarity_important_words_is_question_more_than2_parts_root_distance_w2v_
Fitting 5 folds for each of 1 candidates, totalling 5 fits
The best parameters for GaussianNB is: 
GaussianNB(priors=None, var_smoothing=1e-09)
With the features : similarity_important_words_is_question_more_than2_parts_root_distance_w2v_


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


In [None]:
# Stand-alone randomized search for SVC on the current feature set.
all_grid_params = {
  "estimator": SVC(),
  # ``class_weight`` uses the None object for "no weighting"; the strings
  # 'dict' and 'None' are not valid values and made those candidates fail.
  "param_grid": {'kernel':('poly', 'rbf', 'sigmoid'), 'C':[1, 10], "degree":[3,5]
                , "class_weight":(None,'balanced'), "decision_function_shape": ('ovo', 'ovr') },
  "scoring": "f1_weighted",
  "cv" : 10,
  "verbose": 1,
  "n_jobs" : -1
}
# RandomizedSearchCV
# grid_search_train_test() has no ``search_type`` parameter (passing it raised a
# TypeError that the bare ``except`` hid); the randomized search is selected
# with ``randomize = True``. n_iter_search is kept below the number of grid
# candidates so the search actually samples instead of trying every combination.
try:
  result = grid_search_train_test(all_grid_params = all_grid_params, X= features, Y= labels, test_size = 0.2
                          , additional_description = 'ssdd', randomize = True, n_iter_search = 10)
except Exception as error:
  print('An error accured when using grid_search_train_test for SVC model.')
  print(error.args)

An error accured when using grid_search_train_test for SVC model.


# W2V With TfIdf and Other Features

## Extract Feature

In [None]:
# Feature set for this section: only the bow and polarity options are switched
# off; cached vectors are loaded from `save_load_path` when they exist.
features, features_name = psf_extractor.generate_Features(
    w2v_model_path = w2v_model_path,
    save_path = save_load_path,
    save_feature = True,
    load_path = save_load_path,
    load_if_exist = True,
    bow = False,
    polarity = False,
)

features.shape

Features loaded successfully.


(2029, 1028)

## Grid Search

In [None]:
# Search SVC, RandomForestClassifier and LinearSVC (LogisticRegression and
# GaussianNB are switched off) with the default 10-fold setting.
get_best_models(
    features,
    labels,
    features_name,
    use_logistic_regression = False,
    use_GussianNB = False,
)

Fitting 10 folds for each of 72 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 243 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:  8.0min finished


The best parameters for SVC is: 
SVC(C=10, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
With the features : tfidf_similarity_important_words_is_question_more_than2_parts_root_distance_w2v_
Fitting 10 folds for each of 56 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 390 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed: 10.1min finished


The best parameters for RandomForestClassifier is: 
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=125,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
With the features : tfidf_similarity_important_words_is_question_more_than2_parts_root_distance_w2v_
Fitting 10 folds for each of 240 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Done 764 tasks      | elapsed: 74.6min
[Parallel(n_jobs=-1)]: Done 1687 tasks      | elapsed: 251.9min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed: 453.6min finished


The best parameters for LinearSVC is: 
LinearSVC(C=0.5, class_weight='balanced', dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.0001, verbose=0)
With the features : tfidf_similarity_important_words_is_question_more_than2_parts_root_distance_w2v_


In [None]:
# Search LogisticRegression and GaussianNB (the SVM and random-forest models
# are switched off) with 5 folds.
get_best_models(
    features,
    labels,
    features_name,
    use_svm = False,
    use_random_forest = False,
    use_linear_svc = False,
    use_GussianNB = True,
    k_fold = 5,
)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  62 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 212 tasks      | elapsed: 28.2min
[Parallel(n_jobs=-1)]: Done 462 tasks      | elapsed: 69.0min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 71.0min finished


The best parameters for LogisticRegression is: 
LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='ovr', n_jobs=None, penalty='l1',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)
With the features : tfidf_similarity_important_words_is_question_more_than2_parts_root_distance_w2v_
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


The best parameters for GaussianNB is: 
GaussianNB(priors=None, var_smoothing=1e-09)
With the features : tfidf_similarity_important_words_is_question_more_than2_parts_root_distance_w2v_
