In [1]:
# Data visualization
import matplotlib.pyplot as plt

# Data manipulation
import pandas as pd
import numpy as np

import glob, re, os, sys, random
from random import shuffle
import random
from time import time

# Vector representations and embeddings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Modeling - Logistic, XGBOOST, SVM
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.pipeline import Pipeline, FeatureUnion
import pickle

from gensim.parsing.preprocessing import preprocess_string, remove_stopwords


In [2]:
import sys
# Make the project's helper modules (../../src/python) importable from this notebook.
sys.path.append('../../src/python')

from balance_split_data import create_label, balance_unique_id, \
create_balanced_excluded, create_train_test_excluded, group_by_case, create_feature_label

from grid_search import gridsearch, fit_best_model_train, evaluate, fit_best_model_test, get_feature_importance_cv

from ml_model import train_model_cross_val, train_model_test, \
get_feature_importance, get_feature_importance10

### Load data

In [3]:
# Read the pre-processed, merged dataset (JSON) into a DataFrame.
df = pd.read_json(
    r"../../data/processed/pre-processed_merged_2023_04_04.json"
)

In [4]:
# Lowercase the cleaned text; the 'text_clean' column is overwritten.
lowercased_text = df['text_clean'].str.lower()
df['text_clean'] = lowercased_text

In [5]:
# # remove stopwords
# preprocess_function = [lambda x: x, remove_stopwords]
# df['text_clean'] = df['text_clean'].apply(lambda x: " ".join(preprocess_string(str(x), preprocess_function)))

### Balance data and split into train and test sets

In [6]:
# Experiment configuration.
label_name = "wc"  # passed to create_label() to build the target label

# One seed drives both the balancing step (random_seed) and the
# train/test split (random_state).
random_seed = random_state = 42

In [7]:
# Build the label from `label_name`.
df1 = create_label(df, label_name)

# Balance the classes on unique ids; rows left out of the balanced sample
# are returned separately as `df_excluded`.
df_unique = balance_unique_id(df1)
(
    df_balanced_unique,
    df_balanced,
    df_excluded,
) = create_balanced_excluded(df_unique, df1, random_seed=random_seed)

# Split into train / test frames. Per the helper's printed log, df_test1 is
# the balanced test split and df_test is that split concatenated with
# df_excluded.
(
    df_train,
    df_test,
    df_test1,
) = create_train_test_excluded(
    df_balanced,
    df_balanced_unique,
    df_excluded,
    random_state=random_state,
)

# Group every frame by case, using the cleaned text column.
(
    df_train_grouped,
    df_test_grouped,
    df_test1_grouped,
    df_excluded_grouped,
) = group_by_case(
    df_train,
    df_test,
    df_test1,
    df_excluded,
    text='text_clean',
)

Total decisions: 1574
0    1313
1     261
Name: label, dtype: int64
Balancing...
Total decisions: 522
Labels distribution: 
 0    261
1    261
Name: 0, dtype: int64
Training set shape: (417,) (417,)
Test set shape: (105,) (105,)
Creating df_train 1: 208 0: 209
Creating df_test1 1: 53 0: 52
Creating df_test concatenated with df_excluded with len: 1052
Creating df_test 1: 53 0: 1104
Grouping df_train by case_num 1: 208 0: 209
Grouping df_test by case_num 1: 53 0: 1104
Grouping df_test1 by case_num 1: 53 0: 52
Grouping df_excluded by case_num with len: 1052


In [8]:
# Turn each grouped frame into a (features, labels) pair:
#   X_train / y_train   <- df_train_grouped
#   X_test  / y_test    <- df_test_grouped
#   X_test1 / y_test1   <- df_test1_grouped
#   X_test2 / y_test2   <- df_excluded_grouped
(
    X_train, y_train,
    X_test, y_test,
    X_test1, y_test1,
    X_test2, y_test2,
) = create_feature_label(
    df_train_grouped,
    df_test_grouped,
    df_test1_grouped,
    df_excluded_grouped,
    text='text_clean',
)

### Hyperparameter tuning with GridSearch Cross Validation

In [9]:
# Define model.
# LinearSVC shuffles the data during dual coordinate descent (dual=True), so
# the shared `random_state` is passed to keep the shuffling seeded across runs.
model = LinearSVC(random_state=random_state)

# Create a pipeline with TfidfVectorizer and Model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', max_features=5000, stop_words='english')),
    ('clf', model),
])

# Define the grid of hyperparameters to search over
# (6 * 3 * 2 * 2 * 2 * 3 = 432 candidate combinations).
parameters = {
    'tfidf__ngram_range': [(1,2),(1,1),(1,3), (2,2),(2,3), (3,3)], # limit to trigrams
    #'tfidf__analyzer': ('word', 'char'),
    # 'tfidf__lowercase': (True, False),
    'tfidf__max_df': [0.01, 0.025, 0.05], # (0.01, 1.0), # ignore terms that appear in more than this fraction of documents
    # 'tfidf__min_df': (1, 2, 3), # ignore terms that appear in fewer than (1, 2, 3) documents
    'tfidf__use_idf': (False, True), # use inverse document frequency weighting
    #'tfidf__sublinear_tf': (False, True),
    'tfidf__binary': (False, True), # set term frequency binary (all non-zero terms are set to 1)
    'tfidf__norm': ('l1', 'l2'), # norm used to normalize term vectors
    # 'tfidf__max_features': (None, 2000, 5000),
    # 'tfidf__stop_words': (None, 'english'),

    # 'clf__solver': ['liblinear', 'lbfgs'],
    'clf__C': (0.1, 1, 5), # regularization parameter; regularization strength is inversely proportional to C
}

In [10]:
# 5-fold cross-validated grid search over `parameters`, scored on F1.
# NOTE: slow, the recorded run over 432 candidates took roughly 5300 s.
grid_search, best_parameters = gridsearch(
    pipeline,
    parameters,
    X_train,
    y_train,
    cv=5,
    scoring='f1',
)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
done in 5291.554s
Best cross-validation score:  0.7479811961660147
Best parameters set:
	clf__C: 0.1
	tfidf__binary: True
	tfidf__max_df: 0.05
	tfidf__ngram_range: (1, 3)
	tfidf__norm: 'l2'
	tfidf__use_idf: True


In [None]:
# Fitting 5 folds for each of 432 candidates, totalling 2160 fits
# done in 5291.554s
# Best cross-validation score:  0.7479811961660147
# Best parameters set:
# 	clf__C: 0.1
# 	tfidf__binary: True
# 	tfidf__max_df: 0.05
# 	tfidf__ngram_range: (1, 3)
# 	tfidf__norm: 'l2'
# 	tfidf__use_idf: True

In [22]:
# grid_search, best_parameters = gridsearch(pipeline, parameters, X_train, y_train, cv=5, scoring='f1')

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
done in 5628.176s
Best cross-validation score:  0.7765947755745944
Best parameters set:
	clf__C: 1
	tfidf__binary: True
	tfidf__max_df: 0.025
	tfidf__ngram_range: (2, 2)
	tfidf__norm: 'l1'
	tfidf__use_idf: False


In [None]:
# Fitting 5 folds for each of 432 candidates, totalling 2160 fits
# done in 5628.176s
# Best cross-validation score:  0.7765947755745944
# Best parameters set:
# 	clf__C: 1
# 	tfidf__binary: True
# 	tfidf__max_df: 0.025
# 	tfidf__ngram_range: (2, 2)
# 	tfidf__norm: 'l1'
# 	tfidf__use_idf: False

### Model fitting and evaluation

In [11]:
# Fit the model on the training data with the best grid-search parameters
# (cv=5); the helper prints the evaluation report and returns the fitted
# pipeline together with its predictions.
pipeline_cv, y_predict_cv = fit_best_model_train(
    X_train,
    y_train,
    model,
    best_parameters,
    cv=5,
)

fitting the best model
Accuracy: 0.7464114832535885

Classification report:
               precision    recall  f1-score   support

           0       0.75      0.75      0.75       209
           1       0.75      0.75      0.75       209

    accuracy                           0.75       418
   macro avg       0.75      0.75      0.75       418
weighted avg       0.75      0.75      0.75       418


CR: (0.7464114832535885, 0.7464114832535885, 0.7464114832535886, None)

Confusion matrix:
 [[156  53]
 [ 53 156]] 

_______________________


Recall: 0.746 
 Precision: 0.746 
 F1: 0.746 
 FPR: 0.254 
 Accuracy: 74.641 
 ROC_AUC: 0.746


Save the features and their coefficients from the pipeline fitted with the best GridSearchCV parameters into a dataframe, and export it to Excel.

In [13]:
# Collect the features and their importance values from the fitted pipeline.
df_features_cv = get_feature_importance_cv(pipeline_cv)

# DataFrame.to_excel does not create missing parent directories, so make sure
# the output folder exists before writing (avoids failing on a fresh checkout).
features_xlsx_path = '../../output/tables/features_svm_cv_full_wc_2.xlsx'
os.makedirs(os.path.dirname(features_xlsx_path), exist_ok=True)
df_features_cv.to_excel(features_xlsx_path, index=True)

# Preview the first rows as the cell output.
df_features_cv.head()

Unnamed: 0,feature,importance
81,active pharmaceutical,0.191774
3961,remedy proposed,0.189685
672,community dimension relevant,-0.186865
1410,doubt identified,0.186813
487,case relevant geographic,-0.17337


In [None]:
# Evaluate the fitted pipeline on X_test / y_test (built from df_test_grouped);
# the helper prints the evaluation report.
y_predict_cv_test = fit_best_model_test(
    X_test,
    y_test,
    pipeline_cv,
)

testing on test set
Accuracy: 0.5425623387790198

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.53      0.69      1110
           1       0.08      0.91      0.15        53

    accuracy                           0.54      1163
   macro avg       0.54      0.72      0.42      1163
weighted avg       0.95      0.54      0.66      1163


CR: (0.5374874297545105, 0.7154428012918579, 0.4197782329154569, None)

Confusion matrix:
 [[583 527]
 [  5  48]] 

_______________________


 Accuracy: 54.256 
 Precision: 0.083 
 Recall: 0.906 
 F1: 0.153 
 FPR: 0.475 
 ROC_AUC: 0.715


In [None]:
# Evaluate the fitted pipeline on X_test1 / y_test1 (built from
# df_test1_grouped); the helper prints the evaluation report.
y_predict_cv_test1 = fit_best_model_test(
    X_test1,
    y_test1,
    pipeline_cv,
)

testing on test set
Accuracy: 0.7142857142857143

Classification report:
               precision    recall  f1-score   support

           0       0.84      0.52      0.64        52
           1       0.66      0.91      0.76        53

    accuracy                           0.71       105
   macro avg       0.75      0.71      0.70       105
weighted avg       0.75      0.71      0.70       105


CR: (0.7506421232876712, 0.71244557329463, 0.7023809523809524, None)

Confusion matrix:
 [[27 25]
 [ 5 48]] 

_______________________


 Accuracy: 71.429 
 Precision: 0.658 
 Recall: 0.906 
 F1: 0.762 
 FPR: 0.481 
 ROC_AUC: 0.712
