In [1]:
# Data visualization
import matplotlib.pyplot as plt

# Data manipulation
import pandas as pd
import numpy as np

import glob, re, os, sys, random
from random import shuffle
import random
from time import time

# Vector representations and embeddings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim

# Modeling - Logistic, XGBOOST, SVM
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, roc_auc_score, auc, roc_curve

from sklearn.pipeline import Pipeline, FeatureUnion

from xgboost import XGBClassifier
import pickle


In [2]:
import sys

# Directory that presumably holds the local helper modules imported in this cell
# (balance_split_data, grid_search, ml_model) -- relative to the notebook's location.
PYTHON_SRC_DIR = '../../../python'

# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if PYTHON_SRC_DIR not in sys.path:
    sys.path.append(PYTHON_SRC_DIR)

from balance_split_data import create_label, balance_unique_id, \
create_balanced_excluded, create_train_test_excluded, group_by_case, create_feature_label

from grid_search import gridsearch, fit_best_model_train, evaluate, fit_best_model_test, get_feature_importance_cv

from ml_model import train_model_cross_val, train_model_test, \
get_feature_importance, get_feature_importance10

### Load data

In [3]:
# Load the pre-processed, merged dataset from its JSON export into a DataFrame.
df = pd.read_json(
    path_or_buf=r"../../../../data/processed/pre-processed_merged_2023_04_03.json",
)

### Balance data and split to train and test set

In [4]:
# Name of the target handed to create_label when the label column is built.
label_name = 'phase2'

In [6]:
# Attach the label for `label_name` to the raw frame.
df1 = create_label(df, label_name)

# Balance the two classes; per the printed log this leaves 98 decisions per label.
# Rows that do not make it into the balanced set are returned separately.
df_unique = balance_unique_id(df1)
(
    df_balanced_unique,
    df_balanced,
    df_excluded,
) = create_balanced_excluded(df_unique, df1, random_seed=42)

# Train/test split of the balanced data. Per the printed log, df_test1 is the
# balanced test split and df_test is that split concatenated with df_excluded.
(
    df_train,
    df_test,
    df_test1,
) = create_train_test_excluded(
    df_balanced,
    df_balanced_unique,
    df_excluded,
    random_state=42,
)

# Group every subset by case (the log reports grouping by case_num).
(
    df_train_grouped,
    df_test_grouped,
    df_test1_grouped,
    df_excluded_grouped,
) = group_by_case(df_train, df_test, df_test1, df_excluded)

Total decisions: 1583
0    1485
1      98
Name: label, dtype: int64
Balancing...
Total decisions: 196
Labels distribution: 
 0    98
1    98
Name: 0, dtype: int64
Training set shape: (157,) (157,)
Test set shape: (40,) (40,)
Creating df_train 1: 78 0: 79
Creating df_test1 1: 20 0: 20
Creating df_test concatenated with df_excluded with len: 1386
Creating df_test 1: 20 0: 1406
Grouping df_train by case_num 1: 78 0: 79
Grouping df_test by case_num 1: 20 0: 1406
Grouping df_test1 by case_num 1: 20 0: 20
Grouping df_excluded by case_num with len: 1386


In [7]:
# One (X, y) pair per grouped frame, unpacked in the order the frames are passed in.
# NOTE(review): X_test2 / y_test2 presumably come from df_excluded_grouped -- confirm in
# balance_split_data.create_feature_label.
(
    X_train, y_train,
    X_test, y_test,
    X_test1, y_test1,
    X_test2, y_test2,
) = create_feature_label(
    df_train_grouped,
    df_test_grouped,
    df_test1_grouped,
    df_excluded_grouped,
)

### Hyperparameter tuning with GridSearch Cross Validation

In [7]:
# Define model.
# random_state is pinned so the data shuffling done by LinearSVC's dual coordinate
# descent is repeatable between runs; 42 matches the seed used when balancing and
# splitting the data.
model = LinearSVC(random_state=42)

# Create a pipeline with TfidfVectorizer and Model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='word', max_features=5000, stop_words='english')),
    ('clf', model),
])

# Define the grid of hyperparameters to search over.
# Keys follow the Pipeline convention <step name>__<parameter name>.
parameters = {
    'tfidf__ngram_range': [(1,2),(1,1),(1,3), (2,2),(2,3), (3,3)], # limit to trigrams
    #'tfidf__analyzer': ('word', 'char'),
    #'tfidf__lowercase': (True, False),
    'tfidf__max_df': [0.01, 0.025, 0.05], # (0.01, 1.0), # ignore terms that appear in more than this fraction of the documents
    # 'tfidf__min_df': (1, 2, 3), # ignore terms that appear in fewer than (1, 2, 3) documents
    'tfidf__use_idf': (False, True), # use inverse document frequency weighting
    #'tfidf__sublinear_tf': (False, True),
    'tfidf__binary': (False, True), #set term frequency binary (all non-zero terms are set to 1)
    'tfidf__norm': ('l1', 'l2'), #norm used to normalize term vectors
    # 'tfidf__max_features': (None, 2000, 5000),
    #'tfidf__stop_words': (None, 'english'),

    # 'clf__solver': ['liblinear', 'lbfgs'],
    'clf__C':(0.1, 1, 5) # regularization parameter of the model (strength is inversely proportional to C)
}

In [9]:
# Search the `parameters` grid with 5-fold cross-validation, scored by F1
# (alternative scoring that was considered: roc_auc).
grid_search, best_parameters = gridsearch(
    pipeline,
    parameters,
    X_train,
    y_train,
    cv=5,
    scoring='f1',
)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
done in 6345.555s
Best cross-validation score:  0.9285604375231739
Best parameters set:
	clf__C: 0.1
	tfidf__binary: True
	tfidf__max_df: 0.05
	tfidf__ngram_range: (1, 2)
	tfidf__norm: 'l2'
	tfidf__use_idf: False


In [8]:
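# NOTE: superseded run, kept for reference only. The output retained below comes from a
# 3-fold search over a 576-candidate grid, which does not match the current `parameters`
# grid (432 candidates), so it is not reproduced by re-running this notebook.
# grid_search, best_parameters = gridsearch(pipeline, parameters, X_train, y_train, cv=3, scoring='f1') #roc_auc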

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
done in 5034.406s
Best cross-validation score:  0.9593286580466068
Best parameters set:
	clf__C: 5
	tfidf__binary: True
	tfidf__max_df: 0.05
	tfidf__ngram_range: (1, 2)
	tfidf__norm: 'l1'
	tfidf__use_idf: False


##### Model fitting and evaluation

In [11]:
# Fit the model with the grid-search winners on the training data (cv=5).
# Returns the fitted pipeline and the predictions for the training samples;
# the helper also prints the evaluation report shown in the output.
pipeline_cv, y_predict_cv = fit_best_model_train(
    X_train,
    y_train,
    model,
    best_parameters,
    cv=5,
)

fitting the best model
Accuracy: 0.9299363057324841

Classification report:
               precision    recall  f1-score   support

           0       0.91      0.95      0.93        79
           1       0.95      0.91      0.93        78

    accuracy                           0.93       157
   macro avg       0.93      0.93      0.93       157
weighted avg       0.93      0.93      0.93       157


CR: (0.930650406504065, 0.9298117494320026, 0.9298907968984695, None)

Confusion matrix:
 [[75  4]
 [ 7 71]] 

_______________________


 Accuracy: 92.994 
 Precision: 0.947 
 Recall: 0.910 
 F1: 0.928 
 FPR: 0.051 
 ROC_AUC: 0.930


In [15]:
# Print the evaluation report for the training-set predictions against y_train.
# NOTE(review): this output is identical to the report already printed by
# fit_best_model_train in the previous cell, so this cell looks redundant.
evaluate(y_train, y_predict_cv)

Accuracy: 0.9299363057324841

Classification report:
               precision    recall  f1-score   support

           0       0.91      0.95      0.93        79
           1       0.95      0.91      0.93        78

    accuracy                           0.93       157
   macro avg       0.93      0.93      0.93       157
weighted avg       0.93      0.93      0.93       157


CR: (0.930650406504065, 0.9298117494320026, 0.9298907968984695, None)

Confusion matrix:
 [[75  4]
 [ 7 71]] 

_______________________


 Accuracy: 92.994 
 Precision: 0.947 
 Recall: 0.910 
 F1: 0.928 
 FPR: 0.051 
 ROC_AUC: 0.930


Save the features and their coefficients, taken from the model fitted with the GridSearchCV best parameters, into a DataFrame and export it to Excel.

In [12]:
# Table of features and their importance values taken from the fitted pipeline.
df_features_cv = get_feature_importance_cv(pipeline_cv)

# Persist the full table (index included) for reporting, then preview the first rows.
df_features_cv.to_excel(
    excel_writer='../../../../output/tables/features_svm_cv_full_p2.xlsx',
    index=True,
)
df_features_cv.head(n=5)

Unnamed: 0,feature,importance
491,car light,-0.152551
3414,prescription,-0.145527
621,commercial vehicle,-0.142814
1002,currently controlled,-0.132636
2528,italy united,0.129468


In [16]:
# Score the fitted pipeline on the imbalanced test set (X_test / y_test);
# the helper prints the evaluation report and returns the predictions.
y_predict_cv_test = fit_best_model_test(
    X_test,
    y_test,
    pipeline_cv,
)

testing on test set
Accuracy: 0.9417952314165497

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97      1406
           1       0.16      0.75      0.27        20

    accuracy                           0.94      1426
   macro avg       0.58      0.85      0.62      1426
weighted avg       0.98      0.94      0.96      1426


CR: (0.5787696924231058, 0.8472617354196301, 0.6175918476803433, None)

Confusion matrix:
 [[1328   78]
 [   5   15]] 

_______________________


 Accuracy: 94.180 
 Precision: 0.161 
 Recall: 0.750 
 F1: 0.265 
 FPR: 0.055 
 ROC_AUC: 0.847


In [19]:
# Print the evaluation report for the test-set predictions against y_test.
# NOTE(review): this output is identical to the report already printed by
# fit_best_model_test in the previous cell, so this cell looks redundant.
evaluate(y_test, y_predict_cv_test)

Accuracy: 0.9417952314165497

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97      1406
           1       0.16      0.75      0.27        20

    accuracy                           0.94      1426
   macro avg       0.58      0.85      0.62      1426
weighted avg       0.98      0.94      0.96      1426


CR: (0.5787696924231058, 0.8472617354196301, 0.6175918476803433, None)

Confusion matrix:
 [[1328   78]
 [   5   15]] 

_______________________


 Accuracy: 94.180 
 Precision: 0.161 
 Recall: 0.750 
 F1: 0.265 
 FPR: 0.055 
 ROC_AUC: 0.847


In [20]:
# Score the fitted pipeline on the balanced test split (X_test1 / y_test1);
# the helper prints the evaluation report and returns the predictions.
y_predict_cv_test1 = fit_best_model_test(
    X_test1,
    y_test1,
    pipeline_cv,
)

testing on test set
Accuracy: 0.8

Classification report:
               precision    recall  f1-score   support

           0       0.77      0.85      0.81        20
           1       0.83      0.75      0.79        20

    accuracy                           0.80        40
   macro avg       0.80      0.80      0.80        40
weighted avg       0.80      0.80      0.80        40


CR: (0.803030303030303, 0.8, 0.7994987468671679, None)

Confusion matrix:
 [[17  3]
 [ 5 15]] 

_______________________


 Accuracy: 80.000 
 Precision: 0.833 
 Recall: 0.750 
 F1: 0.789 
 FPR: 0.150 
 ROC_AUC: 0.800
