In [None]:
#This notebook shows example code for saving the confidence values of your model as a NumPy array.
#As an example, we use Logistic Regression with TF-IDF features.

In [None]:
!pip install sklearn
import numpy as np
import pandas as pd 
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

In [None]:
# Specify the required info here, then run the notebook to save the confidence values of your model.
modelName = 'name_of_your_model'
resPath = 'define_the_path_for_results'
# NOTE: file names are appended to data_folder by plain string concatenation,
# so the value must end with a path separator (e.g. 'data/').
data_folder = 'define_the_path_where_you_keep_the_confidence_values_of_your_model_and_the_datasets'
dataToTrain = 'name_of_training_set.csv'
dataToVal = 'name_of_validation_set.csv'
dataToTest ='name_of_test_set.csv'
ground_truth_column = 'specify_the_column_for_ground_truth_in_your_csv_files'
txt = 'specify_the_column_for_text_in_your_csv_files'
datasetName = 'name_of_your_dataset'
# Passed to TfidfVectorizer(max_features=...). None keeps every term (the
# scikit-learn default); set a positive int to cap the vocabulary size and
# the width of the dense feature matrices.
maxTfIdfFeat = None

In [None]:
# Classifier whose predicted probabilities are exported as confidences.
model = LogisticRegression()

# File names of the saved confidence arrays, one per split:
# <datasetName>_<split>_<modelName>_conf.npy
_name_parts = (datasetName, modelName)
confidences_train = '{}_train_{}_conf.npy'.format(*_name_parts)
confidences_val = '{}_val_{}_conf.npy'.format(*_name_parts)
confidences_test = '{}_test_{}_conf.npy'.format(*_name_parts)

In [None]:
# Load the three splits. Rows containing any missing value are dropped and
# the index is renumbered from 0 afterwards.
train = pd.read_csv(data_folder + dataToTrain).dropna().reset_index(drop=True)
val = pd.read_csv(data_folder + dataToVal).dropna().reset_index(drop=True)
test = pd.read_csv(data_folder + dataToTest).dropna().reset_index(drop=True)

# Vectorize the text: TF-IDF over word unigrams.
# NOTE(review): with lowercase=False, capitalized stop words (e.g. 'The') are
# presumably not matched by the English stop list - confirm this is intended.
tfidf = TfidfVectorizer(
    # An integer min_df must be >= 1 in recent scikit-learn; 1 keeps every
    # term that occurs at all, which is what the former value 0 did.
    min_df=1,
    max_features=maxTfIdfFeat,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    # Real booleans: recent scikit-learn rejects ints for boolean parameters.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
    lowercase=False,
)

def prepare_features(tfidf, X_train, setN):
    """Turn raw texts into a dense TF-IDF feature matrix.

    Args:
        tfidf: vectorizer exposing ``fit_transform`` and ``transform``.
        X_train: the texts to vectorize (used for every split, despite the name).
        setN: split name; ``'train'`` fits the vectorizer while transforming,
            any other value only transforms with the current vectorizer state.

    Returns:
        The result of calling ``.toarray()`` on the vectorizer output.
    """
    vectorize = tfidf.fit_transform if setN == 'train' else tfidf.transform
    return vectorize(X_train).toarray()

# Build the feature matrices. The 'train' call must run first: it is the only
# one that fits the vectorizer; 'val' and 'test' only call transform.
X_train = prepare_features(tfidf, X_train=train[txt].tolist(), setN='train')
X_val = prepare_features(tfidf, X_train=val[txt].tolist(), setN='val')
X_test = prepare_features(tfidf, X_train=test[txt].tolist(), setN='test')

# Ground-truth labels for each split, taken from the configured column.
y_train, y_val, y_test = (
    split[ground_truth_column].values for split in (train, val, test)
)

In [None]:
# Fit the classifier on the training-split features and labels.
model.fit(X_train, y_train)

In [None]:
# Predicted class probabilities ("confidences") for each split, computed on
# that split's own features. Columns follow the order of model.classes_.
conf_train = model.predict_proba(X_train)
conf_val = model.predict_proba(X_val)
conf_test = model.predict_proba(X_test)

# Save one .npy array per split into data_folder, alongside the datasets.
np.save(data_folder + confidences_train, conf_train)
np.save(data_folder + confidences_val, conf_val)
np.save(data_folder + confidences_test, conf_test)