In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional
from keras.optimizers import Adam

In [2]:
from data_preparation import *

In [3]:
from feature_extraction_methods import *

In [4]:
from vanilla_models import *

In [5]:
def split_train_test(data, feats, target):
    X = data[feats]
    y = data[target]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 53)
    
    return X_train, X_test, y_train, y_test

In [6]:
raw_data = pd.read_csv('train_test_data/train.csv')
raw_data.shape

(159571, 8)

In [7]:
target = 'toxic_flag'

In [8]:
raw_data.isnull().sum().sum()

0

In [9]:
raw_data_cleaned = data_preparation(raw_data)

In [10]:
# raw_data_cleaned = pd.read_csv('intermediate_data/undersampled_intermediate_data_final.csv')

In [11]:
print(raw_data_cleaned.shape)
raw_data_cleaned.head()

(30624, 3)


Unnamed: 0,id,comment_text_cleaned,toxic_flag
4,0001d958c54c6e35,"you, sir, are my hero. any chance you remember...",0
7,00031b1e95af7921,your vandalism to the matt shirvington article...,0
23,000c6a3f0cd3ba8e,""" the signpost: september read this signpos...",0
27,000ffab30195c5e1,"yes, because the mother of the child in the ca...",0
48,001cadfd324f8087,""" as for your claims of """"stalking"""", that is ...",0


In [12]:
# data_use, feats_use = universal_sentence_encoder_func(raw_data_cleaned)

In [13]:
data_use = pd.read_csv("intermediate_data/universal_sentence_encoder_features.csv")

In [14]:
feats_use = ['feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6', 'feat_7', 'feat_8', 'feat_9', 'feat_10', 'feat_11', 'feat_12', 'feat_13', 'feat_14', 'feat_15', 'feat_16', 'feat_17', 'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22', 'feat_23', 'feat_24', 'feat_25', 'feat_26', 'feat_27', 'feat_28', 'feat_29', 'feat_30', 'feat_31', 'feat_32', 'feat_33', 'feat_34', 'feat_35', 'feat_36', 'feat_37', 'feat_38', 'feat_39', 'feat_40', 'feat_41', 'feat_42', 'feat_43', 'feat_44', 'feat_45', 'feat_46', 'feat_47', 'feat_48', 'feat_49', 'feat_50', 'feat_51', 'feat_52', 'feat_53', 'feat_54', 'feat_55', 'feat_56', 'feat_57', 'feat_58', 'feat_59', 'feat_60', 'feat_61', 'feat_62', 'feat_63', 'feat_64', 'feat_65', 'feat_66', 'feat_67', 'feat_68', 'feat_69', 'feat_70', 'feat_71', 'feat_72', 'feat_73', 'feat_74', 'feat_75', 'feat_76', 'feat_77', 'feat_78', 'feat_79', 'feat_80', 'feat_81', 'feat_82', 'feat_83', 'feat_84', 'feat_85', 'feat_86', 'feat_87', 'feat_88', 'feat_89', 'feat_90', 'feat_91', 'feat_92', 'feat_93', 'feat_94', 'feat_95', 'feat_96', 'feat_97', 'feat_98', 'feat_99', 'feat_100', 'feat_101', 'feat_102', 'feat_103', 'feat_104', 'feat_105', 'feat_106', 'feat_107', 'feat_108', 'feat_109', 'feat_110', 'feat_111', 'feat_112', 'feat_113', 'feat_114', 'feat_115', 'feat_116', 'feat_117', 'feat_118', 'feat_119', 'feat_120', 'feat_121', 'feat_122', 'feat_123', 'feat_124', 'feat_125', 'feat_126', 'feat_127', 'feat_128', 'feat_129', 'feat_130', 'feat_131', 'feat_132', 'feat_133', 'feat_134', 'feat_135', 'feat_136', 'feat_137', 'feat_138', 'feat_139', 'feat_140', 'feat_141', 'feat_142', 'feat_143', 'feat_144', 'feat_145', 'feat_146', 'feat_147', 'feat_148', 'feat_149', 'feat_150', 'feat_151', 'feat_152', 'feat_153', 'feat_154', 'feat_155', 'feat_156', 'feat_157', 'feat_158', 'feat_159', 'feat_160', 'feat_161', 'feat_162', 'feat_163', 'feat_164', 'feat_165', 'feat_166', 'feat_167', 'feat_168', 'feat_169', 'feat_170', 'feat_171', 'feat_172', 'feat_173', 'feat_174', 'feat_175', 'feat_176', 'feat_177', 'feat_178', 'feat_179', 'feat_180', 'feat_181', 'feat_182', 'feat_183', 'feat_184', 'feat_185', 'feat_186', 'feat_187', 'feat_188', 'feat_189', 'feat_190', 'feat_191', 'feat_192', 'feat_193', 'feat_194', 'feat_195', 'feat_196', 'feat_197', 'feat_198', 'feat_199', 'feat_200', 'feat_201', 'feat_202', 'feat_203', 'feat_204', 'feat_205', 'feat_206', 'feat_207', 'feat_208', 'feat_209', 'feat_210', 'feat_211', 'feat_212', 'feat_213', 'feat_214', 'feat_215', 'feat_216', 'feat_217', 'feat_218', 'feat_219', 'feat_220', 'feat_221', 'feat_222', 'feat_223', 'feat_224', 'feat_225', 'feat_226', 'feat_227', 'feat_228', 'feat_229', 'feat_230', 'feat_231', 'feat_232', 'feat_233', 'feat_234', 'feat_235', 'feat_236', 'feat_237', 'feat_238', 'feat_239', 'feat_240', 'feat_241', 'feat_242', 'feat_243', 'feat_244', 'feat_245', 'feat_246', 'feat_247', 'feat_248', 'feat_249', 'feat_250', 'feat_251', 'feat_252', 'feat_253', 'feat_254', 'feat_255', 'feat_256', 'feat_257', 'feat_258', 'feat_259', 'feat_260', 'feat_261', 'feat_262', 'feat_263', 'feat_264', 'feat_265', 'feat_266', 'feat_267', 'feat_268', 'feat_269', 'feat_270', 'feat_271', 'feat_272', 'feat_273', 'feat_274', 'feat_275', 'feat_276', 'feat_277', 'feat_278', 'feat_279', 'feat_280', 'feat_281', 'feat_282', 'feat_283', 'feat_284', 'feat_285', 'feat_286', 'feat_287', 'feat_288', 'feat_289', 'feat_290', 'feat_291', 'feat_292', 'feat_293', 'feat_294', 'feat_295', 'feat_296', 'feat_297', 'feat_298', 'feat_299', 'feat_300', 'feat_301', 'feat_302', 'feat_303', 'feat_304', 'feat_305', 'feat_306', 'feat_307', 'feat_308', 'feat_309', 'feat_310', 'feat_311', 'feat_312', 'feat_313', 'feat_314', 'feat_315', 'feat_316', 'feat_317', 'feat_318', 'feat_319', 'feat_320', 'feat_321', 'feat_322', 'feat_323', 'feat_324', 'feat_325', 'feat_326', 'feat_327', 'feat_328', 'feat_329', 'feat_330', 'feat_331', 'feat_332', 'feat_333', 'feat_334', 'feat_335', 'feat_336', 'feat_337', 'feat_338', 'feat_339', 'feat_340', 'feat_341', 'feat_342', 'feat_343', 'feat_344', 'feat_345', 'feat_346', 'feat_347', 'feat_348', 'feat_349', 'feat_350', 'feat_351', 'feat_352', 'feat_353', 'feat_354', 'feat_355', 'feat_356', 'feat_357', 'feat_358', 'feat_359', 'feat_360', 'feat_361', 'feat_362', 'feat_363', 'feat_364', 'feat_365', 'feat_366', 'feat_367', 'feat_368', 'feat_369', 'feat_370', 'feat_371', 'feat_372', 'feat_373', 'feat_374', 'feat_375', 'feat_376', 'feat_377', 'feat_378', 'feat_379', 'feat_380', 'feat_381', 'feat_382', 'feat_383', 'feat_384', 'feat_385', 'feat_386', 'feat_387', 'feat_388', 'feat_389', 'feat_390', 'feat_391', 'feat_392', 'feat_393', 'feat_394', 'feat_395', 'feat_396', 'feat_397', 'feat_398', 'feat_399', 'feat_400', 'feat_401', 'feat_402', 'feat_403', 'feat_404', 'feat_405', 'feat_406', 'feat_407', 'feat_408', 'feat_409', 'feat_410', 'feat_411', 'feat_412', 'feat_413', 'feat_414', 'feat_415', 'feat_416', 'feat_417', 'feat_418', 'feat_419', 'feat_420', 'feat_421', 'feat_422', 'feat_423', 'feat_424', 'feat_425', 'feat_426', 'feat_427', 'feat_428', 'feat_429', 'feat_430', 'feat_431', 'feat_432', 'feat_433', 'feat_434', 'feat_435', 'feat_436', 'feat_437', 'feat_438', 'feat_439', 'feat_440', 'feat_441', 'feat_442', 'feat_443', 'feat_444', 'feat_445', 'feat_446', 'feat_447', 'feat_448', 'feat_449', 'feat_450', 'feat_451', 'feat_452', 'feat_453', 'feat_454', 'feat_455', 'feat_456', 'feat_457', 'feat_458', 'feat_459', 'feat_460', 'feat_461', 'feat_462', 'feat_463', 'feat_464', 'feat_465', 'feat_466', 'feat_467', 'feat_468', 'feat_469', 'feat_470', 'feat_471', 'feat_472', 'feat_473', 'feat_474', 'feat_475', 'feat_476', 'feat_477', 'feat_478', 'feat_479', 'feat_480', 'feat_481', 'feat_482', 'feat_483', 'feat_484', 'feat_485', 'feat_486', 'feat_487', 'feat_488', 'feat_489', 'feat_490', 'feat_491', 'feat_492', 'feat_493', 'feat_494', 'feat_495', 'feat_496', 'feat_497', 'feat_498', 'feat_499', 'feat_500', 'feat_501', 'feat_502', 'feat_503', 'feat_504', 'feat_505', 'feat_506', 'feat_507', 'feat_508', 'feat_509', 'feat_510', 'feat_511', 'feat_512']
len(feats_use)

512

In [15]:
X_train_use, X_test_use, y_train_use, y_test_use = split_train_test(data_use, feats_use, target)
X_train_use.shape, X_test_use.shape, y_train_use.shape, y_test_use.shape

((24472, 512), (6118, 512), (24472,), (6118,))

In [16]:
%%time
result = main_function(X_train_use, X_test_use, y_train_use, y_test_use, 'Universal Sentence Encoder')

svc object created
svc object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 26min 42s


In [17]:
result

Unnamed: 0,feature_extraction_method,model,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_accuuracy,test_accuracy
0,Universal Sentence Encoder,Vanilla Support Vector Classifier,0.629469,0.646593,0.708372,0.697894,0.666594,0.671265,0.625082,0.633867
0,Universal Sentence Encoder,Vanilla Random Forest Classifier,0.62965,0.646728,0.712465,0.702777,0.668502,0.673589,0.626144,0.635175
0,Universal Sentence Encoder,Vanilla XG Boost Classifier,0.680845,0.701217,0.582252,0.580104,0.627701,0.634937,0.634562,0.642694
0,Universal Sentence Encoder,Basic ANN,0.649451,0.671756,0.621563,0.617638,0.635201,0.643561,0.622262,0.63354
0,Universal Sentence Encoder,Basic LSTM,0.618404,0.632461,0.749459,0.73726,0.677654,0.680851,0.622753,0.629781
0,Universal Sentence Encoder,Basic Bidirectional LSTM,0.691739,0.711462,0.49861,0.494355,0.579507,0.583363,0.617154,0.621772


In [18]:
# result.to_csv("intermediate_data/result_comparison.csv", index = False)

In [19]:
# result = pd.read_csv("intermediate_data/result_comparison.csv")

In [20]:
result_backup_1 = result.copy()

In [21]:
# result_backup

In [22]:
# data_sent_tr, feats_sent_tr = sentence_transformer_func(raw_data_cleaned)

In [23]:
data_sent_tr = pd.read_csv("intermediate_data/sentence_transformer_features.csv")

In [24]:
# data_sent_tr.head()

In [25]:
feats_sent_tr = ['feat_1', 'feat_2', 'feat_3', 'feat_4', 'feat_5', 'feat_6', 'feat_7', 'feat_8', 'feat_9', 'feat_10', 'feat_11', 'feat_12', 'feat_13', 'feat_14', 'feat_15', 'feat_16', 'feat_17', 'feat_18', 'feat_19', 'feat_20', 'feat_21', 'feat_22', 'feat_23', 'feat_24', 'feat_25', 'feat_26', 'feat_27', 'feat_28', 'feat_29', 'feat_30', 'feat_31', 'feat_32', 'feat_33', 'feat_34', 'feat_35', 'feat_36', 'feat_37', 'feat_38', 'feat_39', 'feat_40', 'feat_41', 'feat_42', 'feat_43', 'feat_44', 'feat_45', 'feat_46', 'feat_47', 'feat_48', 'feat_49', 'feat_50', 'feat_51', 'feat_52', 'feat_53', 'feat_54', 'feat_55', 'feat_56', 'feat_57', 'feat_58', 'feat_59', 'feat_60', 'feat_61', 'feat_62', 'feat_63', 'feat_64', 'feat_65', 'feat_66', 'feat_67', 'feat_68', 'feat_69', 'feat_70', 'feat_71', 'feat_72', 'feat_73', 'feat_74', 'feat_75', 'feat_76', 'feat_77', 'feat_78', 'feat_79', 'feat_80', 'feat_81', 'feat_82', 'feat_83', 'feat_84', 'feat_85', 'feat_86', 'feat_87', 'feat_88', 'feat_89', 'feat_90', 'feat_91', 'feat_92', 'feat_93', 'feat_94', 'feat_95', 'feat_96', 'feat_97', 'feat_98', 'feat_99', 'feat_100', 'feat_101', 'feat_102', 'feat_103', 'feat_104', 'feat_105', 'feat_106', 'feat_107', 'feat_108', 'feat_109', 'feat_110', 'feat_111', 'feat_112', 'feat_113', 'feat_114', 'feat_115', 'feat_116', 'feat_117', 'feat_118', 'feat_119', 'feat_120', 'feat_121', 'feat_122', 'feat_123', 'feat_124', 'feat_125', 'feat_126', 'feat_127', 'feat_128', 'feat_129', 'feat_130', 'feat_131', 'feat_132', 'feat_133', 'feat_134', 'feat_135', 'feat_136', 'feat_137', 'feat_138', 'feat_139', 'feat_140', 'feat_141', 'feat_142', 'feat_143', 'feat_144', 'feat_145', 'feat_146', 'feat_147', 'feat_148', 'feat_149', 'feat_150', 'feat_151', 'feat_152', 'feat_153', 'feat_154', 'feat_155', 'feat_156', 'feat_157', 'feat_158', 'feat_159', 'feat_160', 'feat_161', 'feat_162', 'feat_163', 'feat_164', 'feat_165', 'feat_166', 'feat_167', 'feat_168', 'feat_169', 'feat_170', 'feat_171', 'feat_172', 'feat_173', 'feat_174', 'feat_175', 'feat_176', 'feat_177', 'feat_178', 'feat_179', 'feat_180', 'feat_181', 'feat_182', 'feat_183', 'feat_184', 'feat_185', 'feat_186', 'feat_187', 'feat_188', 'feat_189', 'feat_190', 'feat_191', 'feat_192', 'feat_193', 'feat_194', 'feat_195', 'feat_196', 'feat_197', 'feat_198', 'feat_199', 'feat_200', 'feat_201', 'feat_202', 'feat_203', 'feat_204', 'feat_205', 'feat_206', 'feat_207', 'feat_208', 'feat_209', 'feat_210', 'feat_211', 'feat_212', 'feat_213', 'feat_214', 'feat_215', 'feat_216', 'feat_217', 'feat_218', 'feat_219', 'feat_220', 'feat_221', 'feat_222', 'feat_223', 'feat_224', 'feat_225', 'feat_226', 'feat_227', 'feat_228', 'feat_229', 'feat_230', 'feat_231', 'feat_232', 'feat_233', 'feat_234', 'feat_235', 'feat_236', 'feat_237', 'feat_238', 'feat_239', 'feat_240', 'feat_241', 'feat_242', 'feat_243', 'feat_244', 'feat_245', 'feat_246', 'feat_247', 'feat_248', 'feat_249', 'feat_250', 'feat_251', 'feat_252', 'feat_253', 'feat_254', 'feat_255', 'feat_256', 'feat_257', 'feat_258', 'feat_259', 'feat_260', 'feat_261', 'feat_262', 'feat_263', 'feat_264', 'feat_265', 'feat_266', 'feat_267', 'feat_268', 'feat_269', 'feat_270', 'feat_271', 'feat_272', 'feat_273', 'feat_274', 'feat_275', 'feat_276', 'feat_277', 'feat_278', 'feat_279', 'feat_280', 'feat_281', 'feat_282', 'feat_283', 'feat_284', 'feat_285', 'feat_286', 'feat_287', 'feat_288', 'feat_289', 'feat_290', 'feat_291', 'feat_292', 'feat_293', 'feat_294', 'feat_295', 'feat_296', 'feat_297', 'feat_298', 'feat_299', 'feat_300', 'feat_301', 'feat_302', 'feat_303', 'feat_304', 'feat_305', 'feat_306', 'feat_307', 'feat_308', 'feat_309', 'feat_310', 'feat_311', 'feat_312', 'feat_313', 'feat_314', 'feat_315', 'feat_316', 'feat_317', 'feat_318', 'feat_319', 'feat_320', 'feat_321', 'feat_322', 'feat_323', 'feat_324', 'feat_325', 'feat_326', 'feat_327', 'feat_328', 'feat_329', 'feat_330', 'feat_331', 'feat_332', 'feat_333', 'feat_334', 'feat_335', 'feat_336', 'feat_337', 'feat_338', 'feat_339', 'feat_340', 'feat_341', 'feat_342', 'feat_343', 'feat_344', 'feat_345', 'feat_346', 'feat_347', 'feat_348', 'feat_349', 'feat_350', 'feat_351', 'feat_352', 'feat_353', 'feat_354', 'feat_355', 'feat_356', 'feat_357', 'feat_358', 'feat_359', 'feat_360', 'feat_361', 'feat_362', 'feat_363', 'feat_364', 'feat_365', 'feat_366', 'feat_367', 'feat_368', 'feat_369', 'feat_370', 'feat_371', 'feat_372', 'feat_373', 'feat_374', 'feat_375', 'feat_376', 'feat_377', 'feat_378', 'feat_379', 'feat_380', 'feat_381', 'feat_382', 'feat_383', 'feat_384']
len(feats_sent_tr)

384

In [26]:
X_train_sent_tr, X_test_sent_tr, y_train_sent_tr, y_test_sent_tr = split_train_test(data_sent_tr, feats_sent_tr, target)
X_train_sent_tr.shape, X_test_sent_tr.shape, y_train_sent_tr.shape, y_test_sent_tr.shape

((24472, 384), (6118, 384), (24472,), (6118,))

In [27]:
%%time
result = main_function(X_train_sent_tr, X_test_sent_tr, y_train_sent_tr, y_test_sent_tr, 'Sentence Transformer')

svc object created
svc object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 23min 36s


In [28]:
result

Unnamed: 0,feature_extraction_method,model,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_accuuracy,test_accuracy
0,Sentence Transformer,Vanilla Support Vector Classifier,0.62928,0.647755,0.725363,0.71773,0.673914,0.68095,0.628596,0.639752
0,Sentence Transformer,Vanilla Random Forest Classifier,0.629213,0.648433,0.726599,0.719866,0.674409,0.682285,0.6288,0.640896
0,Sentence Transformer,Vanilla XG Boost Classifier,0.629179,0.64853,0.726753,0.720171,0.674455,0.682475,0.6288,0.641059
0,Sentence Transformer,Basic ANN,0.61869,0.633752,0.763902,0.754043,0.68367,0.688685,0.625981,0.634848
0,Sentence Transformer,Basic LSTM,0.625508,0.639807,0.737102,0.726884,0.676735,0.680571,0.627411,0.634521
0,Sentence Transformer,Basic Bidirectional LSTM,0.623833,0.639452,0.747992,0.740922,0.680294,0.686457,0.628024,0.637463


In [29]:
result_backup_2 = result.copy()

In [30]:
# data_tf_idf, feats_tf_idf = tf_idf_func(raw_data_cleaned)

In [31]:
data_tf_idf = pd.read_csv("intermediate_data/tf_idf_featured_data.csv")

In [32]:
data_tf_idf.head()

Unnamed: 0,about,account,action,actually,add,added,adding,address,admin,administrator,...,wrote,yeah,year,yes,yet,you,yourself,yourselfgo,id,toxic_flag
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0009eaea3325de8c,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,000cfee90f50d471,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,00151a9f93c6b059,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,00316bcc0d1bc6e0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0038d1dc2ad29469,0


In [33]:
feats_tf_idf = list(data_tf_idf.columns[:-2])
len(feats_tf_idf)

500

In [34]:
X_train_tf_idf, X_test_tf_idf, y_train_tf_idf, y_test_tf_idf = split_train_test(data_tf_idf, feats_tf_idf, target)
X_train_tf_idf.shape, X_test_tf_idf.shape, y_train_tf_idf.shape, y_test_tf_idf.shape

((24472, 500), (6118, 500), (24472,), (6118,))

In [35]:
%%time
result = main_function(X_train_tf_idf, X_test_tf_idf, y_train_tf_idf, y_test_tf_idf, 'TF-IDF')

svc object created
svc object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 27min 40s


In [36]:
result

Unnamed: 0,feature_extraction_method,model,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_accuuracy,test_accuracy
0,TF-IDF,Vanilla Support Vector Classifier,0.948367,0.883646,0.926321,0.8343,0.937214,0.858264,0.934333,0.852403
0,TF-IDF,Vanilla Random Forest Classifier,0.972876,0.875402,0.988956,0.829722,0.98085,0.85195,0.979568,0.845538
0,TF-IDF,Vanilla XG Boost Classifier,0.953154,0.913799,0.829703,0.786085,0.887155,0.845144,0.888321,0.845701
0,TF-IDF,Basic ANN,0.961179,0.855081,0.992431,0.849863,0.976555,0.852464,0.974788,0.842432
0,TF-IDF,Basic LSTM,0.946053,0.877756,0.90068,0.82606,0.922809,0.851124,0.920276,0.845211
0,TF-IDF,Basic Bidirectional LSTM,0.927804,0.850031,0.933967,0.844065,0.930875,0.847037,0.92661,0.836711


In [37]:
result_backup_3 = result.copy()

In [38]:
# data_hl, feats_hl = hurt_lex_func(raw_data_cleaned)

In [39]:
data_hl = pd.read_csv("intermediate_data/hurtlex_features_undersampled.csv")

In [40]:
data_hl.head()

Unnamed: 0,id,comment_text_cleaned,toxic_flag,keep_flag,temp_label,ps_conservative,pa_conservative,ddf_conservative,ddp_conservative,asf_conservative,...,om_conservative,qas_conservative,psps_inclusive,paps_inclusive,ddfps_inclusive,ddpps_inclusive,asfps_inclusive,prps_inclusive,omps_inclusive,qasps_inclusive
0,0009801bd85e5806,The Mitsurugi point made no sense - why not ar...,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0011cc71398479c4,How could I post before the block expires? The...,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00128363e367d703,Not sure about a heading of 'Fight for Freedom...,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,001ffdcc3e7fb49c,Awesome! Then I'll simply disregard your notic...,0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0022cf8467ebc9fd,"A Bisexual, like a homosexual or a heterosexua...",0,1,1.0,0.0,0.0,0.0,0.0,0.0,...,0.006757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006757,0.0


In [41]:
feats_hl = list(data_hl.columns[3:])
len(feats_hl)

18

In [42]:
X_train_hl, X_test_hl, y_train_hl, y_test_hl = split_train_test(data_hl, feats_hl, target)
X_train_hl.shape, X_test_hl.shape, y_train_hl.shape, y_test_hl.shape

((24376, 18), (6094, 18), (24376,), (6094,))

In [43]:
%%time
result = main_function(X_train_hl, X_test_hl, y_train_hl, y_test_hl, 'HurtLex')

svc object created
svc object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 14min 15s


In [44]:
result

Unnamed: 0,feature_extraction_method,model,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_accuuracy,test_accuracy
0,HurtLex,Vanilla Support Vector Classifier,0.530809,0.539219,1.0,1.0,0.693501,0.70064,0.530809,0.539219
0,HurtLex,Vanilla Random Forest Classifier,0.926323,0.79558,0.654919,0.56969,0.767329,0.663948,0.789178,0.689038
0,HurtLex,Vanilla XG Boost Classifier,0.854757,0.83451,0.589458,0.575472,0.69774,0.681196,0.728914,0.70955
0,HurtLex,Basic ANN,0.792594,0.806491,0.597187,0.604991,0.681153,0.691358,0.703233,0.70873
0,HurtLex,Basic LSTM,0.812819,0.825011,0.578252,0.588253,0.675759,0.6868,0.705448,0.710699
0,HurtLex,Basic Bidirectional LSTM,0.845809,0.85817,0.538836,0.54504,0.658295,0.666667,0.703069,0.706104


In [45]:
result_backup_4 = result.copy()

In [46]:
# data_p_api, feats_p_api = p_api_func(raw_data_cleaned)

In [47]:
data_p_api = pd.read_csv("intermediate_data/p_api_features_undersampled.csv")

In [48]:
data_p_api.head()

Unnamed: 0,id,comment_text_cleaned,toxic_flag,keep_flag,temp_label,TOXICITY,SEVERE_TOXICITY,IDENTITY_ATTACK,INSULT,PROFANITY,THREAT
0,0015f4aa35ebe9b5,pretty much everyone from warren county/surrou...,0,1,1.0,0.060385,0.024885,0.05363,0.050093,0.020538,0.057124
1,00169857adbc989b,"Hi Explicit, can you block O Fenian for edit-w...",0,1,1.0,0.338045,0.143365,0.271697,0.441597,0.063597,0.178338
2,001cadfd324f8087,""" As for your claims of """"stalking"""", that is ...",0,1,1.0,0.38668,0.133477,0.150032,0.48511,0.145672,0.056157
3,0030614cfd96d9d1,"In the same direction, is it really necessary ...",0,1,1.0,0.088925,0.033666,0.135616,0.072282,0.039141,0.055976
4,00316bcc0d1bc6e0,", December (UTC) You must not play Metal Gea...",0,1,1.0,0.331113,0.102389,0.066497,0.40259,0.091098,0.161816


In [49]:
feats_p_api = list(data_p_api.columns[-6:])
print(feats_p_api)

['TOXICITY', 'SEVERE_TOXICITY', 'IDENTITY_ATTACK', 'INSULT', 'PROFANITY', 'THREAT']


In [50]:
X_train_p_api, X_test_p_api, y_train_p_api, y_test_p_api = split_train_test(data_p_api, feats_p_api, target)
X_train_p_api.shape, X_test_p_api.shape, y_train_p_api.shape, y_test_p_api.shape

((24580, 6), (6145, 6), (24580,), (6145,))

In [51]:
%%time
result = main_function(X_train_p_api, X_test_p_api, y_train_p_api, y_test_p_api, 'Perspective API')

svc object created
svc object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 12min 55s


In [52]:
result

Unnamed: 0,feature_extraction_method,model,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_accuuracy,test_accuracy
0,Perspective API,Vanilla Support Vector Classifier,0.940409,0.940199,0.946587,0.946776,0.943488,0.943476,0.940317,0.9393
0,Perspective API,Vanilla Random Forest Classifier,0.999923,0.941892,1.0,0.941606,0.999961,0.941749,0.999959,0.937673
0,Perspective API,Vanilla XG Boost Classifier,0.968512,0.940494,0.974801,0.937348,0.971647,0.938919,0.970057,0.934744
0,Perspective API,Basic ANN,0.943683,0.944884,0.94164,0.943735,0.94266,0.944309,0.939707,0.940439
0,Perspective API,Basic LSTM,0.928534,0.927939,0.95911,0.955596,0.943574,0.941564,0.939626,0.936534
0,Perspective API,Basic Bidirectional LSTM,0.947809,0.951062,0.936307,0.939781,0.942023,0.945388,0.939341,0.941904


In [53]:
result_backup_5 = result.copy()

In [54]:
# data_bert, feats_bert = bert_func(raw_data_cleaned)

In [55]:
data_bert = pd.read_csv("intermediate_data/model_input_data_bert_embedding_final.csv")

In [56]:
data_bert.head()

Unnamed: 0,id,comment_text_cleaned,toxic_flag,keep_flag,temp_label,feat_1,feat_2,feat_3,feat_4,feat_5,...,feat_759,feat_760,feat_761,feat_762,feat_763,feat_764,feat_765,feat_766,feat_767,feat_768
0,0015f4aa35ebe9b5,pretty much everyone from warren county/surrou...,0,1,1.0,0.019962,-0.25195,-0.303824,-0.214446,-0.179983,...,0.3274,-0.174128,0.292991,-0.049813,0.237916,-0.443509,0.100461,0.128288,-0.069559,0.238108
1,00169857adbc989b,"Hi Explicit, can you block O Fenian for edit-w...",0,1,1.0,-0.402784,0.145996,-0.379184,-0.277948,-0.151444,...,0.471309,-0.129264,0.126192,-0.274693,-0.047576,-0.498724,0.32937,0.022788,-0.051143,-0.196208
2,001cadfd324f8087,""" As for your claims of """"stalking"""", that is ...",0,1,1.0,0.380203,-0.043334,0.045272,-0.049082,0.021741,...,0.131988,0.066152,-0.086741,-0.198166,-0.053383,-0.469908,0.218493,0.055765,0.008645,-0.396121
3,0030614cfd96d9d1,"In the same direction, is it really necessary ...",0,1,1.0,-0.402784,0.145996,-0.379184,-0.277948,-0.151444,...,0.471309,-0.129264,0.126192,-0.274693,-0.047576,-0.498724,0.32937,0.022788,-0.051143,-0.196208
4,00316bcc0d1bc6e0,", December (UTC) You must not play Metal Gea...",0,1,1.0,0.145714,-0.089117,0.026449,0.003911,-0.066716,...,0.441608,-0.049409,-0.12736,-0.276097,0.085446,-0.468982,0.226885,0.070861,0.022175,0.060069


In [57]:
feats_bert = list(data_bert.columns[5:])
len(feats_bert)

768

In [58]:
X_train_bert, X_test_bert, y_train_bert, y_test_bert = split_train_test(data_bert, feats_bert, target)
X_train_bert.shape, X_test_bert.shape, y_train_bert.shape, y_test_bert.shape

((24580, 768), (6145, 768), (24580,), (6145,))

In [59]:
%%time
result = main_function(X_train_bert, X_test_bert, y_train_bert, y_test_bert, 'BERT')

svc object created
svc object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 34min 1s


In [60]:
result

Unnamed: 0,feature_extraction_method,model,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_accuuracy,test_accuracy
0,BERT,Vanilla Support Vector Classifier,0.591788,0.597247,0.835511,0.831204,0.69284,0.695066,0.61009,0.609764
0,BERT,Vanilla Random Forest Classifier,0.591862,0.597201,0.835433,0.830596,0.692865,0.694823,0.610171,0.609601
0,BERT,Vanilla XG Boost Classifier,0.591873,0.597201,0.835356,0.830596,0.692845,0.694823,0.610171,0.609601
0,BERT,Basic ANN,0.592504,0.597173,0.827317,0.822384,0.690494,0.691914,0.609642,0.608137
0,BERT,Basic LSTM,0.591599,0.596518,0.828476,0.823297,0.690281,0.691797,0.608706,0.607486
0,BERT,Basic Bidirectional LSTM,0.590956,0.597174,0.838448,0.835462,0.693276,0.696501,0.60952,0.610415


In [61]:
result_backup_6 = result.copy()

In [62]:
%%time
data_dcv, feats_dcv = doc_to_vec_func(raw_data_cleaned)

Wall time: 1min 1s


In [63]:
print(feats_dcv)
data_dcv.shape

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'var_6', 'var_7', 'var_8', 'var_9', 'var_10', 'var_11', 'var_12', 'var_13', 'var_14', 'var_15', 'var_16', 'var_17', 'var_18', 'var_19', 'var_20']


(30624, 24)

In [64]:
# data_dcv = pd.read_csv("intermediate_data/doc2vec_features_undersampled.csv")

In [65]:
# data_dcv.head()

In [66]:
# feats_dcv = list(data_dcv.columns[4:])
# len(feats_dcv)

In [67]:
X_train_dcv, X_test_dcv, y_train_dcv, y_test_dcv = split_train_test(data_dcv, feats_dcv, target)
X_train_dcv.shape, X_test_dcv.shape, y_train_dcv.shape, y_test_dcv.shape

((24499, 20), (6125, 20), (24499,), (6125,))

In [68]:
%%time
result = main_function(X_train_dcv, X_test_dcv, y_train_dcv, y_test_dcv, 'Doc2vec')

svc object created
svc object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 15min 7s


In [69]:
result

Unnamed: 0,feature_extraction_method,model,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_accuuracy,test_accuracy
0,Doc2vec,Vanilla Support Vector Classifier,0.628421,0.534987,0.856911,0.736137,0.725092,0.619646,0.656721,0.515755
0,Doc2vec,Vanilla Random Forest Classifier,1.0,0.545082,1.0,0.648385,1.0,0.592263,1.0,0.521633
0,Doc2vec,Vanilla XG Boost Classifier,0.820622,0.547089,0.904504,0.621268,0.860524,0.581823,0.845096,0.521469
0,Doc2vec,Basic ANN,0.528307,0.535837,1.0,1.0,0.691363,0.697778,0.528307,0.535837
0,Doc2vec,Basic LSTM,0.528307,0.535837,1.0,1.0,0.691363,0.697778,0.528307,0.535837
0,Doc2vec,Basic Bidirectional LSTM,0.528307,0.535837,1.0,1.0,0.691363,0.697778,0.528307,0.535837


In [70]:
result_backup_7 = result.copy()

In [74]:
# %%time
# data_empath, feats_empath = empath_func(raw_data_cleaned)

In [75]:
data_empath = pd.read_csv("intermediate_data/empath_features_balanced.csv")

In [76]:
print(data_empath.shape)
data_empath.head()

(30615, 23)


Unnamed: 0,id,comment_text_cleaned,toxic_flag,toxicity,severe_toxicity,threat,identity_hate,violence,valuable,hate,...,weakness,horror,swearing_terms,kill,exasperation,body,ridicule,disgust,anger,rage
0,0000997932d777bf,explanation why the edits made under my userna...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0005300084f90edc,""" fair use rationale for image:wonju.jpg thank...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000897889268bc93,redirect talk:voydan pop georgiev- chernodrinski,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,000c0dfd995809fa,""" snowflakes are not always symmetrical! unde...",0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,001363e1dbe91225,i was able to post the above list so quickly b...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
feats_empath = list(data_empath.columns[3:])
len(feats_empath)

20

In [79]:
print(feats_empath)

['toxicity', 'severe_toxicity', 'threat', 'identity_hate', 'violence', 'valuable', 'hate', 'aggression', 'anticipation', 'crime', 'weakness', 'horror', 'swearing_terms', 'kill', 'exasperation', 'body', 'ridicule', 'disgust', 'anger', 'rage']


In [81]:
X_train_empath, X_test_empath, y_train_empath, y_test_empath = split_train_test(data_empath, feats_empath, target)
X_train_empath.shape, X_test_empath.shape, y_train_empath.shape, y_test_empath.shape

((24492, 20), (6123, 20), (24492,), (6123,))

In [82]:
%%time
result = main_function(X_train_empath, X_test_empath, y_train_empath, y_test_empath, 'Empath')

svc object created
svc object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
random forest classifier object created
random forest classifier object fitted
train prediction done
test prediction done
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 14min 41s


In [83]:
result

Unnamed: 0,feature_extraction_method,model,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_accuuracy,test_accuracy
0,Empath,Vanilla Support Vector Classifier,0.870991,0.859163,0.469548,0.464526,0.610161,0.603017,0.682631,0.673363
0,Empath,Vanilla Random Forest Classifier,0.944119,0.819188,0.534697,0.475229,0.682732,0.60151,0.737139,0.663727
0,Empath,Vanilla XG Boost Classifier,0.902196,0.842473,0.50413,0.479205,0.646826,0.610916,0.708803,0.674016
0,Empath,Basic ANN,0.865095,0.854821,0.470243,0.466361,0.609291,0.603482,0.680998,0.672709
0,Empath,Basic LSTM,0.897926,0.899143,0.424392,0.417125,0.576371,0.569877,0.670015,0.663727
0,Empath,Basic Bidirectional LSTM,0.898755,0.890052,0.423466,0.415902,0.575686,0.566903,0.669811,0.660624


In [84]:
result_backup_8 = result.copy()

In [85]:
result_final = pd.concat([result_backup_1, result_backup_2, result_backup_3, result_backup_4, result_backup_5, result_backup_6, result_backup_7, result_backup_8], axis = 0)

In [87]:
result_final.to_csv("intermediate_data/result_comparison.csv", index = False)

In [88]:
result_final

Unnamed: 0,feature_extraction_method,model,train_precision,test_precision,train_recall,test_recall,train_f1,test_f1,train_accuuracy,test_accuracy
0,Universal Sentence Encoder,Vanilla Support Vector Classifier,0.629469,0.646593,0.708372,0.697894,0.666594,0.671265,0.625082,0.633867
0,Universal Sentence Encoder,Vanilla Random Forest Classifier,0.62965,0.646728,0.712465,0.702777,0.668502,0.673589,0.626144,0.635175
0,Universal Sentence Encoder,Vanilla XG Boost Classifier,0.680845,0.701217,0.582252,0.580104,0.627701,0.634937,0.634562,0.642694
0,Universal Sentence Encoder,Basic ANN,0.649451,0.671756,0.621563,0.617638,0.635201,0.643561,0.622262,0.63354
0,Universal Sentence Encoder,Basic LSTM,0.618404,0.632461,0.749459,0.73726,0.677654,0.680851,0.622753,0.629781
0,Universal Sentence Encoder,Basic Bidirectional LSTM,0.691739,0.711462,0.49861,0.494355,0.579507,0.583363,0.617154,0.621772
0,Sentence Transformer,Vanilla Support Vector Classifier,0.62928,0.647755,0.725363,0.71773,0.673914,0.68095,0.628596,0.639752
0,Sentence Transformer,Vanilla Random Forest Classifier,0.629213,0.648433,0.726599,0.719866,0.674409,0.682285,0.6288,0.640896
0,Sentence Transformer,Vanilla XG Boost Classifier,0.629179,0.64853,0.726753,0.720171,0.674455,0.682475,0.6288,0.641059
0,Sentence Transformer,Basic ANN,0.61869,0.633752,0.763902,0.754043,0.68367,0.688685,0.625981,0.634848
