# Perfect Classifier

Here we build a model that predicts the majority class for each note, to get a
sense of how good a model we could possibly build.

In [1]:
import pandas as pd
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from collections import OrderedDict
from tqdm import tqdm
import nltk
import numpy as np
from sklearn.linear_model import LogisticRegression


from core_functions import evaluate_model



In [2]:
# Load the train / val / test tables from the processed-data folder, in that order.
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/training_data.csv',
        '../data/processed/val_data.csv',
        '../data/processed/test_data.csv',
    )
)

In [3]:
# Registry of splits, kept in train -> val -> test order. Each entry holds two views:
#   'notes'         - the first 22 columns with duplicate rows removed and a fresh index
#   'notes_ratings' - the full frame exactly as loaded
# NOTE(review): presumably the first 22 columns are the note-level fields - confirm
# against the processed CSV schema before relying on the hard-coded 22.
datasets = OrderedDict(
    (
        split_name,
        {
            'notes': split_frame.iloc[:, :22].drop_duplicates().reset_index(drop=True),
            'notes_ratings': split_frame,
        },
    )
    for split_name, split_frame in (
        ('train', train_data),
        ('val', val_data),
        ('test', test_data),
    )
)

In [5]:

# Per-note fraction for the training split: the sum of 'output' divided by the
# number of rating rows for that note.
# NOTE(review): presumably 'output' is a 0/1 helpfulness label - confirm.
train_ratings_by_note = train_data.groupby('noteId')
train_frac_helpful = train_ratings_by_note.agg(
    frac_helpful=('output', lambda ratings: ratings.sum() / len(ratings))
)
train_frac_helpful.reset_index()

Unnamed: 0,noteId,frac_helpful
0,1352796878438424576,0.571429
1,1353415873227177985,0.777778
2,1354602688097308676,1.000000
3,1354630645385846789,0.888889
4,1354635010423328769,1.000000
...,...,...
8856,1453101343430881296,0.333333
8857,1453111715743346692,0.636364
8858,1453123162892296200,0.000000
8859,1453131867545161733,1.000000


In [8]:
# Score the per-note majority-class predictor on every split and stash the
# predictions, scores and metrics back on that split's entry in `datasets`.
for name, data_dict in datasets.items():
    print(name)
    note_ratings = data_dict['notes_ratings']

    # Per-note fraction: sum of 'output' over the number of rating rows for the note.
    note_frac_helpful = (note_ratings.groupby('noteId')
                         .agg(frac_helpful=('output', lambda y: y.sum() / len(y)))
                         .reset_index())

    # Join on the note id explicitly. Without `on=`, merge() joins on every column
    # name the two frames share, so an unrelated same-named column in the ratings
    # frame would silently become part of the join key.
    note_ratings = note_ratings.merge(note_frac_helpful, on='noteId')

    # The per-note fraction doubles as the score; a note is predicted as class 1
    # only when strictly more than half of its ratings are 1 (a 0.5 tie gives 0).
    p_hat = note_ratings['frac_helpful']
    predictions = (p_hat > 0.5).astype(int)
    observations = note_ratings['output']
    scaling = note_ratings['scaling']

    # evaluate_model comes from core_functions; presumably it returns a dict of
    # metrics tagged with `name` - confirm against its definition.
    performance_dict = evaluate_model(predictions, p_hat, observations,
                                      scaling, name)
    data_dict['Y_hat'] = predictions
    data_dict['p_hat'] = p_hat
    data_dict['performance'] = performance_dict

train
val
test


In [9]:
# Collect each split's stored 'performance' entry, in registry order, into one
# table with a row per split.
performance_metrics = [split_entry['performance'] for split_entry in datasets.values()]
performance_metrics_df = pd.DataFrame(performance_metrics)
performance_metrics_df

Unnamed: 0,accuracy,f1,precision,recall,roc-auc,avg_precision,name
0,0.824039,0.861729,0.870714,0.852927,0.9144,0.950134,train
1,0.816166,0.854324,0.858143,0.850538,0.907101,0.943484,val
2,0.827049,0.86343,0.875387,0.851796,0.919277,0.953597,test
