# First Simple Model

In [55]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Read the five raw splits. Each file is JSON Lines (one JSON record per line),
# hence lines=True.
raw_paths = [
    '../data/raw/train.jsonl',
    '../data/raw/dev_seen.jsonl',
    '../data/raw/dev_unseen.jsonl',
    '../data/raw/test_seen.jsonl',
    '../data/raw/test_unseen.jsonl',
]
train_df, dev_seen_df, dev_unseen_df, test_seen_df, test_unseen_df = (
    pd.read_json(path, lines=True) for path in raw_paths
)

### Split

In [3]:
# Training split, keyed by 'id'.
# NOTE: this rebinds train_df, so running the cell a second time raises KeyError
# ('id' has already been moved from the columns into the index).
train_df = train_df.set_index('id')

# Validation split: pool dev_seen, dev_unseen and test_seen, keep only the first
# row for any id that occurs more than once, then key by 'id'.
val_df = (
    pd.concat([dev_seen_df, dev_unseen_df, test_seen_df])
    .drop_duplicates(subset='id')
    .set_index('id')
)

# Holdout split: test_unseen on its own, keyed by 'id'.
hold_df = test_unseen_df.set_index('id')

In [34]:
# Collect every id from the three splits; no id may occur twice,
# whether inside one split or across splits.
ids = [*train_df.index, *val_df.index, *hold_df.index]
assert len(set(ids)) == len(ids)

# Report the total count and each split's share of it.
print(f"There are {len(ids)} images.")
print(f"Train size:\t{len(train_df) / len(ids): .2%}")
print(f"Validation size:{len(val_df) / len(ids): .2%}")
print(f"Holdout size:\t{len(hold_df) / len(ids): .2%}")

There are 12140 images.
Train size:	 70.02%
Validation size: 13.51%
Holdout size:	 16.47%


In [38]:
# Features: the 'text' column of each split.
X_train = train_df['text']
X_val = val_df['text']
X_hold = hold_df['text']

# Targets: the 'label' column of each split.
y_train = train_df['label']
y_val = val_df['label']
y_hold = hold_df['label']

## Dummy Baseline

In [58]:
# Chance-level baseline: predictions are sampled from the class distribution
# seen in y_train (strategy='stratified'); random_state pins the draw.
dummy = DummyClassifier(strategy='stratified', random_state=42)

# DummyClassifier ignores the feature values it is given, so the raw text Series
# is sufficient here. The vectorized matrices are only built in a later cell;
# referencing them at this point fails with NameError on Restart & Run All.
dummy.fit(X_train, y_train)

# Mean accuracy on the validation split.
dummy.score(X_val, y_val)

0.526219512195122

## Baseline Model - Bag of Words

### Vectorize

In [46]:
# Bag-of-words encoder with default settings.
vectorizer = CountVectorizer()

# Fit on the training text only, and encode it in the same pass.
X_train_vec = vectorizer.fit_transform(X_train)

# Encode the validation text with the already-fitted vectorizer (no refit).
X_val_vec = vectorizer.transform(X_val)

### Naive Bayes Classifier

In [61]:
# Train multinomial Naive Bayes on the training counts
# (fit returns the estimator itself, so the call can be chained).
clf = MultinomialNB().fit(X_train_vec, y_train)

# Mean accuracy on the validation split.
clf.score(X_val_vec, y_val)

0.5908536585365853

In [62]:
# Use the probability in column 1 of predict_proba as the ranking score
# (assumes binary labels with the positive class second in clf.classes_ -- TODO confirm),
# then compute the area under the ROC curve on the validation split.
val_scores = clf.predict_proba(X_val_vec)[:, 1]
roc_auc_score(y_val, val_scores)

0.5937144540060194