In [2]:
import pickle
import random

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
# Load the pickled contents of matches.pickle into `data`.
# NOTE: unpickling can run arbitrary code, so only load files you trust.
with open('matches.pickle', 'rb') as matches_file:
  data = pickle.load(matches_file)

In [4]:
# Index the loaded records: every (member_id, request_id) pair that occurs in
# `data`, plus the de-duplicated member ids and request ids as lists (lists so
# they can be indexed / sampled from later).
matched_pairs = {(rec['member_id'], rec['request_id']) for rec in data}
member_ids = list(set(rec['member_id'] for rec in data))
request_ids = list(set(rec['request_id'] for rec in data))

In [5]:
def _records_where(mismatch, auto_generated):
  """Return the records of `data` whose flags are exactly the given booleans.

  Uses identity checks (`is`), so only real `True`/`False` values match;
  truthy/falsy stand-ins such as 1, 0 or None are not selected.
  """
  return [
      rec for rec in data
      if rec['mismatch'] is mismatch and rec['auto_generated'] is auto_generated
  ]


# Three of the four flag combinations are used; manual matches flagged as
# mismatches are not collected.
auto_wrong = _records_where(mismatch=True, auto_generated=True)
auto_right = _records_where(mismatch=False, auto_generated=True)
manual_right = _records_where(mismatch=False, auto_generated=False)

In [6]:
# Build synthetic negatives: random (member_id, request_id) pairs that are not
# already present in matched_pairs.
N_RANDOM_WRONG = 40000  # how many random negative pairs to draw
NEGATIVE_SAMPLING_SEED = 42  # fixed seed so the drawn sample is reproducible

# Rejection sampling only terminates if enough unused pairs exist. Every pair
# in matched_pairs comes from member_ids x request_ids, so the number of pairs
# still free is the grid size minus the pairs already taken.
available_pairs = len(member_ids) * len(request_ids) - len(matched_pairs)
if available_pairs < N_RANDOM_WRONG:
  raise ValueError(
      'cannot draw {} unmatched pairs: only {} are available'.format(
          N_RANDOM_WRONG, available_pairs))

# Private generator: seeding it does not disturb the global `random` state.
rng = random.Random(NEGATIVE_SAMPLING_SEED)

random_wrong = []
while len(random_wrong) < N_RANDOM_WRONG:
  member_id = rng.choice(member_ids)
  request_id = rng.choice(request_ids)
  if (member_id, request_id) in matched_pairs:
    # Either a real match or a negative that was already drawn; try again.
    continue
  random_wrong.append({
      'member_id': member_id,
      'request_id': request_id,
  })
  # Record the pair so it cannot be drawn twice. This mutates the set built
  # earlier, so re-running only this cell produces a different sample.
  matched_pairs.add((member_id, request_id))

In [19]:
# One entry per source of examples: (rows, true label, baseline prediction).
# The baseline predicts 1 for both auto-generated groups and 0 for the
# manual and random groups.
example_groups = [
    (auto_wrong, 0, 1),
    (auto_right, 1, 1),
    (manual_right, 1, 0),
    (random_wrong, 0, 0),
]

# Flatten the groups in order so rows, labels and baseline predictions stay
# aligned index-for-index.
X_matches = [row for rows, _label, _guess in example_groups for row in rows]
y = np.array(
    [label for rows, label, _guess in example_groups for _row in rows])
baseline_pred = np.array(
    [guess for rows, _label, guess in example_groups for _row in rows])

In [11]:
# Load the two lookup tables stored together in encodings.pickle; the file is
# expected to unpickle to a 2-item sequence (member side, request side).
# NOTE: unpickling can run arbitrary code, so only load files you trust.
with open('encodings.pickle', 'rb') as encodings_file:
  member_encodings, request_encodings = pickle.load(encodings_file)

In [12]:
# Feature row for each candidate pair: the member's encoding multiplied (`*`)
# by the request's encoding, both looked up by id, then stacked into an array.
# Presumably the encodings are equal-length numpy vectors, making `*` an
# element-wise product -- TODO confirm against encodings.pickle.
X = np.array([
    member_encodings[pair['member_id']] * request_encodings[pair['request_id']]
    for pair in X_matches
])

In [20]:
# Hold out 1000 rows for evaluation. Features, labels and baseline predictions
# are split in one call so they stay aligned; the training part of the
# baseline predictions is not needed and is discarded.
(X_train, X_test,
 y_train, y_test,
 _, bp_test) = train_test_split(
    X, y, baseline_pred, test_size=1000, random_state=42)

In [26]:
# Fit a small gradient-boosted classifier on the training rows and compare its
# held-out accuracy with the baseline predictions.
model = GradientBoostingClassifier(
    n_estimators=5, learning_rate=0.1, max_depth=3,
    random_state=42)  # fixed seed so repeated runs build the same trees
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Accuracy is the fraction of held-out rows whose prediction equals the label.
# np.mean over the boolean comparison gives that fraction directly, instead of
# summing the array element by element with the builtin sum().
print('baseline:', np.mean(bp_test == y_test))
print('new model:', np.mean(preds == y_test))

KeyboardInterrupt: 

In [28]:
# Split only the labels and baseline predictions (no feature matrix), using the
# same seed and test size as the split that also includes X.
(y_train, y_test,
 _, bp_test) = train_test_split(
    y, baseline_pred, test_size=1000, random_state=42)

In [31]:
# Display the shape of the held-out baseline predictions as a sanity check.
bp_test.shape

(1000,)