In [46]:
%load_ext autoreload
%autoreload 2

from excel_matches_livingscores import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# load manually tagged samples

In [2]:
# Excel workbooks carry their own text encoding, so read_excel has no
# `encoding` parameter (current pandas raises TypeError when it is passed).
st1 = pd.read_excel('matches_sample1.xls')
# Keep only the rows that have a manual label in `match`. The explicit copy
# makes st1 an independent frame, so later column assignments do not act on a
# filtered view of the raw data.
st1 = st1[st1.match.notnull()].copy()

# train classifier

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy.sparse import hstack
from sklearn.metrics import classification_report

In [21]:
def evaluate_model(clf, X_train, X_test, y_train, y_test):
    """Print classification reports for ``clf`` on the training and test data.

    ``clf`` only needs a ``predict`` method. For each split the predictions are
    compared with the true labels through ``classification_report`` and the
    resulting text is printed. Nothing is returned.
    """
    train_predictions = clf.predict(X_train)

    print("Detailed classification report:\n")
    print("Scores on training set.\n")
    print(classification_report(y_train, train_predictions))

    # The test split is only scored after the training report has been printed.
    test_predictions = clf.predict(X_test)
    print("Scores on test set.\n")
    print(classification_report(y_test, test_predictions))

In [8]:
# Load the 'Amadeus All Properties' workbook, indexed by PROPERTY_CODE.
# skiprows=1 drops the first sheet row, so header=0 should pick the column
# names from the row that follows it.
amdh_full = pd.read_excel('./Amadeus All Properties - FEB 2017 17022017 .xlsx', header=0, index_col='PROPERTY_CODE', skiprows=1)

In [10]:
# NOTE(review): load_booking is not defined in this notebook; presumably it is
# provided by the star import of excel_matches_livingscores. TODO confirm.
# `bkgh.chain` is read later when the chain encoders are fitted.
bkgh = load_booking()

In [41]:
# NOTE(review): load_amadeus_from_db is not defined in this notebook; presumably
# it is provided by the star import of excel_matches_livingscores. TODO confirm.
# `amdh.chain` is read later when the chain encoders are fitted.
amdh = load_amadeus_from_db()

In [11]:
# NOTE: create these encoders in
# http://35.156.49.99:8888/notebooks/idmatching/Automatic%20matching%20with%20feature%20extraction%20and%20candidates%20classifier.ipynb#
# otherwise there are some chains missing
def create_chain_encoders():
    """Fit the chain-name encoders and pickle them to ``chain_encoders.pickle``.

    Two LabelEncoder/OneHotEncoder pairs are fitted. The first is fitted on
    ``amdh_full.CHAIN_NAME`` followed by ``amdh.chain``, the second on
    ``bkgh.chain``; every series goes through ``norm_text`` first. The
    notebook-level frames ``amdh_full``, ``amdh`` and ``bkgh`` must already
    be loaded.

    Nothing is returned. The four fitted encoders are written as a single
    tuple, in the order that ``load_chain_encoders`` unpacks.
    """
    def fit_pair(values):
        # The one-hot encoder is fitted on the integer codes produced by the
        # label encoder, reshaped into a single column.
        label_enc = LabelEncoder()
        label_enc.fit(values)
        onehot_enc = OneHotEncoder()
        onehot_enc.fit(label_enc.transform(values).reshape(-1, 1))
        return label_enc, onehot_enc

    amadeus_chains = list(norm_text(amdh_full.CHAIN_NAME))
    amadeus_chains += list(norm_text(amdh.chain))
    booking_chains = norm_text(bkgh.chain)

    le_amadeus, ohe_amadeus = fit_pair(amadeus_chains)
    le_booking, ohe_booking = fit_pair(booking_chains)

    encoders = (le_amadeus, ohe_amadeus, le_booking, ohe_booking)
    with open('chain_encoders.pickle', 'wb') as out_file:
        pickle.dump(encoders, out_file)

In [47]:
# Fit the encoders and write chain_encoders.pickle. This needs amdh_full, bkgh
# and amdh to be loaded first, and overwrites any existing pickle file.
create_chain_encoders()

In [17]:
def load_chain_encoders():
    """Read the four chain encoders back from ``chain_encoders.pickle``.

    Returns a tuple ``(le_chain, ohe, le_chain_bk, ohe_bk)`` in the order the
    encoders were stored. Unpacking fails with ValueError if the pickle does
    not hold exactly four items.
    """
    # pickle.load can execute arbitrary code: only read a file you created.
    with open('chain_encoders.pickle', 'rb') as handle:
        stored = pickle.load(handle)
    amd_label_enc, amd_onehot, bkg_label_enc, bkg_onehot = stored
    return amd_label_enc, amd_onehot, bkg_label_enc, bkg_onehot

In [51]:
def pre_process(Xy):
    """Build the feature matrix and the label vector for the match classifier.

    Parameters
    ----------
    Xy : pandas.DataFrame
        Candidate pairs. Must contain the columns ``dist``, ``name_sim``,
        ``name_sim_sw``, ``chain_sim``, ``chain_sim_sw``, ``name_included``,
        ``chain_included``, ``chain``, ``chain_bkg`` and ``match``.
        The frame itself is left untouched.

    Returns
    -------
    X : scipy sparse matrix
        The seven numeric/boolean columns cast to float, followed by the
        one-hot encoding of ``chain`` and then of ``chain_bkg``.
    y : pandas.Series
        The ``match`` column, returned as-is (it may hold NaN for rows that
        have not been labelled).

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when a chain value was not
        present when the encoders in ``chain_encoders.pickle`` were fitted.
    """
    # fillna with a dict returns a new frame, so the caller's data is not
    # changed. It also avoids chained `column.fillna(..., inplace=True)` calls,
    # which pandas does not guarantee to write back to the parent frame.
    filled = Xy.fillna({
        'chain': '',
        'chain_bkg': '',
        'chain_included': False,
        'name_included': False,
    })

    Xnum = filled[[u'dist', u'name_sim', u'name_sim_sw', u'chain_sim', u'chain_sim_sw',
                   u'name_included', u'chain_included']]

    le_chain, ohe, le_chain_bk, ohe_bk = load_chain_encoders()

    # Chain names become integer codes and are then one-hot encoded; each side
    # has its own encoder pair.
    Xchain = le_chain.transform(norm_text(filled.chain))
    Xchain = ohe.transform(Xchain.reshape(-1, 1))

    Xchain_bkg = le_chain_bk.transform(norm_text(filled.chain_bkg))
    Xchain_bkg = ohe_bk.transform(Xchain_bkg.reshape(-1, 1))

    X = hstack((Xnum.astype(float), Xchain, Xchain_bkg))
    y = filled.match

    return X, y

In [52]:
# First training round: features and labels from the hand-labelled sample,
# split 75/25 with a fixed seed.
X, y = pre_process(st1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Hyper-parameter grid explored by the cross-validated search below.
params = dict(
    max_depth=[5,10,20,50,100],
    n_estimators=[3, 4, 5, 10, 15, 20],
    class_weight=['balanced', 'balanced_subsample', None]
    # max_features=[25, 50, 75, 100, 150]
    # max_features = [5, 10, 15]
)

clf = GridSearchCV(
    # A fixed seed makes every forest, and therefore the selected parameters
    # and the pickled model, reproducible from one run to the next.
    RandomForestClassifier(random_state=42),
    param_grid=params,  # parameters to tune via cross validation
    refit=True,  # refit the best configuration on the whole training split
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
    scoring='f1',  # what score are we optimizing?
    cv=3,  # number of cross-validation folds
)

clf.fit(X_train, y_train)

clf.best_params_

{'class_weight': 'balanced', 'max_depth': 100, 'n_estimators': 20}

In [53]:
# Print per-class precision/recall/F1 of the tuned model on the training split
# and on the held-out test split.
evaluate_model(clf, X_train, X_test, y_train, y_test)

Detailed classification report:

Scores on training set.

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        26
        1.0       1.00      1.00      1.00       264

avg / total       1.00      1.00      1.00       290

Scores on test set.

             precision    recall  f1-score   support

        0.0       1.00      0.78      0.88        18
        1.0       0.96      1.00      0.98       107

avg / total       0.97      0.97      0.97       125



In [193]:
# Persist the fitted grid-search object so it can be reloaded without retraining.
with open('rdf_matches.pickle', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [108]:
# Reload the saved classifier. pickle.load can execute arbitrary code, so only
# open a pickle file that you produced yourself.
with open('rdf_matches.pickle', 'rb') as model_file:
    clf = pickle.load(model_file)

# Extend classifier to a larger sample

We now use this classifier to pre-label more examples; the predicted labels are checked by hand afterwards.

In [197]:
# NOTE(review): `s` is not defined anywhere in this notebook; presumably it is
# the full candidate table provided by the star import. Confirm before running.
# Select the rows that have no manual label yet.
Xy = s[s.match.isnull()]
# Draw at most 500 rows with a fixed seed so the same rows are selected on
# every run. The copy makes sure the predicted labels are written to an
# independent frame and not to a slice of `s`.
Xy = Xy.sample(n=min(500, len(Xy)), random_state=42).copy()
X, _ = pre_process(Xy)
y = clf.predict(X)
# Store the predictions as provisional labels, to be reviewed by hand.
Xy['match'] = y
Xy.head()

In [210]:
# Export the pre-labelled sample for manual review.
# CAUTION: re-running this cell overwrites matches_sample2.xls, including any
# corrections that were made in the file by hand.
Xy.to_excel('matches_sample2.xls')

## check labels manually...

## ... then load and fit again

In [54]:
# Read the second sample back after its labels were checked by hand.
# read_excel has no `encoding` parameter (current pandas raises TypeError when
# it is passed); the workbook format already defines the text encoding.
st2 = pd.read_excel('matches_sample2.xls')

In [55]:
# Stack both labelled samples into one training table.
# NOTE(review): concat keeps each frame's own row labels, so the index of `st`
# may contain duplicates. Consider ignore_index=True if rows are ever looked
# up by label.
st = pd.concat([st1,st2])

In [56]:
# Rebuild features and labels from the combined table. This replaces the X and
# y produced for the first training round.
X, y = pre_process(st)

In [61]:
# Second training round on the combined samples: same 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Hyper-parameter grid explored by the cross-validated search below.
params = dict(
    max_depth=[20, 50, 100, 150],
    n_estimators=[5, 10, 15, 20, 30, 50],
    class_weight=['balanced', 'balanced_subsample', None]
    # max_features=[25, 50, 75, 100, 150]
    # max_features = [5, 10, 15]
)

clf = GridSearchCV(
    # A fixed seed makes every forest, and therefore the selected parameters
    # and the pickled model, reproducible from one run to the next.
    RandomForestClassifier(random_state=42),
    param_grid=params,  # parameters to tune via cross validation
    refit=True,  # refit the best configuration on the whole training split
    n_jobs=-1,  # number of cores to use for parallelization; -1 for "all cores"
    scoring='f1',  # what score are we optimizing?
    cv=3,  # number of cross-validation folds
)

clf.fit(X_train, y_train)

clf.best_params_

{'class_weight': 'balanced_subsample', 'max_depth': 20, 'n_estimators': 50}

In [62]:
# Print per-class precision/recall/F1 of the retrained model on the training
# split and on the held-out test split.
evaluate_model(clf, X_train, X_test, y_train, y_test)

Detailed classification report:

Scores on training set.

             precision    recall  f1-score   support

        0.0       0.92      1.00      0.96        69
        1.0       1.00      0.99      1.00       617

avg / total       0.99      0.99      0.99       686

Scores on test set.

             precision    recall  f1-score   support

        0.0       1.00      0.84      0.91        25
        1.0       0.98      1.00      0.99       204

avg / total       0.98      0.98      0.98       229



In [63]:
# Save the retrained grid-search object. Any rdf_matches.pickle already on disk
# is replaced.
with open('rdf_matches.pickle', 'wb') as model_file:
    pickle.dump(clf, model_file)