In [None]:
# Import functions from local scripts
import sys
sys.path.insert(1, './scripts/development')
import scripts.development.preprocessing as pre

# Import data handling libraries
import pandas as pd
from scipy.sparse import csr_matrix, hstack

# Import metric functions
from sklearn.metrics import average_precision_score, classification_report

# Import I/O libraries
from pickle import load
import json

In [None]:
# Load both pickled tf-idf vectorizers from disk.
# The `with` statement closes each file on exit, so no explicit close() is needed.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open("../data/modeling/tfidf_1.pkl", "rb") as tfidf_file:
    tfidf_vec_1 = load(tfidf_file)

with open("../data/modeling/tfidf_2.pkl", "rb") as tfidf_file:
    tfidf_vec_2 = load(tfidf_file)

In [None]:
# Load both pickled models from disk.
# The `with` statement closes each file on exit, so no explicit close() is needed.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open("../data/modeling/model_1_wo_mf.pkl", "rb") as model_file:
    model_1 = load(model_file)

with open("../data/modeling/model_2_wo_mf.pkl", "rb") as model_file:
    model_2 = load(model_file)

In [None]:
# Load the token dictionary from JSON and flatten its values into one token list.
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open("../data/token_dictionary.json", "r") as token_dict_file:
    token_dictionary = json.load(token_dict_file)

# The keys are not used here; every value is iterated and its items are
# concatenated in dictionary order.
token_list = [token for tokens in token_dictionary.values() for token in tokens]

In [None]:
# Hand-written evaluation set: each entry pairs a raw customer message
# (Portuguese, Spanish or English) with its manually assigned binary label.
# NOTE(review): the label meaning is not stated in this notebook — presumably
# 1 marks an order-status inquiry; confirm against the training data.
_labelled_messages = [
    ("Encomendei meus sapatos 3 semanas atrás. Eu estou sentindo falta deles. Devolva-os o mais rápido possível", 1),
    ("donde esta mi pedido señor", 1),
    ("Where is my order?", 1),
    ("Hi. I went to your site and I have seen that my command _TRACKING_NUMBER_ is not coming. I need it. Contact me at aaa.23@yahoo.com please", 1),
    ("tell me about my command __ORDER_NUMBER__", 1),
    ("hi i want to register a coupon", 0),
    ("Hello I have ordered __COMPANY__ shoes (__PRODUCT_NAMES__) didn'tget a confirmation.", 1),
    ("Good day sir i got __PRODUCT_NAME__ on your site but not delivered until now", 1),
    ("Hi regarding my order __ORDER_NUMBER__ I just wanted to know if I can change __PRODUCT_NAME__ with __PRODUCT_NAME__", 0),
]

# Messages as a named Series; labels as a plain list aligned by position.
X_test = pd.Series([message for message, _ in _labelled_messages], name='text')

y_test = [label for _, label in _labelled_messages]

In [None]:
%%time
# Prepare the raw messages for prediction: apply pre.translate_to_en to every
# message, then pass the result through pre.preprocess_text_series.
# NOTE(review): X_test is rebound to the processed result, so re-running this
# cell without first re-running the cell that defines X_test feeds
# already-processed data back in.
X_test = pre.preprocess_text_series(
    X_test.apply(pre.translate_to_en),
    token_dictionary,
    token_list,
    False,
)
X_test

In [None]:
%%time
# Turn the preprocessed 'text' entry into one feature matrix per vectorizer,
# so each model receives input from its own tf-idf vectorizer.
text = X_test['text']
tfidf_text_1, tfidf_text_2 = (
    vectorizer.transform(text) for vectorizer in (tfidf_vec_1, tfidf_vec_2)
)

In [None]:
# Predict with each model on the features produced by its matching vectorizer.
predicted_1, predicted_2 = (
    model.predict(features)
    for model, features in ((model_1, tfidf_text_1), (model_2, tfidf_text_2))
)

In [None]:
# Print the average precision score and the classification report of the
# first model on the hand-defined test set.
# NOTE(review): average_precision_score receives the output of predict() here
# rather than scores/probabilities — confirm this is intended.
ap_score_1 = average_precision_score(y_test, predicted_1)
report_1 = classification_report(y_test, predicted_1)
print(ap_score_1, "\n", report_1)

In [None]:
# Print the average precision score and the classification report of the
# second model on the hand-defined test set.
# NOTE(review): average_precision_score receives the output of predict() here
# rather than scores/probabilities — confirm this is intended.
ap_score_2 = average_precision_score(y_test, predicted_2)
report_2 = classification_report(y_test, predicted_2)
print(ap_score_2, "\n", report_2)

In [None]:
# Take a shallow copy of the token list for inspection, so that changes to
# `ha` do not affect token_list.
ha = list(token_list)
ha

In [None]:
# Display the tokens ordered from longest to shortest; sorted() returns a new
# list, so `ha` itself keeps its original order.
sorted(ha, key=lambda token: len(token), reverse=True)

In [None]:
# TODO: Re-check the modeling process, since the preprocessing function has changed.