In [1]:
# Import functions from local scripts
import sys
sys.path.insert(1, './scripts/development')
import scripts.development.preprocessing as pre

# Import data handling libraries
import pandas as pd
from scipy.sparse import csr_matrix, hstack

# Import metric functions
from sklearn.metrics import average_precision_score, classification_report

# Import I/O libraries
from pickle import load
import json

In [34]:
# Load both tf-idf vectorizers from their pickle files.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open("../data/modeling/tfidf_1.pkl", "rb") as tfidf_file:
    tfidf_vec_1 = load(tfidf_file)

with open("../data/modeling/tfidf_2.pkl", "rb") as tfidf_file:
    tfidf_vec_2 = load(tfidf_file)

In [35]:
# Load both scalers from their pickle files.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open("../data/modeling/scaler_1.pkl", "rb") as scaler_file:
    scaler_1 = load(scaler_file)

with open("../data/modeling/scaler_2.pkl", "rb") as scaler_file:
    scaler_2 = load(scaler_file)

In [36]:
# Load both models from their pickle files.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open("../data/modeling/model_1_with_mf.pkl", "rb") as model_file:
    model_1 = load(model_file)

with open("../data/modeling/model_2_with_mf.pkl", "rb") as model_file:
    model_2 = load(model_file)

In [20]:
# Load the token dictionary that is later passed to the preprocessing step.
# The `with` statement closes the file on exit, so no explicit close() is needed.
# NOTE(review): no encoding is passed to open(), so the platform default is used —
# confirm the JSON file decodes correctly on every machine this runs on.
with open("../data/token_dictionary.json", "r") as token_dict_file:
    token_dictionary = json.load(token_dict_file)

In [37]:
# Hand-written evaluation set: every message is kept next to its expected label,
# then split into the text Series and the label list used further down.
labelled_examples = [
    ("Encomendei meus sapatos 3 semanas atrás. Eu estou sentindo falta deles. Devolva-os o mais rápido possível", 1),
    ("donde esta mi pedido señor", 1),
    ("Where is my order?", 1),
    ("Hi. I went to your site and I have seen that my command _TRACKING_NUMBER_ is not coming. I need it. Contact me at aaa.23@yahoo.com please", 1),
    ("tell me about my command __ORDER_NUMBER__", 1),
    ("hi i want to register a coupon", 0),
    ("Hello I have ordered __COMPANY__ shoes (__PRODUCT_NAMES__) didn'tget a confirmation.", 1),
    ("Good day sir i got __PRODUCT_NAME__ on your site but not delivered until now", 1),
    ("Hi regarding my order __ORDER_NUMBER__ I just wanted to know if I can change __PRODUCT_NAME__ with __PRODUCT_NAME__", 0),
]

X_test = pd.Series([message for message, _ in labelled_examples], name='text')

y_test = [label for _, label in labelled_examples]

In [38]:
%%time
# Prepare the raw messages for prediction: run each one through
# pre.translate_to_en, then hand the result to pre.preprocess_text_series
# together with the token dictionary.
# NOTE(review): X_test is rebound to the preprocessed result here, so this cell
# is meant to run once after the cell that defines the raw test set.
X_test_en = X_test.apply(pre.translate_to_en)
X_test = pre.preprocess_text_series(X_test_en, token_dictionary, True)

# Show only the count columns for a quick visual check
count_columns = [
    'name_count',
    'product_count',
    'order_count',
    'tracking_number_count',
    'email_count',
    'company_name_count',
]
X_test[count_columns]

CPU times: total: 93.8 ms
Wall time: 351 ms


Unnamed: 0,name_count,product_count,order_count,tracking_number_count,email_count,company_name_count
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,1,1,0
4,0,0,1,0,0,0
5,0,0,0,0,0,0
6,0,1,0,0,0,1
7,0,1,0,0,0,0
8,0,2,1,0,0,0


In [39]:
%%time
# Build one feature matrix per model: the tf-idf representation of the text
# column with the scaled remaining columns appended on the right.

# Separate the text column from every other column of the preprocessed frame
text = X_test['text']
m_feats = X_test.drop(columns='text')
row_labels = m_feats.index.values

# Each model has its own scaler; keep the original row labels on the result
m_feats_1 = pd.DataFrame(scaler_1.transform(m_feats), index=row_labels)
m_feats_2 = pd.DataFrame(scaler_2.transform(m_feats), index=row_labels)

# Each model has its own vectorizer; stack its output next to the scaled columns
tfidf_text_1 = hstack((tfidf_vec_1.transform(text), csr_matrix(m_feats_1)))
tfidf_text_2 = hstack((tfidf_vec_2.transform(text), csr_matrix(m_feats_2)))

CPU times: total: 15.6 ms
Wall time: 13 ms


In [43]:
# Run each model on the feature matrix built with its own vectorizer and scaler
predicted_1, predicted_2 = (
    model_1.predict(tfidf_text_1),
    model_2.predict(tfidf_text_2),
)

In [44]:
# Evaluate the first model on the hand-made test set: print the average
# precision score, followed by the per-class classification report.
# NOTE(review): average_precision_score receives the output of model_1.predict()
# here rather than probabilities or decision scores — confirm this is intended.
ap_score_1 = average_precision_score(y_test, predicted_1)
report_1 = classification_report(y_test, predicted_1)
print(ap_score_1, "\n", report_1)

0.9682539682539683 
               precision    recall  f1-score   support

           0       0.67      1.00      0.80         2
           1       1.00      0.86      0.92         7

    accuracy                           0.89         9
   macro avg       0.83      0.93      0.86         9
weighted avg       0.93      0.89      0.90         9



In [45]:
# Evaluate the second model on the hand-made test set: print the average
# precision score, followed by the per-class classification report.
# NOTE(review): average_precision_score receives the output of model_2.predict()
# here rather than probabilities or decision scores — confirm this is intended.
ap_score_2 = average_precision_score(y_test, predicted_2)
report_2 = classification_report(y_test, predicted_2)
print(ap_score_2, "\n", report_2)

0.873015873015873 
               precision    recall  f1-score   support

           0       0.33      1.00      0.50         2
           1       1.00      0.43      0.60         7

    accuracy                           0.56         9
   macro avg       0.67      0.71      0.55         9
weighted avg       0.85      0.56      0.58         9



Both models were tested on a small, self-made dataset and