In [1]:
import os

import numpy as np
import openai
import pandas as pd
from xgboost import XGBClassifier

### 1. Load pre-trained model

In [2]:
# Load model
# Create an empty classifier, then populate it from the serialized JSON model
# file. The path is relative, so it is resolved against the current working
# directory of the kernel.
xgboost_model = XGBClassifier()
xgboost_model.load_model("../3.Clasification/xgboost_final_model_Azure.json")

### 2. Preprocess features and compute embeddings

In [3]:
# Helpers for cleaning ASR text and computing embeddings
from clean_asr_service import CleanASRService
cleanASRservice = CleanASRService()

# Azure OpenAI client configuration (module-level settings of the `openai` package).
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
# The endpoint and the key are read from the environment so that secrets are
# never typed into (and committed with) the notebook. Both fall back to "",
# the value that used to be hard-coded here, when the variable is not set.
openai.api_base = os.environ.get("OPENAI_API_BASE", "")
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

# Embeddings
def compute_embeddings(df, embedding_model=""):
    """Add an ``embeddings`` column computed from the ``cleaned_asr`` column.

    Each value of ``df["cleaned_asr"]`` is passed to ``get_embedding_azure``
    and the result is stored in ``df["embeddings"]``.

    Args:
        df: DataFrame with a ``cleaned_asr`` column. It is modified in place.
        embedding_model: Value forwarded to ``get_embedding_azure`` as its
            ``embedding_model`` argument. Defaults to ``""``, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The same ``df`` object, now carrying the ``embeddings`` column.
    """
    df["embeddings"] = df["cleaned_asr"].apply(
        lambda text: get_embedding_azure(text, embedding_model=embedding_model)
    )
    return df


def get_embedding_azure(text, embedding_model):
    """Request an embedding for ``text`` through ``openai.Embedding.create``.

    Args:
        text: Value sent as the ``input`` of the embedding request.
        embedding_model: Value sent as the ``engine`` of the request.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.

    Note:
        ``deployment_id`` is sent as an empty string.
        NOTE(review): it looks blanked out on purpose - confirm the real
        deployment name before running.
    """
    request_kwargs = {
        "input": text,
        "engine": embedding_model,
        "deployment_id": "",
    }
    api_response = openai.Embedding.create(**request_kwargs)
    first_item = api_response['data'][0]
    return first_item['embedding']


# Feature preprocessing helpers: suffix, flow and asr_len
def get_suffix_from_intent(intent: str) -> str:
    """Return the normalised text that follows the last "-" in ``intent``.

    When ``intent`` contains no "-", the whole string is normalised and
    returned. Normalisation is delegated to ``norm_str``.
    """
    last_segment = intent.rsplit("-", 1)[-1]
    return norm_str(last_segment.strip())

def get_flow_from_intent(intent: str) -> str:
    """Return the normalised text that precedes the first "-" in ``intent``.

    When ``intent`` contains no "-", the whole string is normalised and
    returned. Normalisation is delegated to ``norm_str``.
    """
    first_segment, _, _ = intent.partition("-")
    return norm_str(first_segment.strip())

def norm_str(x: str) -> str:
    """Return ``x`` lower-cased, with leading and trailing whitespace removed."""
    lowered = x.lower()
    return lowered.strip()

def compute_asr_len(asr: str) -> int:
    """Count the distinct lower-cased, whitespace-separated words in ``asr``.

    Repeated words are counted once because the tokens are de-duplicated with
    a ``set`` before counting.

    Args:
        asr: Text to measure. Values that do not behave like text (for
            example ``None`` or a float NaN) are tolerated.

    Returns:
        The number of distinct words, or 0 when ``asr`` is not text-like.
    """
    try:
        # Yields the same tokens as norm_str(asr).split(): the strip() done by
        # norm_str is redundant because split() with no separator already
        # ignores leading and trailing whitespace.
        tokens = asr.lower().split()
    except (AttributeError, TypeError):
        # Only the errors raised by non-text input are handled. A bare
        # `except:` would also hide KeyboardInterrupt and programming errors.
        return 0
    return len(set(tokens))


In [5]:
### Request
# NOTE(review): `request` is not defined in any cell visible here (the
# execution counter jumps from 3 to 5), so it must be supplied before this
# cell runs. It needs to provide at least the `asr` and `intent` columns.

df = pd.DataFrame(request)

#### Feature preprocessing ####
# Hand-crafted features derived from the raw ASR text and the intent label.
df["asr_len"] = df.asr.apply(compute_asr_len)
df["flow"] = df.intent.apply(get_flow_from_intent)
df["suffix"] = df.intent.apply(get_suffix_from_intent)
df["cleaned_asr"] = df.asr.apply(
    lambda raw_asr: cleanASRservice.execute(raw_asr, delete_stopwords=False)
)
df = compute_embeddings(df)

# Spread every embedding over one column per component, named
# embedding_feature_<position>, and append those columns to the frame.
tags = df['embeddings'].apply(pd.Series)
features = tags.rename(columns=lambda position: f"embedding_feature_{position}")
result = pd.concat([df, features], axis=1)

# Drop the raw inputs and intermediates that are not fed to the model.
X = result.drop(columns=["embeddings", "asr", "cleaned_asr", "intent"])

# Extract text features -> convert to category
cats = list(X.select_dtypes(exclude=np.number).columns)
for col in cats:
    X[col] = X[col].astype('category')

# Order features: select the columns in the order given by the booster's
# feature names.
cols_when_model_builds = xgboost_model.get_booster().feature_names
X = X[cols_when_model_builds]

### 3. Predict

In [6]:
### Predict -> 0: No open, 1: Open -> known from the label_encoder used when training the model
y_pred = xgboost_model.predict(X)

In [7]:
# Boolean output
def handle_model_output(predictions: list) -> list:
    """Map each prediction to a plain ``bool``: True when it equals 1, else False.

    Args:
        predictions: Iterable of predicted labels.

    Returns:
        A list of Python booleans, one per prediction, in the same order.
    """
    return [bool(label == 1) for label in predictions]

# Turn the predicted labels into booleans (True where the label is 1).
output = handle_model_output(y_pred)

In [8]:
# Display the boolean predictions as the cell result.
output

[False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True]