In [18]:
import pandas as pd
import numpy as np

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score, precision_score, recall_score, f1_score

In [7]:
# AZURE EMBEDDINGS
data = pd.read_csv("../2.Feature extraction/embeddings/Azure_text-similarity-ada-002_embeddings.csv", index_col=0)
data.head(2)

# OPENAI EMBEDDINGS
# data = pd.read_csv("../2.Feature extraction/embeddings/OpenAI_text-similarity-ada-002_embeddings.csv.csv", index_col=0)
# data.head(2)

### 1.Preprocesing features

In [None]:
# Choose asr preprocess type to use
embedding_to_use = "embedding_cleaned_asr"
embeddings = ["embedding_asr","embedding_cleaned_asr","embedding_cleaned_asr_without_stopw"]
embeddings.remove(embedding_to_use)
data = data.drop(columns=embeddings, axis = 1)

In [None]:
# Suffix column
def get_suffix_from_intent(intent: str) -> str:
    return intent.split("-")[-1].strip()
data["suffix"] = data.intent.apply(lambda x: get_suffix_from_intent(x))

In [8]:
# Embeddings
data["embedding"] = data[embedding_to_use].apply(eval).apply(np.array)
tags = data['embedding'].apply(pd.Series)
features = tags.rename(columns = lambda x : 'embedding_feature_' + str(x))
result = pd.concat([data, features], axis=1)
df = result.drop(columns=["embedding"], axis=1)

In [9]:
df.dtypes

asr                         object
label                       object
asr_updated                 object
intent                      object
intent_updated              object
                            ...   
embedding_feature_12283    float64
embedding_feature_12284    float64
embedding_feature_12285    float64
embedding_feature_12286    float64
embedding_feature_12287    float64
Length: 12302, dtype: object

### 2.Create X and y

In [10]:
# Create sets
X = df.drop(columns=["label"], axis = 1)
y = df["label"]

In [11]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(list(le.classes_)) 
display(y_encoded)

['no open', 'open']


array([0, 1, 1, ..., 0, 0, 0])

In [12]:
# Divide data in training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size = 0.1)

In [13]:
# Feature engineering

# Get features based on datatypes
# categorical -> one hot encoding
# numerical -> scaler

numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(X_train)
categorical_columns = categorical_columns_selector(X_train)

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one-hot-encoder", categorical_preprocessor, categorical_columns),
        ("standard_scaler", numerical_preprocessor, numerical_columns),
    ]
)

# Train data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_train_preprocessed = pd.DataFrame(data=X_train_preprocessed, columns=preprocessor.get_feature_names_out())
# Test data
X_test_preprocessed = preprocessor.transform(X_test)
X_test_preprocessed = pd.DataFrame(data=X_test_preprocessed, columns=preprocessor.get_feature_names_out())

# print(X_train.shape,X_test.shape)
# print(X_train_preprocessed.shape, X_test_preprocessed.shape)
# print(y_train.shape, y_test.shape)

### 3.Train Model

In [15]:
model = SVC()
# Validate parameters
parameters = {
    'kernel':('linear', 'rbf'), 
    'C': [0.1, 1, 10, 100, 1000], 
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
    }

clf = GridSearchCV(model,parameters,cv = 10, n_jobs = -1)
clf.fit(X_train_preprocessed, y_train)

best_parameters = clf.best_params_ 
best_score = clf.best_score_ 
best_estimator = clf.best_estimator_

print(best_estimator)
print(parameters)

SVC(C=1, gamma=0.0001)
{'kernel': ('linear', 'rbf'), 'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]}


In [17]:
print(best_parameters)

{'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}


In [16]:
from joblib import dump, load

dump(clf, "SVM_model_1.joblib")

['SVM_model_1.joblib']

### 4.Predict

Some experiment results.

#### Experiment 14: OpenAI embeddings with Davinci model and preprocessed asr with stopwords

In [24]:
estimator_model_14 = load("SVM_model_14.joblib")
print(estimator_model)

GridSearchCV(cv=10, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ('linear', 'rbf')})


In [20]:
y_pred = estimator_model_14.predict(X_test_preprocessed)

print('Test Accuracy : %.2f'%accuracy_score(y_test,y_pred))
print('Test Precision : %.2f'%precision_score(y_test,y_pred))
print('Test Recall : %.2f'%recall_score(y_test,y_pred))
print('Test F1 Score : %.2f'%f1_score(y_test,y_pred))

print('\nConfusion Matrix : ')
print(confusion_matrix(y_test,y_pred))

print('\nClassification Report : ')
print(classification_report(y_test,y_pred))

Test Accuracy : 0.70
Test Precision : 0.69
Test Recall : 0.71
Test F1 Score : 0.70

Confusion Matrix : 
[[41 18]
 [16 40]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.72      0.69      0.71        59
           1       0.69      0.71      0.70        56

    accuracy                           0.70       115
   macro avg       0.70      0.70      0.70       115
weighted avg       0.70      0.70      0.70       115

