In [68]:
import json, random
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, cross_validate,StratifiedKFold,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import precision_score,recall_score

### Step 1: Loading data

In [19]:
train = []
val = []
test = []
with open('/Users/yi/Documents/GitHub/oos-eval/data/data_full.json') as json_file:
    data = json.load(json_file)
    for d in data['train']:
        train.append(d)
    for d in data['val']:
        val.append(d)
    for d in data['test']:
        test.append(d)

### Step 2: Cleaning and preprocessing data

In [45]:
val_label = []
for t in train:
    val_label.append(t[1])

In [48]:
labels = list(set(val_label))
selected_labels = []
for i in range(20):
    ind = random.randint(0,len(labels)-1)
    selected_labels.append(labels[ind])

In [50]:
new_train = []
new_val = []
new_test = []
for d in train:
    if d[1] in selected_labels:
        new_train.append(d)
for d in val:
    if d[1] in selected_labels:
        new_val.append(d)
for d in test:
    if d[1] in selected_labels:
        new_test.append(d)

In [51]:
train_set = pd.DataFrame(new_train,columns =['Data', 'Label'])
val_set = pd.DataFrame(new_val,columns =['Data', 'Label'])
test_set = pd.DataFrame(new_test,columns =['Data', 'Label'])

### Step 3: Creating and training model

In [62]:
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(penalty='l2',multi_class='multinomial',solver = 'lbfgs'))
X_train = train_set['Data']
X_val = val_set['Data']
X_test = test_set['Data']
Y_train = train_set['Label']
Y_val = val_set['Label']
Y_test = test_set['Label']

In [91]:
pipeline.fit(X_train, Y_train)
y_pred_class = pipeline.predict(X_val)
print(classification_report(Y_val, y_pred_class))

                       precision    recall  f1-score   support

             bill_due       1.00      1.00      1.00        20
   cancel_reservation       1.00      1.00      1.00        20
        card_declined       0.95      1.00      0.98        20
        change_accent       1.00      0.75      0.86        20
      change_language       0.87      1.00      0.93        20
      meal_suggestion       1.00      0.95      0.97        20
restaurant_suggestion       0.95      1.00      0.98        20
      rewards_balance       1.00      1.00      1.00        20
           smart_home       1.00      1.00      1.00        20
     spending_history       1.00      1.00      1.00        20
            tell_joke       1.00      1.00      1.00        20
                 time       1.00      1.00      1.00        20
            user_name       0.86      0.90      0.88        20
    what_is_your_name       0.90      0.90      0.90        20
   where_are_you_from       0.91      1.00      0.95  

In [86]:
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())
params = {'logisticregression__penalty' : ['l1', 'l2'],
    'logisticregression__C' : np.logspace(-4, 4, 20),
    'logisticregression__solver' : ['liblinear','lbfgs']}
clf = GridSearchCV(pipeline, param_grid = params, cv = 5, verbose=True, n_jobs=-1)
best_clf = clf.fit(list(X_train)+list(X_val), list(Y_train)+list(Y_val))

Fitting 5 folds for each of 80 candidates, totalling 400 fits


 0.89305556 0.89074074 0.05555556        nan 0.89768519 0.91481481
 0.05555556        nan 0.90277778 0.91527778 0.05555556        nan
 0.90648148 0.91574074 0.05555556        nan 0.91203704 0.91527778
 0.10833333        nan 0.91851852 0.91990741 0.64814815        nan
 0.92916667 0.93055556 0.82314815        nan 0.93842593 0.93657407
 0.9125            nan 0.95046296 0.95       0.93981481        nan
 0.95462963 0.95555556 0.94953704        nan 0.96018519 0.95740741
 0.95138889        nan 0.9625     0.95833333 0.95              nan
 0.96157407 0.95925926 0.95046296        nan 0.96296296 0.96064815
 0.95092593        nan 0.96296296 0.96157407 0.94907407        nan
 0.96018519 0.9587963  0.95324074        nan 0.9587963  0.95833333
 0.95046296        nan 0.95694444 0.95648148 0.94953704        nan
 0.95787037 0.95509259]


### Step 4: Evaluating performance

In [89]:
y_pred_class = best_clf.predict(X_test)
print(classification_report(Y_test, y_pred_class))
print('Precision score :',precision_score(Y_test, y_pred_class,average='weighted'))
print('Recall score  :',recall_score(Y_test, y_pred_class,average='weighted'))

                       precision    recall  f1-score   support

             bill_due       0.97      1.00      0.98        30
   cancel_reservation       1.00      1.00      1.00        30
        card_declined       1.00      1.00      1.00        30
        change_accent       1.00      1.00      1.00        30
      change_language       0.93      0.93      0.93        30
      meal_suggestion       0.91      0.97      0.94        30
restaurant_suggestion       1.00      0.93      0.97        30
      rewards_balance       1.00      1.00      1.00        30
           smart_home       1.00      1.00      1.00        30
     spending_history       1.00      0.90      0.95        30
            tell_joke       0.97      1.00      0.98        30
                 time       1.00      1.00      1.00        30
            user_name       0.90      0.93      0.92        30
    what_is_your_name       0.84      0.90      0.87        30
   where_are_you_from       0.93      0.93      0.93  

### Step 5: Saving model

In [92]:
import pickle
f = open('classifier.sk','wb')
pickle.dump(pipeline,f)
f.close()

### Discussion Questions:
- I choice to use TfidfVectorizer to precess the text input. I think this is a better method compare to CountVectorizer because Tf-idf will weight more on less common words which may be more important for our classification.
- Then I choice logistic regression to do this multi-class classification. Based on my experience with this kind of problems, I think logistic regression is always a good starting model. It is easy and quick to train and use, and it have a good result in most cases.
- The model architecture is simply a Tfidf Vectorizer followed by loglogistic regression classifier. I used cross validation and hyperparameter tuning to improve the performance.
- Since it is a classification problem, I use accuracy, precision, recall, and f1 score to evaluate the model. The weighted average accuracy is 0.96, weighted average precision is 0.9569, and weighted average recall is 0.9555. Therefore the f1 score = 2 * Precision * Recall / Presicion + Recall = 0.9562
- There are potential improvement can be done in the future. I could try more classification models to pick the best model and extend the classifier to more classes and even out-of-scope classes.