In [1]:
import json, random
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, cross_validate,StratifiedKFold,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import precision_score,recall_score

### Step 1: Loading data

In [2]:
train = []
val = []
test = []
with open('data/data_full.json') as json_file:
    data = json.load(json_file)
    for d in data['train']:
        train.append(d)
    for d in data['val']:
        val.append(d)
    for d in data['test']:
        test.append(d)

### Step 2: Cleaning and preprocessing data

In [3]:
val_label = []
for t in train:
    val_label.append(t[1])

In [4]:
labels = list(set(val_label))
selected_labels = []
for i in range(20):
    ind = random.randint(0,len(labels)-1)
    selected_labels.append(labels[ind])

In [5]:
new_train = []
new_val = []
new_test = []
for d in train:
    if d[1] in selected_labels:
        new_train.append(d)
for d in val:
    if d[1] in selected_labels:
        new_val.append(d)
for d in test:
    if d[1] in selected_labels:
        new_test.append(d)

In [6]:
train_set = pd.DataFrame(new_train,columns =['Data', 'Label'])
val_set = pd.DataFrame(new_val,columns =['Data', 'Label'])
test_set = pd.DataFrame(new_test,columns =['Data', 'Label'])

### Step 3: Creating and training model

In [7]:
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(penalty='l2',multi_class='multinomial',solver = 'lbfgs'))
X_train = train_set['Data']
X_val = val_set['Data']
X_test = test_set['Data']
Y_train = train_set['Label']
Y_val = val_set['Label']
Y_test = test_set['Label']

In [8]:
pipeline.fit(X_train, Y_train)
y_pred_class = pipeline.predict(X_val)
print(classification_report(Y_val, y_pred_class))

                     precision    recall  f1-score   support

                apr       0.95      1.00      0.98        20
         calculator       0.95      0.95      0.95        20
 cancel_reservation       1.00      1.00      1.00        20
           carry_on       1.00      1.00      1.00        20
       credit_score       1.00      0.95      0.97        20
     freeze_account       1.00      1.00      1.00        20
            goodbye       0.91      1.00      0.95        20
    how_old_are_you       1.00      1.00      1.00        20
   ingredients_list       1.00      1.00      1.00        20
              maybe       0.94      0.80      0.86        20
   meeting_schedule       0.91      1.00      0.95        20
          next_song       1.00      1.00      1.00        20
    oil_change_when       1.00      1.00      1.00        20
             recipe       1.00      1.00      1.00        20
             repeat       0.90      0.90      0.90        20
          tell_joke    

In [9]:
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())
params = {'logisticregression__penalty' : ['l1', 'l2'],
    'logisticregression__C' : np.logspace(-4, 4, 20),
    'logisticregression__solver' : ['liblinear','lbfgs']}
clf = GridSearchCV(pipeline, param_grid = params, cv = 5, verbose=True, n_jobs=-1)
best_clf = clf.fit(list(X_train)+list(X_val), list(Y_train)+list(Y_val))

Fitting 5 folds for each of 80 candidates, totalling 400 fits


 0.91929825 0.91842105 0.05263158        nan 0.92192982 0.92982456
 0.05263158        nan 0.9254386  0.93026316 0.05263158        nan
 0.92894737 0.93114035 0.05263158        nan 0.93245614 0.93289474
 0.175             nan 0.93377193 0.93552632 0.73289474        nan
 0.94298246 0.94385965 0.86666667        nan 0.95307018 0.95350877
 0.91315789        nan 0.95701754 0.95701754 0.94605263        nan
 0.96491228 0.96622807 0.95482456        nan 0.96885965 0.96798246
 0.95701754        nan 0.96973684 0.96973684 0.95701754        nan
 0.97105263 0.97017544 0.95701754        nan 0.97105263 0.96929825
 0.95657895        nan 0.97061404 0.96885965 0.95701754        nan
 0.97061404 0.96929825 0.95789474        nan 0.96929825 0.97017544
 0.96008772        nan 0.96929825 0.97105263 0.95964912        nan
 0.96929825 0.96973684]


### Step 4: Evaluating performance

In [10]:
y_pred_class = best_clf.predict(X_test)
print(classification_report(Y_test, y_pred_class))
print('Precision score :',precision_score(Y_test, y_pred_class,average='weighted'))
print('Recall score  :',recall_score(Y_test, y_pred_class,average='weighted'))

                     precision    recall  f1-score   support

                apr       1.00      1.00      1.00        30
         calculator       0.97      0.97      0.97        30
 cancel_reservation       0.97      1.00      0.98        30
           carry_on       1.00      0.97      0.98        30
       credit_score       1.00      1.00      1.00        30
     freeze_account       1.00      1.00      1.00        30
            goodbye       0.97      0.97      0.97        30
    how_old_are_you       0.97      1.00      0.98        30
   ingredients_list       0.89      0.83      0.86        30
              maybe       0.91      0.97      0.94        30
   meeting_schedule       1.00      1.00      1.00        30
          next_song       1.00      1.00      1.00        30
    oil_change_when       1.00      1.00      1.00        30
             recipe       0.84      0.90      0.87        30
             repeat       1.00      0.93      0.97        30
          tell_joke    

### Step 5: Saving model

In [11]:
import pickle
f = open('classifier.sk','wb')
pickle.dump(pipeline,f)
f.close()

### Discussion Questions:
- I choice to use TfidfVectorizer to precess the text input. I think this is a better method compare to CountVectorizer because Tf-idf will weight more on less common words which may be more important for our classification.
- Then I choice logistic regression to do this multi-class classification. Based on my experience with this kind of problems, I think logistic regression is always a good starting model. It is easy and quick to train and use, and it have a good result in most cases.
- The model architecture is simply a Tfidf Vectorizer followed by loglogistic regression classifier. I used cross validation and hyperparameter tuning to improve the performance.
- Since it is a classification problem, I use accuracy, precision, recall, and f1 score to evaluate the model. The weighted average accuracy is 0.97, weighted average precision is 0.9743, and weighted average recall is 0.9737. Therefore the f1 score = 2 * Precision * Recall / Presicion + Recall = 0.9740
- There are potential improvement can be done in the future. I could try more classification models to pick the best model and extend the classifier to more classes and even out-of-scope classes.