In [1]:
import pandas as pd

df = pd.read_csv("Customer_Service_Questions_Multiclass.csv")
df.head()

Unnamed: 0,question,topic
0,"Hi! If I sign up for your email list, can I se...",Sales/Promotions
1,I'm going to be out of the country for about a...,Shipping
2,I was wondering if you'd be able to overnight ...,Shipping
3,The Swingline electronic stapler (472555) look...,Shipping
4,I think this cosmetic bag would work great for...,Shipping


In [2]:
# Clean corpus
# import time
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
clean_text = []
# start_time = time.time()
for t in df.question:
    words = regexp_tokenize(t.lower(), r'[A-Za-z]+')
    words = [w for w in words if len(w)>1 and w not in stopwords.words('english')]
    words = [lemmatizer.lemmatize(w) for w in words]
    clean_text.append(' '.join(words))
# print('Elapsed clock time: ', time.time()-start_time, ' s')

df['question'] = clean_text

> Check Duplicates
```python
print(len(df[df.duplicated()==True]))
934
```
> Check Null Values
```python
'''Column with Null value'''
print(df.isnull().any())
question    False
topic       False
dtype: bool
'''Row with Nill value'''
print(len(df[df.isnull().values==True]))
0
```
> Dataset doesn't have Null values, but has duplicate records.

In [3]:
# Drop duplicate records
df = df.drop_duplicates()
print(len(df))
df.head()

4066


Unnamed: 0,question,topic
0,hi sign email list select get email exclusivel...,Sales/Promotions
1,going country week travel going getting animal...,Shipping
2,wondering able overnight jacket item trenton nj,Shipping
3,swingline electronic stapler look really great...,Shipping
4,think cosmetic bag would work great however kn...,Shipping


In [4]:
# Split data into train and validation and encode label
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

train_x, valid_x, train_y, valid_y = \
    train_test_split(df['question'], df['topic'], \
    test_size=0.2, random_state=42, stratify=df['topic'])  # stratified sample

encoder = LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

> View the effect of stratified sampling：
```python
'''Original Data'''
df['topic'].value_counts()
'''output'''
Product Specifications    817
Product Comparison        797
Shipping                  740
Returns & Refunds         625
Product Availability      408
Omnichannel               381
Sales/Promotions          298
Name: topic, dtype: int64
```
```python
'''Stratified sampling'''
print(pd.Series(train_y).value_counts())
print(encoder.classes_)
'''output'''
3    653
2    638
6    592
4    500
1    326
0    305
5    238
dtype: int64
['Omnichannel' 'Product Availability' 'Product Comparison'
 'Product Specifications' 'Returns & Refunds' 'Sales/Promotions'
 'Shipping']
```

In [5]:
# Text vectorization: Create count, TF-IDF, and n-gram vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', \
    max_features=1000)
tfidf_vect.fit(df['question'])
xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)

> Read xtrain_tfidf
```python
print(df.shape)
(4066, 2)
print(xtrain_tfidf.shape) # 80%
(3252, 1000)
print(xvalid_tfidf.shape) # 20%
(814, 1000)
print(len(tfidf_vect.vocabulary_))
1000
print(tfidf_vect.vocabulary_)
{'hi': 383,
 'sign': 793,
 'email': 271,
 'list': 476,
 'select': 765,
 'get': 342,
 ....
 'estimate': 276,
 'wisconsin': 983,
 'sarasota': 750,
 'along': 19,
 'feel': 302,
 'bring': 107}
```

# Model Training -- Logistic Regression

In [6]:
# 3 types of Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import logistic_regression_path
from sklearn.metrics import confusion_matrix, classification_report

def model_evaluate(classifier,tx=xtrain_tfidf,ty=train_y,vx=xvalid_tfidf,vy=valid_y,name=encoder.classes_):
    # Evaluate the classifier
    classifier.fit(tx, ty)
    ty_predict = classifier.predict(tx)
    vy_predict = classifier.predict(vx)
    print("********** Train Set **********")
    print(classification_report(train_y, ty_predict, target_names=name))
    print("********** Valid Set **********")
    print(classification_report(valid_y, vy_predict, target_names=name))

## solver
Algorithm to use in the optimization problem.

- For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.
- For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes.
- ‘newton-cg’, ‘lbfgs’, ‘sag’ and ‘saga’ handle L2 or no penalty
- ‘liblinear’ and ‘saga’ also handle L1 penalty
- ‘saga’ also supports ‘elasticnet’ penalty
- ‘liblinear’ does not support setting penalty='none'
- Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing.

In [7]:
# Default case: multi_class='auto',solver='lbfgs',penalty='l2',max_iter=100
# solver='lbfgs': A kind of quasi-Newton method, 
# iteratively optimizes the loss function by using 
# the second derivative matrix of the loss function, the Hessian matrix.
LR1 = LogisticRegression(multi_class='auto',solver='lbfgs')
model_evaluate(LR1)

********** Train Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.98       305
  Product Availability       0.94      0.94      0.94       326
    Product Comparison       0.96      0.96      0.96       638
Product Specifications       0.94      0.96      0.95       653
     Returns & Refunds       1.00      1.00      1.00       500
      Sales/Promotions       0.99      0.97      0.98       238
              Shipping       0.99      1.00      0.99       592

              accuracy                           0.97      3252
             macro avg       0.97      0.97      0.97      3252
          weighted avg       0.97      0.97      0.97      3252

********** Valid Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.99        76
  Product Availability       0.87      0.84      0.86        82
    Product Comparison       0.91   

In [8]:
# solver='liblinear': Coordinate Descent
LR2 = LogisticRegression(multi_class='auto',solver='liblinear')
model_evaluate(LR2)

********** Train Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.98       305
  Product Availability       0.93      0.91      0.92       326
    Product Comparison       0.95      0.96      0.95       638
Product Specifications       0.93      0.94      0.94       653
     Returns & Refunds       1.00      1.00      1.00       500
      Sales/Promotions       0.99      0.97      0.98       238
              Shipping       0.99      1.00      0.99       592

              accuracy                           0.97      3252
             macro avg       0.97      0.96      0.97      3252
          weighted avg       0.97      0.97      0.97      3252

********** Valid Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.96      0.98        76
  Product Availability       0.87      0.84      0.86        82
    Product Comparison       0.90   

In [9]:
# solver='newton-cg': It is also a kind of Newtonian family.
LR3 = LogisticRegression(multi_class='auto',solver='newton-cg')
model_evaluate(LR3)

********** Train Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.98       305
  Product Availability       0.94      0.94      0.94       326
    Product Comparison       0.96      0.96      0.96       638
Product Specifications       0.94      0.96      0.95       653
     Returns & Refunds       1.00      1.00      1.00       500
      Sales/Promotions       0.99      0.97      0.98       238
              Shipping       0.99      1.00      0.99       592

              accuracy                           0.97      3252
             macro avg       0.97      0.97      0.97      3252
          weighted avg       0.97      0.97      0.97      3252

********** Valid Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.99        76
  Product Availability       0.87      0.84      0.86        82
    Product Comparison       0.91   

In [10]:
# solver='sag': Gradient Descent
LR4 = LogisticRegression(multi_class='auto',solver='newton-cg')
model_evaluate(LR4)

********** Train Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.98       305
  Product Availability       0.94      0.94      0.94       326
    Product Comparison       0.96      0.96      0.96       638
Product Specifications       0.94      0.96      0.95       653
     Returns & Refunds       1.00      1.00      1.00       500
      Sales/Promotions       0.99      0.97      0.98       238
              Shipping       0.99      1.00      0.99       592

              accuracy                           0.97      3252
             macro avg       0.97      0.97      0.97      3252
          weighted avg       0.97      0.97      0.97      3252

********** Valid Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.99        76
  Product Availability       0.87      0.84      0.86        82
    Product Comparison       0.91   

> Solver:
> - **'lbfgs'** (default): Quasi-Newton method. Second derivative matrix of the loss function, the Hessian matrix.
> - **'newton-cg'**: Also a kind of Newtonian family. Second derivative matrix of the loss function, the Hessian matrix.
> - **'liblinear'**: Coordinate Descent
> - **'sag'**: Gradient Descent

## penalty

In [11]:
# penalty='l1' ==> solver can only be 'liblinear' (coordinate descent)
# Since the l1 regularization's loss function is not derivable.
# l1 has better effect in reducing overfitting than l2.
# if L2 regularization(default) is still overfitting, that is, 
# when the prediction effect is poor,
# consider L1 regularization.
LR5 = LogisticRegression(multi_class='auto',solver='liblinear',penalty='l1')
model_evaluate(LR5)

********** Train Set **********
                        precision    recall  f1-score   support

           Omnichannel       0.98      0.96      0.97       305
  Product Availability       0.88      0.89      0.88       326
    Product Comparison       0.96      0.92      0.94       638
Product Specifications       0.89      0.93      0.91       653
     Returns & Refunds       0.99      1.00      0.99       500
      Sales/Promotions       0.99      0.95      0.97       238
              Shipping       0.98      0.99      0.99       592

              accuracy                           0.95      3252
             macro avg       0.95      0.95      0.95      3252
          weighted avg       0.95      0.95      0.95      3252

********** Valid Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.96      0.98        76
  Product Availability       0.81      0.83      0.82        82
    Product Comparison       0.93   

> Penalty:
> - l1: Loss function is not derivable.
> - l2: Loss function is derivable.

## multi_class

- If the option chosen is ‘ovr’, then a binary problem is fit for each label. For ‘multinomial’ the loss minimised is the multinomial loss fit across the entire probability distribution, even when the data is binary. ‘multinomial’ is unavailable when solver=’liblinear’.
- ‘auto’ selects ‘ovr’ if the data is binary, or if solver=’liblinear’, and otherwise selects ‘multinomial’.

In [12]:
LR6 = LogisticRegression(multi_class='multinomial',solver='sag',penalty='l2')
model_evaluate(LR6)

********** Train Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.98       305
  Product Availability       0.94      0.94      0.94       326
    Product Comparison       0.96      0.96      0.96       638
Product Specifications       0.94      0.96      0.95       653
     Returns & Refunds       1.00      1.00      1.00       500
      Sales/Promotions       0.99      0.97      0.98       238
              Shipping       0.99      1.00      0.99       592

              accuracy                           0.97      3252
             macro avg       0.97      0.97      0.97      3252
          weighted avg       0.97      0.97      0.97      3252

********** Valid Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.99        76
  Product Availability       0.87      0.84      0.86        82
    Product Comparison       0.91   

> multi_class:
> - 'ovr' ==> solver = liblinear，newton-cg, lbfgs or sag
> - 'multinomial' ==> solver = newton-cg, lbfgs or sag

> penalty='l1' ==> solver='liblinear' ==> multi_class='ovr'

In [13]:
LR6

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

## class_weight

Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one.

The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).

Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.

In [14]:
LR7 = LogisticRegression(class_weight='balanced',multi_class='auto',solver='lbfgs')
model_evaluate(LR7)

********** Train Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.99      0.99       305
  Product Availability       0.88      0.97      0.93       326
    Product Comparison       0.97      0.94      0.95       638
Product Specifications       0.95      0.93      0.94       653
     Returns & Refunds       1.00      1.00      1.00       500
      Sales/Promotions       0.98      0.99      0.99       238
              Shipping       1.00      0.99      1.00       592

              accuracy                           0.97      3252
             macro avg       0.97      0.97      0.97      3252
          weighted avg       0.97      0.97      0.97      3252

********** Valid Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.99      0.99        76
  Product Availability       0.82      0.91      0.86        82
    Product Comparison       0.91   

In [15]:
# solver = 'saga'
LR8 = LogisticRegression(class_weight='balanced',
                         multi_class='auto',
                         solver='saga',
                         penalty='elasticnet',
                         l1_ratio=0.5)
model_evaluate(LR8)

********** Train Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.99       305
  Product Availability       0.83      0.97      0.90       326
    Product Comparison       0.95      0.91      0.93       638
Product Specifications       0.94      0.91      0.92       653
     Returns & Refunds       1.00      1.00      1.00       500
      Sales/Promotions       0.98      0.98      0.98       238
              Shipping       0.99      0.99      0.99       592

              accuracy                           0.96      3252
             macro avg       0.96      0.96      0.96      3252
          weighted avg       0.96      0.96      0.96      3252

********** Valid Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      0.97      0.99        76
  Product Availability       0.76      0.90      0.83        82
    Product Comparison       0.92   

In [16]:
# LogisticRegressionCV
# Logistic regression with built-in cross validation.
LR9 = LogisticRegressionCV(multi_class='auto',solver='liblinear',penalty='l1')
model_evaluate(LR9)



********** Train Set **********
                        precision    recall  f1-score   support

           Omnichannel       1.00      1.00      1.00       305
  Product Availability       1.00      0.99      1.00       326
    Product Comparison       1.00      1.00      1.00       638
Product Specifications       1.00      1.00      1.00       653
     Returns & Refunds       1.00      1.00      1.00       500
      Sales/Promotions       1.00      1.00      1.00       238
              Shipping       1.00      1.00      1.00       592

              accuracy                           1.00      3252
             macro avg       1.00      1.00      1.00      3252
          weighted avg       1.00      1.00      1.00      3252

********** Valid Set **********
                        precision    recall  f1-score   support

           Omnichannel       0.99      1.00      0.99        76
  Product Availability       0.88      0.83      0.86        82
    Product Comparison       0.92   