## Multiclass Text Classification (Natural Language Processing)

In [17]:
import pandas as pd ### import pandas
from sklearn.feature_extraction.text import CountVectorizer ### convert text into count vector
from sklearn.feature_extraction.text import TfidfTransformer ### convert count transform data into tfidf form
from nltk.stem import WordNetLemmatizer ### conversion of text into their lemmatize form
import string ### import string for getting punctuation marks
from nltk.corpus import stopwords ### stop word import

#### Create some training sentences to store in a DataFrame

In [18]:
# Training examples grouped by intent; flattened below into one
# {"sentence", "intent"} record per example, in the order listed.
data_dict = [
    {"sentence": sentence, "intent": intent}
    for intent, sentences in (
        ("booking", (
            "can you just confirm my ticket booking",
            "i want to book a ticket for delhi",
            "can you look my confirmed booked ticket",
        )),
        ("greeting", (
            "Hi!! how are you doing",
            "hope you are doing well",
            "Hello buddy",
        )),
        ("payment", (
            "how can i perform payment through debit card or credit card?",
            "Do you charge a separate fee for your gateway?",
            "What fees do you charge each month?",
        )),
    )
    for sentence in sentences
]

In [19]:
# Build a DataFrame with one row per training example; the columns come from the dict keys.
classified_df = pd.DataFrame(data_dict)

### check the initial rows of data frame by printing out classified_df

In [20]:
# Display the first rows (head() defaults to five) as a quick sanity check.
classified_df.head()

Unnamed: 0,intent,sentence
0,booking,can you just confirm my ticket booking
1,booking,i want to book a ticket for delhi
2,booking,can you look my confirmed booked ticket
3,greeting,Hi!! how are you doing
4,greeting,hope you are doing well


### Preprocess the training sentences to remove noise and make them uniform
#### 1. First, create a lemmatizer object for converting words into their root form
#### 2. In the function text_process, first remove punctuation characters and then join the remaining characters to form the sentence again
#### 3. Finally, split the sentence into words and remove English stopwords (compared in lower case) from the training sentences

In [23]:
# WordNet lemmatizer instance, intended for reducing words to their root form.
lmtzr = WordNetLemmatizer()

def text_process(mess, stop_words=None):
    """
    Tokenise a sentence for the bag-of-words vectorizer.

    Steps:
    1. Remove every character found in ``string.punctuation``.
    2. Split the remaining text on whitespace.
    3. Drop words whose lower-cased form is a stopword.

    Surviving words keep their original casing and are not lemmatized.

    Parameters
    ----------
    mess : str
        Raw sentence.
    stop_words : iterable of str, optional
        Lower-case words to discard. Defaults to NLTK's English
        stopword list (``stopwords.words('english')``).

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    if stop_words is None:
        # Fetch the list once per call instead of once per word.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership checks in the filter below.
    stop_words = frozenset(stop_words)

    # Strip punctuation character by character, then rebuild the string.
    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    # Stopwords are matched case-insensitively, but the word itself is returned unchanged.
    return [word for word in nopunc.split() if word.lower() not in stop_words]
    

In [34]:
# Bag-of-words vectorizer that delegates tokenisation to text_process.
bow_transformer = CountVectorizer(analyzer=text_process)
# Learn the vocabulary from the training sentences (fit works in place).
bow_transformer.fit(classified_df['sentence'])

# Number of distinct words in the learned vocabulary.
print(len(bow_transformer.vocabulary_))

25


#### Check the fitted vocabulary of the training data set by printing out the feature names

In [41]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back on older versions.
if hasattr(bow_transformer, "get_feature_names_out"):
    # tolist() turns the returned NumPy array into a plain list of str,
    # so the printed form matches the old method's output.
    feature_names = bow_transformer.get_feature_names_out().tolist()
else:
    feature_names = bow_transformer.get_feature_names()
print(feature_names)

['Hello', 'Hi', 'book', 'booked', 'booking', 'buddy', 'card', 'charge', 'confirm', 'confirmed', 'credit', 'debit', 'delhi', 'fee', 'fees', 'gateway', 'hope', 'look', 'month', 'payment', 'perform', 'separate', 'ticket', 'want', 'well']


#### Transform the training sentences into a sparse bag-of-words count matrix

In [44]:
# Encode every training sentence as a row of word counts over the fitted vocabulary.
sentence_bow = bow_transformer.transform(classified_df['sentence'])

#### Check the bag-of-words count matrix: 25 columns (vocabulary words) and 9 rows (training sentences)

In [45]:
# Convert the sparse count matrix to a dense array so it can be inspected.
sentence_bow.toarray()

array([[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0]])

In [48]:
# Fit the TF-IDF transformer on the bag-of-words counts of the training sentences.
tfidf_transformer = TfidfTransformer().fit(sentence_bow)

##### Below we check the shape (n_samples, n_features) of the TF-IDF matrix and the shape of the response classes

In [49]:
# Re-weight the word counts as TF-IDF values.
sentence_tfidf = tfidf_transformer.transform(sentence_bow)
# Shape is (number of sentences, number of vocabulary words).
print(sentence_tfidf.shape)

(9, 25)


In [55]:
# Shape of the label column, for comparison with the number of rows in the feature matrix.
classified_df['intent'].shape

(9,)

### Fitting of the model in different Algos
#### 1. MultinomialNB
#### 2. LogisticRegression
#### 3. LinearSVC

In [56]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Train three classifiers on the same TF-IDF features and intent labels:
#   detect_model  -> multinomial naive Bayes
#   detect_model2 -> logistic regression
#   detect_model3 -> linear support-vector classifier
# fit() returns the fitted estimator, so the generator yields the trained models in order.
detect_model, detect_model2, detect_model3 = (
    estimator.fit(sentence_tfidf, classified_df['intent'])
    for estimator in (MultinomialNB(), LogisticRegression(), LinearSVC())
)

#### Checked the score for LinearSVC() model for existing training data

In [63]:
# Score on the same data the model was fitted on, so this is not an estimate of performance on unseen sentences.
detect_model3.score(sentence_tfidf,classified_df['intent'])

1.0

### Now we need to perform a train/test split to check the accuracy

In [64]:
from sklearn.model_selection import train_test_split ### import train test split from model selection package

# Split the raw sentences and their intent labels into train and test parts.
# random_state pins the shuffle so that re-running the notebook reproduces
# the same split (and therefore the same downstream outputs).
X_train, X_test, y_train, y_test = train_test_split(
    classified_df['sentence'],
    classified_df['intent'],
    test_size=0.2,
    random_state=42,
)

# Sanity check: sizes of the two parts and their total.
print(len(X_train), len(X_test), len(X_train) + len(X_test))

7 2 9


In [65]:
# Transform the TRAINING sentences (using the fitted vocabulary) into a TF-IDF
# document-term matrix. New names are used for each stage so X_train keeps the
# raw sentences and this cell can be re-run safely.
X_train_bow = bow_transformer.transform(X_train)
X_train_tfidf = tfidf_transformer.transform(X_train_bow)

# NOTE: despite its name, X_test_dtm holds the training split, not the test
# split. The name is kept because a later cell predicts on it and compares the
# result with y_train.
X_test_dtm = X_train_tfidf
X_test_dtm

<7x25 sparse matrix of type '<class 'numpy.float64'>'
	with 22 stored elements in Compressed Sparse Row format>

### print out the confusion matrix

In [68]:
from sklearn.metrics import confusion_matrix

In [69]:
# Predict intents for the training split. X_test_dtm was built from X_train,
# so these predictions line up with y_train.
y_train_pred = detect_model3.predict(X_test_dtm)
print(y_train_pred)

# Confusion matrix of true versus predicted training labels.
cm_train = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
print(cm_train)

['payment' 'booking' 'payment' 'booking' 'greeting' 'booking' 'greeting']
[[3 0 0]
 [0 2 0]
 [0 0 2]]


#### Perform the predict operation on an input user query

In [70]:
# Example user query to run through the fitted bag-of-words vectorizer.
sentence4 ="can you just confirm my ticket booking"
print(sentence4)
# The query is wrapped in a one-element list so that transform() receives a collection of documents.
bow4 = bow_transformer.transform([sentence4])
# One row for the single query, one column per vocabulary word.
print(bow4.shape)

can you just confirm my ticket booking
(1, 25)


In [71]:

# Apply the fitted TF-IDF weighting to the query's count vector.
tfidf4 = tfidf_transformer.transform(bow4)
# Printing the sparse result lists a (row, column) index and value per stored entry.
print(tfidf4)

  (0, 22)	0.4608478760896585
  (0, 8)	0.62754252250491
  (0, 4)	0.62754252250491
