# Text Classificaton using Machine Learning

## Imports

In [21]:
import pandas as pd
from pandas import Series, DataFrame
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import sklearn
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# Data-Set Prep

In [48]:
df_requests = pd.read_excel('frc_data.xlsx', sheet_name='frc data')

In [89]:
df_requests['Clean Description'] = df_requests['Request Description'].str.replace('\n', '')

### Condense LOB Categories

In [91]:
monoline_list = ['Equipment Breakdown', 'Inland Marine', 'International', 'Ocean Marine']
package_general_liability_list = ['Connect CNP', 'General Liability', 'Multiline']
package_property_list = ['Paramount Package', 'Property']
condensed_lob_list = []
for row in df_requests['LOB']:
    if row in monoline_list:
        condensed_lob_list.append('Monoline')
    elif row in package_general_liability_list:
        condensed_lob_list.append('Package General Liability')
    elif row in package_property_list:
        condensed_lob_list.append('Package Property')
    else:
        condensed_lob_list.append(row)
        
df_requests['Updated LOB'] = condensed_lob_list

In [92]:
df_requests['Updated LOB'].value_counts()

Package General Liability    2407
Auto                         1948
Workers Comp                 1665
Package Property             1438
Umbrella                      878
Not LOB Specific              561
Monoline                      416
Name: Updated LOB, dtype: int64

### Reduce Keywords

In [94]:
key_word_list = ['Rate Change',
'Form Review',
'Endorsement Print',
'Premium Discrepancy',
'CAT',
'Endorsement Process',
'Experience Modification',
'Billing',
'System Issue']
condensed_list = []
for row in df_requests['Key Word Roll-Up']:
    if row in key_word_list:
        condensed_list.append(row)
    else: 
        condensed_list.append('Other')
        
df_requests['key_words'] = condensed_list

## Logistic Regression

In [79]:
import seaborn as sb
from pylab import rcParams
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics 
from sklearn.metrics import classification_report

In [80]:
%matplotlib inline
rcParams['figure.figsize'] = 10, 8
sb.set_style('whitegrid')
df_requests = pd.read_excel('frc_data.xlsx', sheet_name='frc data')

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_requests['Request Description'])
X.shape

(9313, 14307)

In [81]:
action = pd.get_dummies(df_requests['Action Needed'],drop_first=True)
action.Y.value_counts()

X_train, X_test, y_train, y_test = train_test_split(X, action.Y, random_state=0)
X_train.shape

(6984, 14307)

In [82]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [83]:
classifier.score(X_test, y_test)
y_pred = classifier.predict(X_test)

confusion_matrix(y_pred, y_test)

array([[1680,  266],
       [ 163,  220]])

In [84]:
classifier.score(X_test, y_test)

0.8158007728638901

In [85]:
from collections import Counter
Counter(y_test)

Counter({0: 1843, 1: 486})

In [86]:
1843 + 486

2329

In [87]:
1843 / 2329

0.7913267496779733

# Keras Text Classification - LOB

In [39]:
train_size = int(len(df_requests) * .8)
train_posts = df_requests['Clean Description'][:train_size]
train_tags = df_requests['Updated LOB'][:train_size]
test_posts = df_requests['Clean Description'][train_size:]
test_tags = df_requests['Updated LOB'][train_size:]

vocab_size = 1000
tokenize = text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(train_posts)

x_train = tokenize.texts_to_matrix(train_posts)

encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

model = Sequential()

model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))

model.add(Dense(y_train.shape[1]))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

history = model.fit(x_train, y_train, 
                    batch_size=100, 
                    epochs=6, 
                    verbose=1, 
                    validation_split=0.1)

Train on 6705 samples, validate on 745 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [40]:
predictions = model.predict(x_train)
np.argmax(predictions[0])
predicted_labels = [np.argmax(x) for x in predictions]
y_labels = [np.argmax(x) for x in y_train]
confusion_matrix(y_labels, predicted_labels)

array([[1522,    2,    7,   47,   20,    6,   26],
       [   8,  246,    3,   19,   15,    3,    3],
       [  23,    3,  400,   66,   24,    9,   16],
       [  41,    2,   15, 1654,   72,   15,   36],
       [  29,    4,    7,   61, 1063,    4,    7],
       [  18,    2,    4,   36,   10,  566,   13],
       [  22,    2,    5,   32,   15,   13, 1234]])

# SKLearn Count Vectorizer N Grams - LOB

### Imports

In [41]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/ZGS/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
vectorizer = CountVectorizer(stop_words=sw,ngram_range=(1,2))
train_posts = vectorizer.fit_transform(df_requests['Clean Description'].values.astype('U'))
train_posts.shape
#x = v.fit_transform(df['Review'].values.astype('U'))

(9313, 163488)

In [43]:
train_size = int(len(df_requests) * .8)
train_posts = df_requests['Clean Description']#[:train_size]
train_tags = df_requests['Updated LOB']#[:train_size]
test_posts = df_requests['Clean Description']#[train_size:]
test_tags = df_requests['Updated LOB']#[train_size:]
train_tags.shape

(9313,)

In [44]:
vocab_size = 4000
tokenize = text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(train_posts)

x_train = tokenize.texts_to_matrix(train_posts)
#print(x_train.shape)

print(x_train[0])
encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)
#y_train.shape

model = Sequential()
model.add(Dense(256, input_shape=(x_train.shape[1],)))
#model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(y_train.shape[1]))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

history = model.fit(x_train, y_train, 
                    batch_size=100, 
                    epochs=4, 
                    verbose=1, 
                    validation_split=0.1)

[0. 0. 0. ... 0. 0. 0.]
Train on 8381 samples, validate on 932 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


# Facebook FastText Classification

In [None]:
__label__Auto Today I watched the Commercial Auto EPC Training Webinar 5, Other Transactions/Auto Tips & Tricks.  The trainer mentioned that the NBRI document has been updated with the separation of the Comprehensive and Collision losses which I can see.When setting up an account today for new business rating (effective 08/01/18), I noticed that the Loss Summary tab on the NBRI doesnot require the following information:1.  Separate entry of lossexpenses2.  Each loss date3.  Carrier Name4.  Policy NumberAre these requirements obsolete for EPC Automobile Experience Rating?  Please advise.  Thanks.
__label__Package-General-Liability Do we have a version of the sexual abuse and molestation questionnaire that is not catered to education risks? I have a copy of the EDU version but don't want to confuse my insured.Thanks,
__label__Package-General-Liability ​On a GL Composite Rated policy (Paramount Package), the user added forms CG 20 37 and CNA-74745 which was supposed to generate a 5% premium but did not.  The policy was already composite rated so will he have to remove the composite rate then add the form?
__label__Workers-Comp ​Please review this one, why did we add the officers back on? Are their errors to correct?
__label__Workers-Comp ​RE factors not available for WC policies - due to Prior Carrier data issue.  Please report Ticket to Help Desk.  Can you look into this, please?  Thanks. 
__label__Not-LOB-Specific ​how do I update my UW remarks?
__label__Auto ​I've converted policy once this am with one composite rate regardless of state or type.  Rated fine – no errors UW has ask that I composite rate per vehicle type and all the states carry the same rate (per type).I've created and assigned groups but now I'm getting the following error:  Aug 21, 20188:58 AMErrors during initial rate of 6072603997 - Submission​The following errors were found when rating the 6072603997 - Submission:Policy has a composite rate that does not allow Michigan to rate both PIP and coverage. Evaluate the composite rate or create a group composite for all states separating Michigan (EBR209171).​ Any suggestions?
__label__Package-General-Liability ​Please expedite so we can quote.
Zacharys-iMac:fastText-0.1.0 ZGS$ clear

Zacharys-iMac:fastText-0.1.0 ZGS$ wc combined_text.txt
    9313  513357 3109561 combined_text.txt
Zacharys-iMac:fastText-0.1.0 ZGS$ head -7313 combined_text.txt > data.train
Zacharys-iMac:fastText-0.1.0 ZGS$ tail -2000 combined_text.txt > data.valid
Zacharys-iMac:fastText-0.1.0 ZGS$ ./fasttext supervised -input data.train -output model_project -lr 1.0 -epoch 50
Read 0M words
Number of words:  38584
Number of labels: 7
Progress: 100.0%  words/sec/thread: 2776697  lr: 0.000000  loss: 0.179360  eta: 0h0m -14m 
Zacharys-iMac:fastText-0.1.0 ZGS$ ./fasttext test model_project.bin data.valid 
N	2000
P@1	0.585
R@1	0.585
Number of examples: 2000
Zacharys-iMac:fastText-0.1.0 ZGS$ ./fasttext supervised -input data.train -output model_project -lr 1.0 -epoch 50 -wordNgrams 2
Read 0M words
Number of words:  38584
Number of labels: 7
Progress: 100.0%  words/sec/thread: 1403889  lr: 0.000000  loss: 0.062241  eta: 0h0m 
Zacharys-iMac:fastText-0.1.0 ZGS$ ./fasttext test model_project.bin data.valid 
N	2000
P@1	0.577
R@1	0.577
Number of examples: 2000
Zacharys-iMac:fastText-0.1.0 ZGS$ 

# Keras Text Classification - Keywords

In [None]:
train_size = int(len(df_requests) * .8)
train_posts = df_requests['Clean Description'][:train_size]
train_tags = df_requests['key_words'][:train_size]
test_posts = df_requests['Clean Description'][train_size:]
test_tags = df_requests['key_words'][train_size:]

vocab_size = 1000
tokenize = text.Tokenizer(num_words=vocab_size)
tokenize.fit_on_texts(train_posts)

x_train = tokenize.texts_to_matrix(train_posts)

encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

model = Sequential()

model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))

model.add(Dense(y_train.shape[1]))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

history = model.fit(x_train, y_train, 
                    batch_size=100, 
                    epochs=6, 
                    verbose=1, 
                    validation_split=0.1)

# Clustering Exploration

### Imports

In [66]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning, module='.*/IPython/.*')
warnings.filterwarnings('ignore', category=DeprecationWarning, module='pyLDAvis')

import pyLDAvis
import pyLDAvis.sklearn

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

pyLDAvis.enable_notebook()

#df_re = pd.read_excel('frc_data.xlsx', sheet_name='frc data')

text = df_requests['Clean Description'].values.tolist()

In [67]:
max_features = 1000
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=max_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(text)

In [68]:
n_topics = 6

lda_model = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                      learning_method='online',
                                      learning_offset=50.,
                                      random_state=123)

lda_model.fit(tf)
pyLDAvis.sklearn.prepare(lda_model,tf, tf_vectorizer, R=20)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [69]:
def get_top_words(model, feature_names, n_top_words):
    top_words = {}
    for topic_idx, topic in enumerate(model.components_):
        _top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        top_words[str(topic_idx)] = _top_words
    return(top_words)

In [70]:
## get the token to topic matrix
word_topic = np.zeros((max_features,n_topics),)
print(n_topics)
lda_model.components_
for topic_idx, topic in enumerate(lda_model.components_):
    word_topic[:,topic_idx] = topic

print("token-topic matrix",word_topic.shape)

## create a matrix of the top words used to define each topic
top_words = 15
tf_feature_names = np.array(tf_vectorizer.get_feature_names())
top_words = get_top_words(lda_model,tf_feature_names,top_words)
all_top_words = np.array(list(set().union(*[v for v in top_words.values()])))

for key,vals in top_words.items():
    print(key," ".join(vals))
print("total words: %s"%len(all_top_words))

top_word_inds = [np.where(tf_feature_names == tw)[0][0] for tw in all_top_words]

6
token-topic matrix (1000, 6)
0 account cna thanks hi frc team thank new good agent help email need able morning
1 policy 18 epc 2018 term renewal account loss 17 new date effective rst number policies
2 endorsement policy insured agent attached state location advise coverage wc processed endorsements number request named
3 form coverage policy add insured auto forms question need does use know thanks hi provide
4 rate premium change showing auto year property renewal rst expiring 000 account rating class mod
5 policy help gl issue error need rst rating know quote let account field center thanks
total words: 67
