In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("shuffled-full-set-hashed.csv", header = None)
df.columns = ['Category', 'Content']
df.shape

(62204, 2)

In [3]:
df['Category'].value_counts()

BILL                       18968
POLICY CHANGE              10627
CANCELLATION NOTICE         9731
BINDER                      8973
DELETION OF INTEREST        4826
REINSTATEMENT NOTICE        4368
DECLARATION                  968
CHANGE ENDORSEMENT           889
RETURNED CHECK               749
EXPIRATION NOTICE            734
NON-RENEWAL NOTICE           624
BILL BINDER                  289
APPLICATION                  229
INTENT TO CANCEL NOTICE      229
Name: Category, dtype: int64

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(sublinear_tf=True, max_df = 0.8, min_df = 0.01, norm='l2', ngram_range=(1, 2))

features = tfidf.fit_transform(df['Content'].values.astype('U')).toarray()
labels = df['Category']
features.shape

In [6]:
N = 2
MAX = 10
i = 0
for Product, category_id in sorted(labels.items()):
    if i > MAX:
        break
    i = i + 1
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# '0':
  . Most correlated unigrams:
. 6b223a390d86
. d774c0d219f8
  . Most correlated bigrams:
. ba02159e05b1 892d541c89eb
. 6b223a390d86 6ce6cc5a3203
# '1':
  . Most correlated unigrams:
. 296e1c47e3cc
. 4bfcec3d7413
  . Most correlated bigrams:
. 75199e110c91 ff1c26ea0b6f
. eb562127f33e c0ed51271129
# '2':
  . Most correlated unigrams:
. d19b1c129f40
. 641356219cbc
  . Most correlated bigrams:
. f95d0bea231b 21e314d3afcc
. b9699ce57810 641356219cbc
# '3':
  . Most correlated unigrams:
. d19b1c129f40
. 641356219cbc
  . Most correlated bigrams:
. f95d0bea231b 21e314d3afcc
. b9699ce57810 641356219cbc
# '4':
  . Most correlated unigrams:
. d19b1c129f40
. 641356219cbc
  . Most correlated bigrams:
. f95d0bea231b 21e314d3afcc
. b9699ce57810 641356219cbc
# '5':
  . Most correlated unigrams:
. 73d8f0e46834
. 557ec6c63cf9
  . Most correlated bigrams:
. 8754554be158 d38820625542
. 69e10bcd0d9a c36b062de326
# '6':
  . Most correlated unigrams:
. 73d8f0e46834
. 557ec6c63cf9
  . Most correlated b

In [None]:
models = [
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
        
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])




In [None]:
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()