In [1]:
from sklearn import model_selection, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC

import pandas as pd
import numpy as np

import warnings 
warnings.filterwarnings(action = 'ignore') 

In [2]:
# Load the pre-cleaned train and test splits; both CSV files share one
# filename prefix and differ only in their suffix.
input_prefix = '20newsgroups_pols_cleaned_nh'

df_train, df_test = (
    pd.read_csv(input_prefix + suffix) for suffix in ('_train.csv', '_test.csv')
)

In [3]:
# Rebuild per-document token lists by splitting each text on single spaces
X_train = list(map(lambda text: text.split(' '), df_train.text))
X_test = list(map(lambda text: text.split(' '), df_test.text))

In [4]:
# Split off the target (label) column of each data set
y_train, y_test = df_train['target'], df_test['target']

In [5]:
# Set up TFIDF vectorizing model over word n-grams (unigrams up to 6-grams).
# The token pattern keeps every run of one or more word characters, so
# single-character tokens are retained.
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,6))

# Learn vocabulary and IDF weights from the training text and vectorize it in
# a single pass. fit_transform() is equivalent to fit() followed by
# transform(), but the training corpus is only tokenized once.
X_train_tfidf_ngram = tfidf_vect_ngram.fit_transform(df_train['text'])

# The test text is only transformed, using the vocabulary learned above
X_test_tfidf_ngram = tfidf_vect_ngram.transform(df_test['text'])

# Get array of feature names (n-grams) from TFIDF Vectorizer, indexable by
# column position. get_feature_names() was removed in scikit-learn 1.2, so
# prefer get_feature_names_out() when the installed version provides it.
if hasattr(tfidf_vect_ngram, 'get_feature_names_out'):
    tfidf_feat_names = np.asarray(tfidf_vect_ngram.get_feature_names_out())
else:
    tfidf_feat_names = np.array(tfidf_vect_ngram.get_feature_names())

In [6]:
# Instantiate Logistic Regression classifier with scikit-learn defaults
lr_clf_1 = linear_model.LogisticRegression()

# Fit on the TFIDF training matrix, then predict the test documents
lr_model_1 = lr_clf_1.fit(X_train_tfidf_ngram, y_train)
lr_preds_1 = lr_model_1.predict(X_test_tfidf_ngram)

# accuracy_score takes (y_true, y_pred); same order as confusion_matrix below
lr_accuracy_1 = metrics.accuracy_score(y_test, lr_preds_1)

# Label names the features this model was actually trained on (TFIDF)
print('LR, TFIDF Vectors: ', lr_accuracy_1)
# Confusion matrix: rows are true classes, columns are predicted classes
metrics.confusion_matrix(y_test, lr_preds_1)

LR Mean Vectors:  0.8324365872405841


array([[348,   2,  12,   2],
       [ 10, 359,   7,   0],
       [106,  10, 192,   2],
       [ 34,  14,  19, 184]], dtype=int64)

In [7]:
# Instantiate linear support vector classifier with scikit-learn defaults
svc_clf_1 = LinearSVC()

# Fit on the TFIDF training matrix, then predict the test documents
svc_model_1 = svc_clf_1.fit(X_train_tfidf_ngram, y_train)
svc_preds_1 = svc_model_1.predict(X_test_tfidf_ngram)

# accuracy_score takes (y_true, y_pred); same order as confusion_matrix below
svc_accuracy_1 = metrics.accuracy_score(y_test, svc_preds_1)

# Label names the features this model was actually trained on (TFIDF)
print('SVC, TFIDF Vectors: ', svc_accuracy_1)
# Confusion matrix: rows are true classes, columns are predicted classes
metrics.confusion_matrix(y_test, svc_preds_1)

SVC Mean Vectors:  0.8554957724827056


array([[340,   1,  16,   7],
       [ 10, 344,  18,   4],
       [ 85,   4, 213,   8],
       [ 15,   4,  16, 216]], dtype=int64)

In [8]:
# Instantiate Multinomial Naive Bayes classifier with scikit-learn defaults
nb_clf_1 = naive_bayes.MultinomialNB()

# Fit on the TFIDF training matrix, then predict the test documents
nb_model_1 = nb_clf_1.fit(X_train_tfidf_ngram, y_train)
nb_preds_1 = nb_model_1.predict(X_test_tfidf_ngram)

# accuracy_score takes (y_true, y_pred); same order as confusion_matrix below
nb_accuracy_1 = metrics.accuracy_score(y_test, nb_preds_1)

# This model uses the TFIDF features, not the count vectors
print('NB, TFIDF Vectors: ', nb_accuracy_1)
# Confusion matrix: rows are true classes, columns are predicted classes
metrics.confusion_matrix(y_test, nb_preds_1)

NB, Count Vectors:  0.7893927747886241


array([[354,   7,   3,   0],
       [  8, 368,   0,   0],
       [113,  17, 180,   0],
       [ 67,  51,   8, 125]], dtype=int64)

In [9]:
# For every category, show the ten TFIDF features carrying the largest
# coefficients in the SVC model (row i of coef_ is used for category i).
svc_category_headers = [
    'Most important words for category 0 (guns)',
    'Most important words for category 1 (mideast)',
    'Most important words for category 2 (misc politics)',
    'Most important words for category 3 (religion)',
]

for cat_idx, header in enumerate(svc_category_headers):
    print(header)
    # Negating the weights makes argsort return the largest coefficients first
    top_idx = np.argsort(-svc_model_1.coef_[cat_idx,:])[0:10]
    if cat_idx < len(svc_category_headers) - 1:
        # Every category but the last is followed by an extra blank line
        print(tfidf_feat_names[top_idx], '\n')
    else:
        print(tfidf_feat_names[top_idx])

Most important words for category 0 (guns)
['gun' 'weapon' 'firearm' 'fire' 'handgun' 'militia' 'fbi' 'batf'
 'criminal' 'nra'] 

Most important words for category 1 (mideast)
['israel' 'israeli' 'jews' 'turkish' 'armenians' 'armenian' 'arab'
 'turkey' 'armenia' 'jewish'] 

Most important words for category 2 (misc politics)
['tax' 'president' 'cramer' 'drug' 'clinton' 'optilink' 'kaldis' 'gay'
 'homosexual' 'job'] 

Most important words for category 3 (religion)
['god' 'jesus' 'christian' 'bible' 'objective' 'morality' 'critus'
 'sandvik' 'christians' 'religion']


In [10]:
# Create a count vectorizer object over word n-grams (unigrams up to 6-grams).
# The token pattern keeps every run of one or more word characters.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,6))

# Learn the vocabulary from the training text and vectorize it in a single
# pass. fit_transform() is equivalent to fit() followed by transform(), but
# the training corpus is only tokenized once.
X_train_count = count_vect.fit_transform(df_train['text'])

# The test text is only transformed, using the vocabulary learned above
X_test_count = count_vect.transform(df_test['text'])

# Get list of feature names (n-grams) from Count Vectorizer.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() when available; its array result is converted back
# to a list so count_feat_names keeps the type it had before.
if hasattr(count_vect, 'get_feature_names_out'):
    count_feat_names = count_vect.get_feature_names_out().tolist()
else:
    count_feat_names = count_vect.get_feature_names()

In [11]:
# Instantiate Logistic Regression classifier with scikit-learn defaults
lr_clf_2 = linear_model.LogisticRegression()

# Fit on the count-vector training matrix, then predict the test documents
lr_model_2 = lr_clf_2.fit(X_train_count, y_train)
lr_preds_2 = lr_model_2.predict(X_test_count)

# accuracy_score takes (y_true, y_pred); same order as confusion_matrix below
lr_accuracy_2 = metrics.accuracy_score(y_test, lr_preds_2)

# Label names the features this model was actually trained on (counts)
print('LR, Count Vectors: ', lr_accuracy_2)
# Confusion matrix: rows are true classes, columns are predicted classes
metrics.confusion_matrix(y_test, lr_preds_2)

LR Mean Vectors:  0.8124519600307456


array([[312,   7,  28,  17],
       [ 11, 329,  26,  10],
       [ 87,   4, 199,  20],
       [ 15,   4,  15, 217]], dtype=int64)

In [12]:
# Instantiate linear support vector classifier with scikit-learn defaults
svc_clf_2 = LinearSVC()

# Fit on the count-vector training matrix, then predict the test documents.
# NOTE(review): warnings are silenced at the top of the notebook, so a
# convergence warning from this fit would not be shown - confirm the solver
# converged on the unscaled count features.
svc_model_2 = svc_clf_2.fit(X_train_count, y_train)
svc_preds_2 = svc_model_2.predict(X_test_count)

# accuracy_score takes (y_true, y_pred); same order as confusion_matrix below
svc_accuracy_2 = metrics.accuracy_score(y_test, svc_preds_2)

# Label names the features this model was actually trained on (counts)
print('SVC, Count Vectors: ', svc_accuracy_2)
# Confusion matrix: rows are true classes, columns are predicted classes
metrics.confusion_matrix(y_test, svc_preds_2)

SVC Mean Vectors:  0.7632590315142198


array([[287,  11,  30,  36],
       [ 16, 295,  37,  28],
       [ 71,   8, 194,  37],
       [ 12,   5,  17, 217]], dtype=int64)

In [13]:
# Instantiate Multinomial Naive Bayes classifier with scikit-learn defaults
nb_clf_2 = naive_bayes.MultinomialNB()

# Fit on the count-vector training matrix, then predict the test documents
nb_model_2 = nb_clf_2.fit(X_train_count, y_train)
nb_preds_2 = nb_model_2.predict(X_test_count)

# accuracy_score takes (y_true, y_pred); same order as confusion_matrix below
nb_accuracy_2 = metrics.accuracy_score(y_test, nb_preds_2)

print('NB, Count Vectors: ', nb_accuracy_2)
# Confusion matrix: rows are true classes, columns are predicted classes
metrics.confusion_matrix(y_test, nb_preds_2)

NB, Count Vectors:  0.8608762490392006


array([[337,   5,  16,   6],
       [  3, 361,  12,   0],
       [ 69,  15, 223,   3],
       [ 23,  12,  17, 199]], dtype=int64)

In [14]:
# Get a list of the most important words for each category using BOW and
# the Naive Bayes model.
#
# Words are looked up in count_feat_names, the vocabulary of the count
# vectorizer this model was trained on, rather than in the TFIDF vocabulary.
#
# feature_log_prob_ holds log P(feature | class), one row per class. For more
# than two classes it is the same array MultinomialNB used to expose as coef_,
# which was deprecated in scikit-learn 0.24 and later removed.
#
# NOTE(review): this ranks features by how frequent they are within a class,
# not by how well they separate classes, so very common words tend to top
# every list.
nb_category_headers = [
    'Most important words for category 0 (guns)',
    'Most important words for category 1 (mideast)',
    'Most important words for category 2 (misc politics)',
    'Most important words for category 3 (religion)',
]

for cat_idx, header in enumerate(nb_category_headers):
    print(header)
    # Negating the log-probabilities makes argsort return the largest first
    top_idx = np.argsort(-nb_model_2.feature_log_prob_[cat_idx,:])[0:10]
    # Build a small array from just the ten selected names; this works whether
    # count_feat_names is a list or an array and keeps the printed format
    top_words = np.array([count_feat_names[i] for i in top_idx])
    if cat_idx < len(nb_category_headers) - 1:
        # Every category but the last is followed by an extra blank line
        print(top_words, '\n')
    else:
        print(top_words)

Most important words for category 0 (guns)
['pron' 'gun' 'not' 'the' 'be' 'edu' 's' 'in' 'people' 'write'] 

Most important words for category 1 (mideast)
['pron' 'the' 'not' 'be' 'people' 'in' 'say' 'israel' 's' 'write'] 

Most important words for category 2 (misc politics)
['pron' 'be' 'not' 'the' 'in' 's' 'edu' 'people' 'write' 'q'] 

Most important words for category 3 (religion)
['pron' 'not' 'the' 'be' 'write' 'in' 'god' 'people' 's' 'edu']
