In [10]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pprint as pp
import numpy as np

In [2]:
# Fetch the newsgroups dataset using the loader's default arguments
emails = fetch_20newsgroups()

# List the newsgroup names that serve as classification targets
newsgroup_names = emails.target_names
pp.pprint(newsgroup_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [3]:
# Restrict the dataset to the two sports newsgroups
sports_categories = ['rec.sport.baseball', 'rec.sport.hockey']
emails = fetch_20newsgroups(categories=sports_categories)

# Confirm which target names remain after filtering
pp.pprint(emails.target_names)

['rec.sport.baseball', 'rec.sport.hockey']


In [4]:
# Preview one email from the data (the post at index 5)
sample_email = emails.data[5]
pp.pprint(sample_email)

('From: mmb@lamar.ColoState.EDU (Michael Burger)\n'
 'Subject: More TV Info\n'
 'Distribution: na\n'
 'Nntp-Posting-Host: lamar.acns.colostate.edu\n'
 'Organization: Colorado State University, Fort Collins, CO  80523\n'
 'Lines: 36\n'
 '\n'
 'United States Coverage:\n'
 'Sunday April 18\n'
 '  N.J./N.Y.I. at Pittsburgh - 1:00 EDT to Eastern Time Zone\n'
 '  ABC - Gary Thorne and Bill Clement\n'
 '\n'
 '  St. Louis at Chicago - 12:00 CDT and 11:00 MDT - to Central/Mountain '
 'Zones\n'
 '  ABC - Mike Emerick and Jim Schoenfeld\n'
 '\n'
 '  Los Angeles at Calgary - 12:00 PDT and 11:00 ADT - to Pacific/Alaskan '
 'Zones\n'
 '  ABC - Al Michaels and John Davidson\n'
 '\n'
 'Tuesday, April 20\n'
 '  N.J./N.Y.I. at Pittsburgh - 7:30 EDT Nationwide\n'
 '  ESPN - Gary Thorne and Bill Clement\n'
 '\n'
 'Thursday, April 22 and Saturday April 24\n'
 '  To Be Announced - 7:30 EDT Nationwide\n'
 '  ESPN - To Be Announced\n'
 '\n'
 '\n'
 'Canadian Coverage:\n'
 '\n'
 'Sunday, April 18\n'
 '  Buffalo

In [5]:
# Show the numeric target stored for the email at index 5,
# followed by the list of target names for reference
sample_target = emails.target[5]
pp.pprint(sample_target)
pp.pprint(emails.target_names)

1
['rec.sport.baseball', 'rec.sport.hockey']


In [6]:
def email_tester(categories, fit_on_test=True, random_state=108):
    """Train a Multinomial Naive Bayes classifier on newsgroup posts and score it.

    The train and test subsets for ``categories`` are loaded with
    ``fetch_20newsgroups``, converted to word-count vectors with a
    ``CountVectorizer``, and a ``MultinomialNB`` model fitted on the training
    counts is scored on the test counts.

    Parameters
    ----------
    categories : list of str
        Newsgroup names forwarded to ``fetch_20newsgroups`` to select which
        posts are loaded.
    fit_on_test : bool, default True
        When True (the original behaviour), the vectorizer's vocabulary is
        built from the test and training posts together. Pass False to build
        it from the training posts only, which keeps the test split out of
        everything that is fitted.
    random_state : int, default 108
        Seed forwarded to both ``fetch_20newsgroups`` calls, which are made
        with ``shuffle=True``.

    Returns
    -------
    float
        The value of ``clf.score`` on the test subset (the model's accuracy).
    """
    # Load the train and test subsets for the requested categories
    train_emails = fetch_20newsgroups(subset='train', shuffle=True,
                                      random_state=random_state,
                                      categories=categories)
    test_emails = fetch_20newsgroups(subset='test', shuffle=True,
                                     random_state=random_state,
                                     categories=categories)

    # Build the word vocabulary used to turn each email into count features
    counter = CountVectorizer()
    if fit_on_test:
        # Original behaviour: vocabulary covers every post in both subsets.
        # NOTE(review): this lets test-split words into the fitted vocabulary;
        # call with fit_on_test=False for a leakage-free evaluation.
        counter.fit(test_emails.data + train_emails.data)
    else:
        counter.fit(train_emails.data)

    # Word counts for the training and test posts under the fitted vocabulary
    train_counts = counter.transform(train_emails.data)
    test_counts = counter.transform(test_emails.data)

    # Fit the Naive Bayes classifier on the training counts and labels
    clf = MultinomialNB()
    clf.fit(train_counts, train_emails.target)

    # Return (rather than print) the accuracy on the held-out test posts
    return clf.score(test_counts, test_emails.target)

In [7]:
# Score the classifier on the baseball vs. hockey pairing
categories = ['rec.sport.baseball', 'rec.sport.hockey']
email_tester(categories)

0.9723618090452262

In [8]:
# Score the classifier on the atheism vs. Middle East politics pairing
categories = ['alt.atheism', 'talk.politics.mideast']
email_tester(categories)

0.9482014388489208

In [9]:
# Let's find the most and least accurate pairings

# Reload the full dataset: `emails` was narrowed to two categories earlier,
# and every newsgroup name is needed here
emails = fetch_20newsgroups()

topics = emails.target_names

# Every unordered pair of distinct topics, generated in the same order as a
# nested index loop with i < j over `topics`
combinations = [
    [first, second]
    for position, first in enumerate(topics)
    for second in topics[position + 1:]
]

# Accuracy score for each pair, aligned index-for-index with `combinations`
accuracy_scores = [email_tester(combo) for combo in combinations]

In [14]:
# Locate the positions of the best and worst scores
scores_array = np.asarray(accuracy_scores)
max_index = scores_array.argmax()
min_index = scores_array.argmin()

# Report the topic pair found at each of those positions
print(f'{combinations[max_index]} has the highest accuracy score of {accuracy_scores[max_index]}')
print(f'{combinations[min_index]} has the lowest accuracy score of {accuracy_scores[min_index]}')

['comp.sys.ibm.pc.hardware', 'rec.sport.hockey'] has the highest accuracy score of 0.9974715549936789
['comp.os.ms-windows.misc', 'comp.sys.mac.hardware'] has the lowest accuracy score of 0.4980744544287548
