# Training Classifiers
### Procedures:
- Add your collected dataset to variable [your_org]_dataset
- Split your dataset by article in to tuples of (article, 'news_org') and append to docs
- Tokenize words, then append them to all_words
- In the Visualization cell, add your news org to the classes list

In [12]:
import nltk
import random 
import numpy
import scipy
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

#############################################
# ADD YOUR DATASET HERE
cnn_dataset = open('./Datasets/clean_cnn_dataset', 'r').read()
bbc_dataset = open('./Datasets/clean_bbc_dataset', 'r').read()
nyt_dataset = open('./Datasets/clean_nyt_dataset', 'r').read()
fox_dataset = open('./Datasets/clean_fox_dataset', 'r').read()
#alj_dataset = open('./Datasets/aljDataset', 'r').read()

#############################################

#def clean_dataset(dataset):
#    cleanset = []
#    for word in dataset:
#        if word not in stopwords:
#            cleanset.append(word)
#    return cleanset

#cnn_cleanset = clean_dataset(cnn_dataset)
#bbc_cleanset = clean_dataset(bbc_dataset)

docs = []
all_words = []

#################################################
# APPEND TUPLES TO DOC
for r in cnn_dataset.split('\n'):
    docs.append((r, 'cnn'))
    
for r in bbc_dataset.split('\n'):
    docs.append((r, 'bbc'))
    
for r in nyt_dataset.split('\n'):
    docs.append((r, 'nyt'))
    
for r in fox_dataset.split('\n'):
    docs.append((r, 'fox'))
    
#for r in alj_dataset.split('\n'):
 #   docs.append((r, 'alj'))
    


# TOKENIZE    
cnn_words = nltk.word_tokenize(cnn_dataset)
bbc_words = nltk.word_tokenize(bbc_dataset)
nyt_words = nltk.word_tokenize(nyt_dataset)
fox_words = nltk.word_tokenize(fox_dataset)
#alj_words = nltk.word_tokenize(alj_dataset)

# APPEND LOWERCASE WORDS TO ALL WORDS
for w in cnn_words:
    all_words.append(w.lower())
    
for w in bbc_words:
    all_words.append(w.lower())

for w in nyt_words:
    all_words.append(w.lower())
    
for w in fox_words:
    all_words.append(w.lower())
    
#for w in alj_words:
#    all_words.append(w.lower())
    
###################################################    
    
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())

def find_features(doc):
    words = nltk.word_tokenize(doc)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

featuresets = [(find_features(art), org) for (art, org) in docs]
random.shuffle(featuresets)

# CHANGE SIZE OF TRAINING AND TESTING SETS HOW YOU SEE FIT
print('Length of featuresets:', len(featuresets))
train_set, test_set = featuresets[200:], featuresets[:200]

Length of featuresets: 1021


## Naive Bayes Classifier

In [13]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("NB Acc:", nltk.classify.accuracy(classifier, test_set) * 100)
classifier.show_most_informative_features(25)

NB Acc: 65.5
Most Informative Features
                     cnn = True              cnn : bbc    =    128.8 : 1.0
                       — = True              nyt : bbc    =    123.8 : 1.0
                       t = True              nyt : bbc    =    108.6 : 1.0
                 unfolds = True              cnn : nyt    =     88.2 : 1.0
                     mr. = True              nyt : bbc    =     86.5 : 1.0
                       ” = True              nyt : bbc    =     85.7 : 1.0
                       . = False             fox : nyt    =     83.0 : 1.0
                      et = True              cnn : fox    =     80.3 : 1.0
                       - = True              bbc : nyt    =     76.1 : 1.0
                 updated = True              cnn : fox    =     74.5 : 1.0
                      re = True              nyt : bbc    =     64.9 : 1.0
                       “ = True              nyt : bbc    =     61.2 : 1.0
                     don = True              nyt : bbc    =  

## Stochastic Gradient Descent Classifier

In [14]:
SGD_classifier = SklearnClassifier(SGDClassifier())
SGD_classifier.train(train_set)
print("SGD Acc:", nltk.classify.accuracy(SGD_classifier, test_set) * 100)


SGD Acc: 95.5


## Visualization

In [17]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools
import numpy as np

#print('Size of test_set', len(test_set))
#for test in test_set:
#    if(test[1] == 'fox'):
#        print(test[0])
        
ref = [org for (art, org) in test_set]
test = [SGD_classifier.classify(art) for (art, org) in test_set]

cm = nltk.ConfusionMatrix(ref, test)
print(cm.pretty_format(sort_by_count=True, show_percents=True, truncate=9))


'''
sklcm = confusion_matrix(ref, test)
print(sklcm)
np.set_printoptions(precision=2)

def plot_cm(cm, classes, title, cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    plt.tight_layout()
    plt.ylabel('True Class')
    plt.xlabel('Predicted Class')
    
plt.figure()
plot_cm(sklcm, classes=['bbc', 'nyt', 'cnn'], title='SGD Classifier')
plt.show()
'''

    |      b      n      f      c |
    |      b      y      o      n |
    |      c      t      x      n |
----+-----------------------------+
bbc | <32.5%>     .   0.5%   0.5% |
nyt |      . <28.0%>  1.0%      . |
fox |      .   2.0% <23.0%>  0.5% |
cnn |      .      .      . <12.0%>|
----+-----------------------------+
(row = reference; col = test)



"\nsklcm = confusion_matrix(ref, test)\nprint(sklcm)\nnp.set_printoptions(precision=2)\n\ndef plot_cm(cm, classes, title, cmap=plt.cm.Blues):\n    plt.imshow(cm, interpolation='nearest', cmap=cmap)\n    plt.title(title)\n    plt.colorbar()\n    tick_marks = np.arange(len(classes))\n    plt.xticks(tick_marks, classes, rotation=45)\n    plt.yticks(tick_marks, classes)\n    plt.tight_layout()\n    plt.ylabel('True Class')\n    plt.xlabel('Predicted Class')\n    \nplt.figure()\nplot_cm(sklcm, classes=['bbc', 'nyt', 'cnn'], title='SGD Classifier')\nplt.show()\n"

## Save Classifiers

In [16]:
# Save Classifiers
import pickle

pickle.dump(classifier, open('./Classifiers/classifier.pickle', 'wb'))
pickle.dump(SGD_classifier, open('./Classifiers/sgd_classifier.pickle', 'wb'))