In [45]:
##Our import: 
import nltk
from nltk.probability import ConditionalFreqDist
import pandas as pd
import math
from nltk import word_tokenize
import numpy as np

from sklearn.feature_extraction import DictVectorizer

##Our defines: 
q1Verbose=1
q113_verbose=0

### Q1. Document Classification
#### Q1.1. Reuters Dataset

##### Q1.1.1 Turn the code of the Sklearn tutorial above into a notebook.

This code is taken from the out of core classification guide given in the assigmnent. 
http://scikit-learn.org/dev/auto_examples/applications/plot_out_of_core_classification.html#example-applications-plot-out-of-core-classification-py


In [1]:
# Authors: Eustache Diemert <eustache@diemert.fr>
#          @FedericoV <https://github.com/FedericoV/>
# License: BSD 3 clause

from __future__ import print_function

from glob import glob
import itertools
import os.path
import re
import tarfile
import time

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams

from sklearn.externals.six.moves import html_parser
from sklearn.externals.six.moves import urllib
from sklearn.datasets import get_data_home
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB




def _not_in_sphinx():
    # Hack to detect whether we are running by the sphinx builder
    return '__file__' in globals()

%matplotlib inline
###############################################################################
# Reuters Dataset related routines
###############################################################################


class ReutersParser(html_parser.HTMLParser):
    """Utility class to parse a SGML file and yield documents one at a time."""

    def __init__(self, encoding='latin-1'):
        html_parser.HTMLParser.__init__(self)
        self._reset()
        self.encoding = encoding

    def handle_starttag(self, tag, attrs):
        method = 'start_' + tag
        getattr(self, method, lambda x: None)(attrs)

    def handle_endtag(self, tag):
        method = 'end_' + tag
        getattr(self, method, lambda: None)()

    def _reset(self):
        self.in_title = 0
        self.in_body = 0
        self.in_topics = 0
        self.in_topic_d = 0
        self.title = ""
        self.body = ""
        self.topics = []
        self.topic_d = ""

    def parse(self, fd):
        self.docs = []
        for chunk in fd:
            self.feed(chunk.decode(self.encoding))
            for doc in self.docs:
                yield doc
            self.docs = []
        self.close()

    def handle_data(self, data):
        if self.in_body:
            self.body += data
        elif self.in_title:
            self.title += data
        elif self.in_topic_d:
            self.topic_d += data

    def start_reuters(self, attributes):
        pass

    def end_reuters(self):
        self.body = re.sub(r'\s+', r' ', self.body)
        self.docs.append({'title': self.title,
                          'body': self.body,
                          'topics': self.topics})
        self._reset()

    def start_title(self, attributes):
        self.in_title = 1

    def end_title(self):
        self.in_title = 0

    def start_body(self, attributes):
        self.in_body = 1

    def end_body(self):
        self.in_body = 0

    def start_topics(self, attributes):
        self.in_topics = 1

    def end_topics(self):
        self.in_topics = 0

    def start_d(self, attributes):
        self.in_topic_d = 1

    def end_d(self):
        self.in_topic_d = 0
        self.topics.append(self.topic_d)
        self.topic_d = ""


def stream_reuters_documents(data_path=None):
    """Iterate over documents of the Reuters dataset.

    The Reuters archive will automatically be downloaded and uncompressed if
    the `data_path` directory does not exist.

    Documents are represented as dictionaries with 'body' (str),
    'title' (str), 'topics' (list(str)) keys.

    """

    DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
                    'reuters21578-mld/reuters21578.tar.gz')
    ARCHIVE_FILENAME = 'reuters21578.tar.gz'

    if data_path is None:
        data_path = os.path.join(get_data_home(), "reuters")
    if not os.path.exists(data_path):
        """Download the dataset."""
        print("downloading dataset (once and for all) into %s" %
              data_path)
        os.mkdir(data_path)

        def progress(blocknum, bs, size):
            total_sz_mb = '%.2f MB' % (size / 1e6)
            current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
            if _not_in_sphinx():
                print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb),
                      end='')

        archive_path = os.path.join(data_path, ARCHIVE_FILENAME)
        urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
                                   reporthook=progress)
        if _not_in_sphinx():
            print('\r', end='')
        print("untarring Reuters dataset...")
        tarfile.open(archive_path, 'r:gz').extractall(data_path)
        print("done.")

    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc




In [None]:
###############################################################################
# Main
###############################################################################
# Create the vectorizer and limit the number of features to a reasonable
# maximum
vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,
                               non_negative=True)


# Iterator over parsed Reuters SGML files.
data_stream = stream_reuters_documents()

# We learn a binary classification between the "acq" class and all the others.
# "acq" was chosen as it is more or less evenly distributed in the Reuters
# files. For other datasets, one should take care of creating a test set with
# a realistic portion of positive instances.
all_classes = np.array([0, 1])
positive_class = 'acq'

# Here are some classifiers that support the `partial_fit` method
partial_fit_classifiers = {
    'SGD': SGDClassifier(),
    'Perceptron': Perceptron(),
    'NB Multinomial': MultinomialNB(alpha=0.01),
    'Passive-Aggressive': PassiveAggressiveClassifier(),
}


def get_minibatch(doc_iter, size, pos_class=positive_class):
    """Extract a minibatch of examples, return a tuple X_text, y.

    Note: size is before excluding invalid docs with no topics assigned.

    """
    data = [(u'{title}\n\n{body}'.format(**doc), pos_class in doc['topics'])
            for doc in itertools.islice(doc_iter, size)
            if doc['topics']]
    if not len(data):
        return np.asarray([], dtype=int), np.asarray([], dtype=int)
    X_text, y = zip(*data)
    return X_text, np.asarray(y, dtype=int)


def iter_minibatches(doc_iter, minibatch_size):
    """Generator of minibatches."""
    X_text, y = get_minibatch(doc_iter, minibatch_size)
    while len(X_text):
        yield X_text, y
        X_text, y = get_minibatch(doc_iter, minibatch_size)


# test data statistics
test_stats = {'n_test': 0, 'n_test_pos': 0}

# First we hold out a number of examples to estimate accuracy
n_test_documents = 1000
tick = time.time()
X_test_text, y_test = get_minibatch(data_stream, 1000)
parsing_time = time.time() - tick
tick = time.time()
X_test = vectorizer.transform(X_test_text)
vectorizing_time = time.time() - tick
test_stats['n_test'] += len(y_test)
test_stats['n_test_pos'] += sum(y_test)
print("Test set is %d documents (%d positive)" % (len(y_test), sum(y_test)))


def progress(cls_name, stats):
    """Report progress information, return a string."""
    duration = time.time() - stats['t0']
    s = "%20s classifier : \t" % cls_name
    s += "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats
    s += "%(n_test)6d test docs (%(n_test_pos)6d positive) " % test_stats
    s += "accuracy: %(accuracy).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
    return s


cls_stats = {}

for cls_name in partial_fit_classifiers:
    stats = {'n_train': 0, 'n_train_pos': 0,
             'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),
             'runtime_history': [(0, 0)], 'total_fit_time': 0.0}
    cls_stats[cls_name] = stats

get_minibatch(data_stream, n_test_documents)
# Discard test set

# We will feed the classifier with mini-batches of 1000 documents; this means
# we have at most 1000 docs in memory at any time.  The smaller the document
# batch, the bigger the relative overhead of the partial fit methods.
minibatch_size = 1000

# Create the data_stream that parses Reuters SGML files and iterates on
# documents as a stream.
minibatch_iterators = iter_minibatches(data_stream, minibatch_size)
total_vect_time = 0.0

# Main loop : iterate on mini-batchs of examples
for i, (X_train_text, y_train) in enumerate(minibatch_iterators):

    tick = time.time()
    X_train = vectorizer.transform(X_train_text)
    total_vect_time += time.time() - tick

    for cls_name, cls in partial_fit_classifiers.items():
        tick = time.time()
        # update estimator with examples in the current mini-batch
        cls.partial_fit(X_train, y_train, classes=all_classes)

        # accumulate test accuracy stats
        cls_stats[cls_name]['total_fit_time'] += time.time() - tick
        cls_stats[cls_name]['n_train'] += X_train.shape[0]
        cls_stats[cls_name]['n_train_pos'] += sum(y_train)
        tick = time.time()
        cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)
        cls_stats[cls_name]['prediction_time'] = time.time() - tick
        acc_history = (cls_stats[cls_name]['accuracy'],
                       cls_stats[cls_name]['n_train'])
        cls_stats[cls_name]['accuracy_history'].append(acc_history)
        run_history = (cls_stats[cls_name]['accuracy'],
                       total_vect_time + cls_stats[cls_name]['total_fit_time'])
        cls_stats[cls_name]['runtime_history'].append(run_history)

        if i % 3 == 0:
            print(progress(cls_name, cls_stats[cls_name]))
    if i % 3 == 0:
        print('\n')


###############################################################################
# Plot results
###############################################################################


def plot_accuracy(x, y, x_legend):
    """Plot accuracy as a function of x."""
    x = np.array(x)
    y = np.array(y)
    plt.title('Classification accuracy as a function of %s' % x_legend)
    plt.xlabel('%s' % x_legend)
    plt.ylabel('Accuracy')
    plt.grid(True)
    plt.plot(x, y)

rcParams['legend.fontsize'] = 10
cls_names = list(sorted(cls_stats.keys()))

# Plot accuracy evolution
plt.figure()
for _, stats in sorted(cls_stats.items()):
    # Plot accuracy evolution with #examples
    accuracy, n_examples = zip(*stats['accuracy_history'])
    plot_accuracy(n_examples, accuracy, "training examples (#)")
    ax = plt.gca()
    ax.set_ylim((0.8, 1))
plt.legend(cls_names, loc='best')

plt.figure()
for _, stats in sorted(cls_stats.items()):
    # Plot accuracy evolution with runtime
    accuracy, runtime = zip(*stats['runtime_history'])
    plot_accuracy(runtime, accuracy, 'runtime (s)')
    ax = plt.gca()
    ax.set_ylim((0.8, 1))
plt.legend(cls_names, loc='best')

# Plot fitting times
plt.figure()
fig = plt.gcf()
cls_runtime = []
for cls_name, stats in sorted(cls_stats.items()):
    cls_runtime.append(stats['total_fit_time'])

cls_runtime.append(total_vect_time)
cls_names.append('Vectorization')
bar_colors = rcParams['axes.color_cycle'][:len(cls_names)]

ax = plt.subplot(111)
rectangles = plt.bar(range(len(cls_names)), cls_runtime, width=0.5,
                     color=bar_colors)

ax.set_xticks(np.linspace(0.25, len(cls_names) - 0.75, len(cls_names)))
ax.set_xticklabels(cls_names, fontsize=10)
ymax = max(cls_runtime) * 1.2
ax.set_ylim((0, ymax))
ax.set_ylabel('runtime (s)')
ax.set_title('Training Times')


def autolabel(rectangles):
    """attach some text vi autolabel on rectangles."""
    for rect in rectangles:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2.,
                1.05 * height, '%.4f' % height,
                ha='center', va='bottom')

autolabel(rectangles)
plt.show()

# Plot prediction times
plt.figure()
#fig = plt.gcf()
cls_runtime = []
cls_names = list(sorted(cls_stats.keys()))
for cls_name, stats in sorted(cls_stats.items()):
    cls_runtime.append(stats['prediction_time'])
cls_runtime.append(parsing_time)
cls_names.append('Read/Parse\n+Feat.Extr.')
cls_runtime.append(vectorizing_time)
cls_names.append('Hashing\n+Vect.')
bar_colors = rcParams['axes.color_cycle'][:len(cls_names)]

ax = plt.subplot(111)
rectangles = plt.bar(range(len(cls_names)), cls_runtime, width=0.5,
                     color=bar_colors)

ax.set_xticks(np.linspace(0.25, len(cls_names) - 0.75, len(cls_names)))
ax.set_xticklabels(cls_names, fontsize=8)
plt.setp(plt.xticks()[1], rotation=30)
ymax = max(cls_runtime) * 1.2
ax.set_ylim((0, ymax))
ax.set_ylabel('runtime (s)')
ax.set_title('Prediction Times (%d instances)' % n_test_documents)
autolabel(rectangles)
plt.show()

##### Q1.1.2 Explore how many documents are in the dataset, how many categories, how many documents per categories, provide mean and standard deviation, min and max. (Hint: use the pandas library to explore the dataset, use the dataframe.describe() method.)


In [5]:
#From the code, accessing the reuters document data base. 
data_stream = stream_reuters_documents()
#Experimenting with functions given in hint. 
df = pd.DataFrame(data_stream)
print("The type of df is: ", type(df))
df

The type of df is:  <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,body,title,topics
0,Chrysler Corp said car sales for the March 21-...,CHRYSLER <C> LATE MARCH U.S. CAR SALES UP,[usa]
1,"Compaq Computer Corp, IBM's chief rival in the...",WALL STREET STOCKS/COMPAQ COMPUTER <CPQ>,[usa]
2,<Noranda Inc> said production will remain shut...,NORANDA SETS TEMPORARY MINE SHUTDOWN,"[copper, canada]"
3,The Canadian government's budget deficit rose ...,CANADA BUDGET DEFICIT RISES IN JANUARY,[canada]
4,CIS Technologies Inc said it executed a formal...,CIS TECHNOLOGIES<CIH> TO SELL SHARES TO SWISS CO,"[acq, usa, switzerland]"
5,Qtly div 42 cts vs 41.5 cts prior Payable APri...,COPLEY PROPERTIES INC <COP> INCREASES DIVIDEND,"[earn, usa]"
6,Colombia's cost of living index rose 2.71 pct ...,COLOMBIAN INFLATION STABLE AT AROUND 20 PCT,"[cpi, colombia]"
7,The Federal Home Loan Bank Board said home mor...,FHLBB SAYS MORTGAGE RATES CONTINUE DECLINE,"[interest, usa]"
8,The New York Stock Exchange said a seat on the...,"NYFE SEAT SELLS FOR 1,500 DLRS","[usa, nyse]"
9,,CANADIAN MONEY SUPPLY M-1 FALLS 291 MLN DLRS I...,"[money-supply, canada]"


#### How many documents in the dataset? 

In [46]:
df['title'].describe()['count']


21578

In [47]:
df['body']

0        Chrysler Corp said car sales for the March 21-...
1        Compaq Computer Corp, IBM's chief rival in the...
2        <Noranda Inc> said production will remain shut...
3        The Canadian government's budget deficit rose ...
4        CIS Technologies Inc said it executed a formal...
5        Qtly div 42 cts vs 41.5 cts prior Payable APri...
6        Colombia's cost of living index rose 2.71 pct ...
7        The Federal Home Loan Bank Board said home mor...
8        The New York Stock Exchange said a seat on the...
9                                                         
10       Beneficial Corp said the sale of its American ...
11       European options exchanges will see spectacula...
12       Tierco Group INc said it sold at par to the Ku...
13       Shr 5.56 dlrs vs 3.88 dlrs Net 47.5 mln vs 33....
14       California Micro Devices Corp said an addition...
15       Stewart INformation Services Corp said it resc...
16       FIserve Inc said 14 savings and loans with 1.5.

There are 21,578 as seen below. 

In [48]:
#Some exploring of dataset: 
if q1Verbose:
    print(list(df))
    print(df['body'].describe())
    # Since df['topics'].descibe() does not work, let's just print. 
    print(df['topics'])
    print(type(df['topics']))

['body', 'title', 'topics']
count     21578
unique    18763
top            
freq       2535
Name: body, dtype: object
0                                         [usa]
1                                         [usa]
2                              [copper, canada]
3                                      [canada]
4                       [acq, usa, switzerland]
5                                   [earn, usa]
6                               [cpi, colombia]
7                               [interest, usa]
8                                   [usa, nyse]
9                        [money-supply, canada]
10                                   [acq, usa]
11                     [netherlands, ase, cboe]
12                                        [usa]
13                               [earn, canada]
14                                        [usa]
15                                        [usa]
16                                        [usa]
17                                        [usa]
18                

#### Each document can belong to a few topics (we interperted topics for categories). We will feed all information into a FreqDist and receive statistics.

As implied, this means a document can appear in several documents. 

In [7]:
#Create a list of all occurences of all topics and feed to FreqDist. 
freq_dist = nltk.FreqDist(sum(list(df['topics']), []))

#### How many categories: 

In [8]:
category_set=set(sum(list(df['topics']), []))
num_of_categories = len(category_set)
print("The number of categories is: ",num_of_categories)

The number of categories is:  445


Since we are using a Frequency distribution, the number of categories can also be retrieved by: 


In [9]:
len(freq_dist)


445

#### How many documents per category: 
Since we are using a Frequency distribution, the number of documents per category 
is the value in freq_dist[category]. 

We can print a list of all categories and the number of documents in them. 

In [10]:
cat_numOfDocs = [(category, freq_dist[category]) for category in category_set]
for pair in cat_numOfDocs[:10]: 
    print('Category: ', pair[0], 'has ', pair[1], 'Docs')

Category:  sourrouille has  4 Docs
Category:  lin-oil has  2 Docs
Category:  takeshita has  4 Docs
Category:  petricioli has  5 Docs
Category:  haiti has  8 Docs
Category:  ipe has  2 Docs
Category:  housing has  21 Docs
Category:  oilseed has  192 Docs
Category:  money-supply has  190 Docs
Category:  ongpin has  25 Docs


####  Provide mean and standard deviation, min and max. 
Mean: Mean number of documents per categorie. 

In [12]:
#[num_of_docs for _, num_of_docs in cat_numOfDocs]

In [13]:
#Mean: 
#Sum of number of documents per each category. 
sum_docs_cat = sum(num_of_docs for (cat, num_of_docs) in cat_numOfDocs)
#
#Mean expected number of documents per categorie. 
#mean_exp=sum(freq_dist.freq(cat)*num_of_docs for (cat, num_of_docs) in cat_numOfDocs)
#mean2 = np.mean([num_of_docs for _, num_of_docs in cat_numOfDocs])
#print('The Mean number of documents per categorie is: ', mean_exp)
#print('The Mean2 number of documents per categorie is: ', mean2)
mean = sum_docs_cat/len(cat_numOfDocs)
print('The Mean (Average) number of documents per category is: ', mean)

#Max:
print('The category with maximum documents is: "',freq_dist.max(), '"which has ', freq_dist[freq_dist.max()], ' documents.')

#Min:
min_num_of_docs = sorted(cat_numOfDocs ,key=lambda x: x[1])[0][1]
cats_w_min_num_of_docs = [cat for (cat, num_of_docs) in cat_numOfDocs if num_of_docs==min_num_of_docs]
display = 3 #Display only part of categories, not all. 
print('The category with minimum documents are:',cats_w_min_num_of_docs[:display], 'who have', min_num_of_docs, 'documents each. ' )

#Standard deviation: 
std_dev = math.sqrt(sum( (math.pow(num_of_docs-mean, 2) for (_, num_of_docs) in cat_numOfDocs))/len(cat_numOfDocs))
print('The standard deviation in number of documents per category is:', std_dev)

The Mean (Average) number of documents per category is:  89.87191011235954
The category with maximum documents is: " usa "which has  12542  documents.
The category with minimum documents are: ['lin-meal', 'mitterrand', 'bfr'] who have 1 documents each. 
The standard deviation in number of documents per category is: 643.9321684195971


#### Q1.1.3 Explore how many characters and words are present in the documents of the dataset.

first we consider all diffferent word tokens and characters in code, as in a set of elements. We then calculate 
the number of all tokens and characters all together, which is more relevent to our issues. 

In [49]:
#Create sets of words and characters. 
#Takes a while to run, use with care :)
if q113_verbose: 
    word_set=set()
    word_list=[]
    for i in range(len(df['body'])): 
        word_set.update(word_tokenize(df['body'][i]))
        word_list += word_tokenize(df['body'][i])

    char_set=set()
    char_list=[]
    for word in word_set: 
        for letter in word: 
            char_set.update(letter)
            char_list += letter
    print('There are %d different words in all documents. ' %len(word_set))
    print('There are %d word tokens in all documents. ' %len(word_list))
    print('There are %d different characters in all documents. ' %len(char_set))
    print('There are %d characters in all documents. ' %len(char_list))


Since runtime is long for the two above boxes, Output given here, no need to run. 

Output: 
There are 76886 different words in all documents. 
There are 2854622 word tokens in all documents. 
There are 89 different characters in all documents. 
There are 568599 characters in all documents. 


We will now construct a dictionary, That maps from article index to {num_of_words: , num_of_chars: }

In [50]:
article_2words_chars = {}
for i in range(len(df['body'])): 
    article_2words_chars[i] = (len(word_tokenize(df['body'][i])), len(df['body'][i]))

In [51]:
def explore_doc(i):
    print('Document with index %d has %d words and %d letters' % (2, article_2words_chars[x][0], article_2words_chars[x][1]))

#### Q1.1.4 Explain informally what are the classifiers that support the "partial-fit" method discussed in the code.

Informally, the classifiers that support "partial-fit", are classifiers who do not need to "hold" all the
information they are given, at every given moment. If we attempt a slightly more formal explanation, We 
can say that the state of the classifier is changed as it learns from more inputs, yet this input is not
a state variable. 

#### Q1.1.5 Explain what is the hashing vectorizer used in this tutorial.
####            Why is it important to use this vectorizer to achieve "streaming classification"?

As We have seen, We are dealing with a large amount of data. In order to make our data easier to process, 
We turn it into a sparse matrix that improves our memory usage by changing words into corresponding integers. 

### Q1.2 Spam Dataset

In [53]:
import os
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score

NEWLINE = '\n'

HAM = 'ham'
SPAM = 'spam'

SOURCES = [
    ('data/spam',        SPAM),
    ('data/easy_ham',    HAM),
    ('data/hard_ham',    HAM),
    ('data/beck-s',      HAM),
    ('data/farmer-d',    HAM),
    ('data/kaminski-v',  HAM),
    ('data/kitchen-l',   HAM),
    ('data/lokay-m',     HAM),
    ('data/williams-w3', HAM),
    ('data/BG',          SPAM),
    ('data/GP',          SPAM),
    ('data/SH',          SPAM)
]

SKIP_FILES = {'cmds'}

In [54]:

def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="latin-1")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content


In [55]:
def build_data_frame(path, classification):
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)

    data_frame = DataFrame(rows, index=index)
    return data_frame

In [56]:
data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))

data = data.reindex(numpy.random.permutation(data.index))

pipeline = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),
    ('classifier',         MultinomialNB())
])

In [57]:
k_fold = KFold(n=len(data), n_folds=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values.astype(str)

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values.astype(str)

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)

In [None]:
print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)

#### Q1.2.1 The vectorizer used in Zac Stewart's code is a CountVectorizer with unigrams and bigrams. Report the number of unigrams and bigrams used in this model.

#### Q1.2.1 The vectorizer used in Zac Stewart's code is a CountVectorizer with unigrams and bigrams. Report the number of unigrams and bigrams used in this model.

In [59]:
# Retreive the count vectorizer used in the model. 
p=pipeline.get_params()
CountV=p['count_vectorizer']
#Access features: 
uni_bi_grams = CountV.get_feature_names()
print("There are %d unigrams and bigrams, used in this model. " %len(uni_bi_grams))

There are 1973279 unigrams and bigrams, used in this model. 


#### Q1.2.2 What are the 50 most frequent unigrams and bigrams in the dataset?


In [116]:
def most_freq_feat(classifier, count_vector, n=50):
    index = 0
    features_c1_c2_count = []

    for feat, c1, c2 in zip(count_vector.get_feature_names(), classifier.feature_count_[0], classifier.feature_count_[1]):
        features_c1_c2_count.append((feat, c1 + c2))
        index+=1

    for i in sorted(features_c1_c2_count, key = lambda x: x[1], reverse=True)[:n]:     
        print(i)
    

    
most_freq_feat(p['classifier'], p['count_vectorizer'], n=3)

('the', 270657.0)
('to', 188408.0)
('and', 139349.0)


#### What are the 50 most frequent unigrams and bigrams per class (ham and spam)?

In [119]:
#Create a list of feature name and amount of occurrences in each class. 
#Sort according to different class counter to get occurrences per class. 
def most_occurring_feat_per_class(classifier, count_vector, n=50):
    index = 0
    features_c1_c2_count = []

    for feat, c1, c2 in zip(count_vector.get_feature_names(), classifier.feature_count_[0], classifier.feature_count_[1]):
        features_c1_c2_count.append((feat, c1, c2))
        index+=1

    print("%d most occurring features in class spam: " %n )    
    for i in sorted(features_c1_c2_count, key = lambda x: x[1], reverse=True)[:n]:     
        print(i)

    print("%d most occurring features in class ham: " %n )    
    for i in sorted(features_c1_c2_count, key = lambda x: x[2], reverse=True)[:n]:     
        print(i)


    
most_occurring_feat_per_class(p['classifier'], p['count_vectorizer'], n=1)

1 most occurring features in class spam: 
('the', 202071.0, 68586.0)
1 most occurring features in class ham: 
('font', 11332.0, 91677.0)


#### Q1.2.4 List the 20 most useful features in the Naive Bayes classifier to distinguish between spam and ham (20 features for each class). 

In [120]:
#Since each features coefficient links it to it's class, and smaller coefficients classify spam and larger ham, 
#we sort according to coefficient, once normaly and once reversed, to get most informative features. 
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=20):
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]
    
    counter=0
    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)
        if counter==20: 
            break
    print()
    
    counter=0
    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)
        counter+=1
        if counter==20: 
            break

most_informative_feature_for_binary_classification(p['count_vectorizer'], p['classifier'], n=2)

ham -16.0793596043 00 005
ham -16.0793596043 00 00am

spam -4.65332188763 font
spam -4.80685101935 br


#### Q1.2.5 There seems to be an imbalance in the length of spam and ham messages (see the plot in the attached notebook). We want to add a feature based on the number of words in the message in the text representation. Should the length attribute be normalized before fitting the Naive Bayes classifier? (See Sklearn pre-processing for examples.) Do you expect Logistic Regression to perform better with the new feature? Explain.

In [130]:
def build_data_frame(path, classification):
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text,'len': len(nltk.tokenize.word_tokenize(text)), 'class': classification})
        index.append(file_name)

    data_frame = DataFrame(rows, index=index)
    return data_frame

data1 = DataFrame({'text': [], 'len': [], 'class': []})
for path, classification in SOURCES:
    data1 = data1.append(build_data_frame(path, classification))

data1 = data1.reindex(numpy.random.permutation(data.index))

pipeline = Pipeline([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),
    ('classifier',         MultinomialNB())
])



In [139]:
data1 = DataFrame({'text': [], 'len': [], 'class': []})
for path, classification in SOURCES:
    data1 = data1.append(build_data_frame(path, classification))

data1 = data1.reindex(numpy.random.permutation(data.index))



KeyboardInterrupt: 

In [1]:
from sklearn.pipeline import FeatureUnion

pipeline = Pipeline([
#  ('features', FeatureUnion([
#        ('body_stats', Pipeline([
#                ('stats', TextStats()),  # returns a list of dicts
#                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
#                                ])),
        ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),   
#                            ])),
  ('classifier', MultinomialNB())
])


NameError: name 'Pipeline' is not defined

In [2]:
k_fold = KFold(n=len(data), n_folds=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values.astype(str)

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values.astype(str)

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)
    

print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)

NameError: name 'KFold' is not defined

In [132]:
from sklearn.feature_extraction import DictVectorizer
>>> v = DictVectorizer(sparse=False)
>>> d = [{'height': 1, 'length': 0, 'width': 1},
...      {'height': 2, 'length': 1, 'width': 0},
...      {'height': 1, 'length': 3, 'width': 2}]
>>> v.fit_transform(d)
array([[ 1.,  0.,  1.],   # obs.2
       [ 2.,  1.,  0.],   # obs.1
       [ 1.,  3.,  2.]])  # obs.3
   # height, len., width   
    


NameError: name 'array' is not defined

In [133]:
CountVectorizer(ngram_range=(1, 2))

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [134]:
c=CountVectorizer(ngram_range=(1, 2))

In [135]:
c.fit(train_text, train_y)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [136]:
#data1

In [137]:
from sklearn.base import BaseEstimator, TransformerMixin    
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to sklearn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


In [157]:
class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'length': len(text)}
                for text in posts]


In [None]:
new_pipeline = Pipeline([
  ('features', FeatureUnion([
    ('count_vectorizer',   CountVectorizer(ngram_range=(1, 2))),
    ('body_stats', Pipeline([
                ('selector', ItemSelector(key='body')),
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),
                            ])),
  ('classifier', MultinomialNB())
])

 


### Q2. Named Entity Recognition
    Features:
            we are intrested in creating a vectorized obejct from our data set, which will take into consideration the
            (1) word-form (2)the POS of the word (3) ORT, (4) perfix1 ,(5) perfix2, (6) perfix3, (7) suffix1 
            (8) suffix2, (9) suffix3

In [None]:
first let's load and split our data set to test and train sets

In [None]:
# Split data set to train and test data sets
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

 we've noticed that our data is build as a list of sentences, all of which are constructed from a list of tripules in the following foramt:

In [None]:
train_sents[0]CountV.get_stop_words()
CountV.get_params()['analyzer']

<div style='padding: 10px;'><code>[  [(&lt;WORD&gt;, &lt;POS&gt;, &lt;CLASS&gt;),....],<br />&nbsp;....[]&nbsp;...<br/>]</code>
</div><br/>
We would like to add another features, and will do that in a manner simmilar to the one being done
in the <a href="http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb" target="_blank">CoNLL Classification</a> we'll build out a dictionary with all the
wanted features and use <i><u>DictVectorizer</u></i> to get a vectorized representation of the word
according to it's features