In [4]:
"""We do not have to write a custom code for counting words and representing those counts as a vector. Scikit's CountVectorizer 
does the job very efficiently."""

import os
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df = 1) #The parameter min_df determines how CountVectorizer treats words that are not used 
#frequently (minimum document frequency).

In [5]:
#Just a dummy example to show how CountVectorizer creates vectors based on the frequency of words in a post.
content = ["How to format my hard disk", "Hard disk format problems "]
X = vectorizer.fit_transform(content)
vectorizer.get_feature_names()

[u'disk', u'format', u'hard', u'how', u'my', u'problems', u'to']

In [6]:
#The vectorizer detected seven words for which we can fetch the counts individually as follows:
print(X.toarray().transpose())

[[1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 0]]


In [7]:
#Playing with a toy dataset. In this post dataset, we want to find the most similar post for the short post "imaging databases".
#Assuming that the posts are located in 'D:\ToStudy\Machine Learning\Data\DIR', we can feed CountVectorizer with it as follows:

posts = [open(os.path.join('D:\ToStudy\Machine Learning\Data\DIR', f)).read() for f in 
         os.listdir('D:\ToStudy\Machine Learning\Data\DIR')]
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df = 1)

In [8]:
#Notifying the vectorizer about the full dataset so that it knows upfront what words are to be expected:

X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print ("#samples: %d, #features: %d" % (num_samples, num_features))

#samples: 5, #features: 25


In [9]:
#Unsurprisingly, we have five posts with a total of 25 different words. The following are the tokenized words which will be 
#counted.

print(vectorizer.get_feature_names())

[u'about', u'actually', u'capabilities', u'contains', u'data', u'databases', u'images', u'imaging', u'interesting', u'is', u'it', u'learning', u'machine', u'most', u'much', u'not', u'permanently', u'post', u'provide', u'safe', u'storage', u'store', u'stuff', u'this', u'toy']


In [10]:
#Now we can vectorize our new post as follows:

new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])

In [11]:
"""Note that the count vectors returned by the 'transform' method are sparse. That is, each vector does not store one count
value for each word, as most fo those counts would be zero (post does not contain the word). Instead, it uses the more memory 
efficient implementation 'coo_matrix' (for 'COOrdinate'). Our new post, for instance, actually contains only two elements:"""

print(new_post_vec)

  (0, 5)	1
  (0, 7)	1


In [12]:
#Via its member 'toarray()', we can gain access to the full ndarray as follows:
print(new_post_vec.toarray())

[[0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [13]:
"""We need to use full array if we want to use it as a vector for similarity calculations.For the simialrity measurement(the 
naive one),we calculate the Eucledian distance between the count vectors of the new post and all the old posts as follows:"""

import scipy as sp
def dist(v1, v2):
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray()) # The 'norm()' function calculates the Eucledian norm (shortest distance).

In [14]:
#With 'dist()', we just need to iterate over all the posts and remember the nearest one:

import sys
best_doc = None
best_dist = sys.maxint
best_i = None

for i in range(0, num_samples):
    post = posts[i]
    
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist(post_vec, new_post_vec)
    print "=== Post %i with dist = %.2f: %s"%(i, d, post)
    
    if d < best_dist:
        best_dist = d
        best_i = i

print(">>> Best post is %i with dist = %.2f"%(best_i, best_dist))

=== Post 0 with dist = 4.00: This is a toy post about Machine Learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 1.73: Imaging databases provide storage capabilities.
=== Post 2 with dist = 2.00: Most imaging databases safe images permanently.
=== Post 3 with dist = 1.41: Imaging databases store data.
=== Post 4 with dist = 5.10: Imaging databases store data. Imaging databases store data. Imaging databases store data.
>>> Best post is 3 with dist = 1.41


In [15]:
"""All the measurements make sense except posts 3 and 4. Post 4 is the same as Post 3, duplicated 3 times. So, it should be of 
the same similarity to the new post as Post 3.
Printing the corresponding feature vectors explains the reason:"""

print(X_train.getrow(3).toarray())
print(X_train.getrow(4).toarray())

[[0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]
[[0 0 0 0 3 3 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0]]


In [16]:
#Obviously, using only the counts of the raw words is too simple. We will have to normalize them to get vectors of unit length.
#We will have to extend 'dist()' to calculate the vector distance, not on the raw vectors but on the normalized ones instead:

def dist_norm(v1, v2):
    v1_normalized = v1/sp.linalg.norm(v1.toarray())
    v2_normalized = v2/sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

In [17]:
#With 'dist_norm()', we just need to iterate over all the posts and remember the nearest one.

import sys
best_doc = None
best_dist = sys.maxint
best_i = None

for i in range(0, num_samples):
    post = posts[i]
    
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print "=== Post %i with dist = %.2f: %s"%(i, d, post)
    
    if d < best_dist:
        best_dist = d
        best_i = i

print(">>> Best post is %i with dist = %.2f"%(best_i, best_dist))

#This leads to the following similarity measurement.

=== Post 0 with dist = 1.41: This is a toy post about Machine Learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist = 0.92: Most imaging databases safe images permanently.
=== Post 3 with dist = 0.77: Imaging databases store data.
=== Post 4 with dist = 0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
>>> Best post is 3 with dist = 0.77


In [18]:
#This looks a bit better now. Post 3 and Post 4 are calculated as being equally similar. One could argue whether that much 
#repetition would be a delight to the reader, but from the point of view of counting the words in the posts, this seems to be 
#right.

In [19]:
"""Let us have another look at Post 2. Of its words that are not in the new post, we have 'most', 'safe', 'images', and 
'permanently'. They are actually quite different in the overall importnace to the post.

Words such as 'most' appear very often in all sorts of different context, and words such as this are called STOP WORDS. They 
do not carry as much information, and thus should not be weighed as much as words such as 'images', that don't occur often in 
different contexts. 

The best option would be to remove all words that are so frequent that they do not help to distinguish between different texts.
These words are called STOP WORDS."""

#As this is such a common step in text processing, there is a simple parameter in CountVectorizer to achieve this, as follows:

vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')

"""If you have a clear picture of what kind os top words you would want to remove, you can also pass a list of them. Setting 
'stop_words' to 'english' will use a set of 318 English stop words. We can find out these stop words by using 
'get_stop_words()':"""

sorted(vectorizer.get_stop_words())[0:100]

['a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amoungst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'bill',
 'both',
 'bottom',
 'but',
 'by',
 'call',
 'can',
 'cannot',
 'cant',
 'co',
 'con',
 'could',
 'couldnt',
 'cry',
 'de',
 'describe',
 'detail',
 'do',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eg',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'etc',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fify',
 'fill',
 'find',
 'fire',
 'first',
 'five',
 'for',

In [20]:
posts = [open(os.path.join('D:\ToStudy\Machine Learning\Data\DIR', f)).read() for f in 
         os.listdir('D:\ToStudy\Machine Learning\Data\DIR')]
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')

X_train_new = vectorizer.fit_transform(posts)
num_samples, num_features = X_train_new.shape

#print(vectorizer.get_feature_names())

print ("#samples: %d, #features: %d" % (num_samples, num_features))

new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])

#The new word list is seven words lighter, thanks to the removal of stop words:

#samples: 5, #features: 18


In [21]:
import sys
best_doc = None
best_dist = sys.maxint
best_i = None

for i in range(0, num_samples):
    post = posts[i]
    
    if post == new_post:
        continue
    post_vec = X_train_new.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print "=== Post %i with dist = %.2f: %s"%(i, d, post)
    
    if d < best_dist:
        best_dist = d
        best_i = i

print(">>> Best post is %i with dist = %.2f"%(best_i, best_dist))

#Without stop words, we arrive at the following similarity measurement:

=== Post 0 with dist = 1.41: This is a toy post about Machine Learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist = 0.86: Most imaging databases safe images permanently.
=== Post 3 with dist = 0.77: Imaging databases store data.
=== Post 4 with dist = 0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
>>> Best post is 3 with dist = 0.77


In [22]:
"""Post 2 is now on par with Post 1. Overall, it has, however, not changed much as our posts are kept short for demostration 
purposes. It will become vital when we look at real-world data."""

'Post 2 is now on par with Post 1. Overall, it has, however, not changed much as our posts are kept short for demostration \npurposes. It will become vital when we look at real-world data.'

In [23]:
"""One thing is still missing. We count similar words in different variants as different words. Post 2, for instance,contains 
'imaging' and 'images'. It would make sense to count them together. After all, it is the same concept they are referring to.

We need a function that reduces words to their specific word stem. Scikit does not contain a stemmer by default.With the NATURAL 
LANGUAGE TOOLKIT (NLTK), we can download a free software toolkit, which provides a stemmer that we can easily plug into 
'CountVectorizer'."""

import nltk

In [24]:
#NLTK comes with different stemmers. This is necessary, because every language has a different set of rules for stemming.
#For English, we can take 'SnowballStemmer'.

import nltk.stem
s = nltk.stem.SnowballStemmer('english')

In [25]:
#A few examples

print s.stem("graphics")

print s.stem("imaging")

print s.stem("image")

print s.stem("imagination")

print s.stem("imagine")

graphic
imag
imag
imagin
imagin


In [26]:
#It also works with verbs as follows:

print s.stem("buys")

print s.stem("buying")

print s.stem("bought")

buy
buy
bought


In [27]:
import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
    
vectorizer = StemmedCountVectorizer(min_df = 1, stop_words = 'english')

"""This will perform the following steps for each post:
1. Lower casing the raw post in the preprocessing step (done in the parent class).
2. Extracting all individual words in the tokenizing step (done in the parent class).
3. Converting each word into its stemmed version."""

X_train_new = vectorizer.fit_transform(posts)
num_samples, num_features = X_train_new.shape

#As a result, we now have one feature less, because 'images' and 'imaging' collapsed to one.
print ("#samples: %d, #features: %d" % (num_samples, num_features))

print(vectorizer.get_feature_names())

#samples: 5, #features: 17
[u'actual', u'capabl', u'contain', u'data', u'databas', u'imag', u'interest', u'learn', u'machin', u'perman', u'post', u'provid', u'safe', u'storag', u'store', u'stuff', u'toy']


In [28]:
new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])

In [29]:
import sys
best_doc = None
best_dist = sys.maxint
best_i = None

for i in range(0, num_samples):
    post = posts[i]
    
    if post == new_post:
        continue
    post_vec = X_train_new.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print "=== Post %i with dist = %.2f: %s"%(i, d, post)
    
    if d < best_dist:
        best_dist = d
        best_i = i

print(">>> Best post is %i with dist = %.2f"%(best_i, best_dist))

#Without stop words, we arrive at the following similarity measurement:

=== Post 0 with dist = 1.41: This is a toy post about Machine Learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 0.86: Imaging databases provide storage capabilities.
=== Post 2 with dist = 0.63: Most imaging databases safe images permanently.
=== Post 3 with dist = 0.77: Imaging databases store data.
=== Post 4 with dist = 0.77: Imaging databases store data. Imaging databases store data. Imaging databases store data.
>>> Best post is 2 with dist = 0.63


In [30]:
"""Now, we would also like to assign higher importance to those terms which occur often in an article and very rarely anywhere 
else since this means that the term holds special value to that particular article. This can only be solved by counting term 
frequencies for every post, and in addition, discounting those that appear in many posts."""

"""This is exactly what TERM FREQUENCY - INVERSE DOCUMENT FREQUENCY (TF - IDF) does; TF stands for counting part, while IDF 
factors in the discounting."""

#A simple implementation for the caluclation for tf-idf: 
import scipy as sp
import math
def tfidf(term, doc, docset):
    tf = float(doc.count(term)) / sum(doc.count(term) for w in docset)
    idf = math.log(float(len(docset)) / (len([doc for doc in docset if term in doc])))
    return tf * idf

In [31]:
#An example of tf-idf calculation

a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]
D = [a,abb, abc]

print(tfidf("a", a, D))

print(tfidf("a", abc, D))

print(tfidf("b", abc, D))

print(tfidf("c", abc, D))

0.0
0.0
0.135155036036
0.366204096223


In [32]:
"""We see that 'a' carries no meaning for any document since it is contained everywhere. 'c' is more important to 'abc' than 'b' 
is to 'abb' since 'c' occurs only once in only one of the document viz., 'abc'."""

"""Scikit already has a very efficient implementation of tfidf calculation in 'TfidfVectorizer', which is inherited from 
'CountVectorizer'. We will include this in our stemmer as follows: """

from sklearn.feature_extraction.text import TfidfVectorizer

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
    
vectorizer = StemmedTfidfVectorizer(min_df = 1, stop_words = 'english')

X_train_new = vectorizer.fit_transform(posts)
num_samples, num_features = X_train_new.shape

new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])

import sys
best_doc = None
best_dist = sys.maxint
best_i = None

for i in range(0, num_samples):
    post = posts[i]
    
    if post == new_post:
        continue
    post_vec = X_train_new.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print "=== Post %i with dist = %.2f: %s"%(i, d, post)
    
    if d < best_dist:
        best_dist = d
        best_i = i

print(">>> Best post is %i with dist = %.2f"%(best_i, best_dist))

#Without stop words, we arrive at the following similarity measurement:

=== Post 0 with dist = 1.41: This is a toy post about Machine Learning. Actually, it contains not much interesting stuff.
=== Post 1 with dist = 1.08: Imaging databases provide storage capabilities.
=== Post 2 with dist = 0.86: Most imaging databases safe images permanently.
=== Post 3 with dist = 0.92: Imaging databases store data.
=== Post 4 with dist = 0.92: Imaging databases store data. Imaging databases store data. Imaging databases store data.
>>> Best post is 2 with dist = 0.86


In [33]:
#Loading the Data Set
import sklearn.datasets
MLCOMP_DIR = r"D:\ToStudy\Machine Learning\Data\project_data"
data = sklearn.datasets.load_mlcomp("20news-18828", mlcomp_root=MLCOMP_DIR)
print(data.filenames)


[ 'D:\\ToStudy\\Machine Learning\\Data\\project_data\\379\\raw\\comp.graphics\\1190-38614'
 'D:\\ToStudy\\Machine Learning\\Data\\project_data\\379\\raw\\comp.graphics\\1383-38616'
 'D:\\ToStudy\\Machine Learning\\Data\\project_data\\379\\raw\\alt.atheism\\487-53344'
 ...,
 'D:\\ToStudy\\Machine Learning\\Data\\project_data\\379\\raw\\rec.sport.hockey\\10215-54303'
 'D:\\ToStudy\\Machine Learning\\Data\\project_data\\379\\raw\\sci.crypt\\10799-15660'
 'D:\\ToStudy\\Machine Learning\\Data\\project_data\\379\\raw\\comp.os.ms-windows.misc\\2732-10871']


In [50]:
# Printing Total Number of Articles
print(len(data.filenames))


18828


In [35]:
data.target_names


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [36]:
#Training Set 
train_data = sklearn.datasets.load_mlcomp("20news-18828", "train", mlcomp_root=MLCOMP_DIR)
print(len(train_data.filenames))

13180


In [37]:
#Test Set 
test_data = sklearn.datasets.load_mlcomp("20news-18828", "test", mlcomp_root=MLCOMP_DIR)
print(len(test_data.filenames))

5648


In [38]:
"""For simplicity's sake, we will restrict ourselves to only some newsgroups so that the 
overall experimentation cycle is shorter. We can achieve this with the categories 
parameter as follows:"""
groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.ma c.hardware', 'comp.windows.x',
          'sci.space']
train_data = sklearn.datasets.load_mlcomp("20news-18828", "train", mlcomp_root=MLCOMP_DIR, categories=groups)
print(len(train_data.filenames))

3414


In [39]:
"""Real data is noisy. The newsgroup 
dataset is no exception. It even contains invalid characters that will result in 
UnicodeDecodeError.
We have to tell the vectorizer to ignore them:"""

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
    
#vectorizer = StemmedTfidfVectorizer(min_df = 1, stop_words = 'english')
 
#export LC_CTYPE="en_US.UTF-8"

vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5, stop_words='english',decode_error = 'ignore')
vectorized = vectorizer.fit_transform(data.data)
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

#samples: 18828, #features: 15924


In [40]:
"""We now have a pool of 18,828 posts and extracted for each of them a feature vector of 
15,924 dimensions. That is what KMeans takes as input. We will fix the cluster size to 
50 as shown in the following code:"""

'We now have a pool of 18,828 posts and extracted for each of them a feature vector of \n15,924 dimensions. That is what KMeans takes as input. We will fix the cluster size to \n50 as shown in the following code:'

In [41]:
num_clusters = 50
from sklearn.cluster import KMeans
km = KMeans(n_clusters=num_clusters, init='random', n_init=1,verbose=1) 
km.fit(vectorized)

Initialization complete
Iteration  0, inertia 33837.414
Iteration  1, inertia 17962.825
Iteration  2, inertia 17786.917
Iteration  3, inertia 17690.956
Iteration  4, inertia 17643.360
Iteration  5, inertia 17616.664
Iteration  6, inertia 17600.725
Iteration  7, inertia 17576.889
Iteration  8, inertia 17561.024
Iteration  9, inertia 17554.252
Iteration 10, inertia 17551.381
Iteration 11, inertia 17548.903
Iteration 12, inertia 17546.374
Iteration 13, inertia 17544.371
Iteration 14, inertia 17542.687
Iteration 15, inertia 17541.061
Iteration 16, inertia 17539.893
Iteration 17, inertia 17539.054
Iteration 18, inertia 17538.136
Iteration 19, inertia 17537.640
Iteration 20, inertia 17537.415
Iteration 21, inertia 17537.264
Iteration 22, inertia 17537.216
Iteration 23, inertia 17537.171
Iteration 24, inertia 17537.148
Iteration 25, inertia 17537.139
Iteration 26, inertia 17537.134
Converged at iteration 26


KMeans(copy_x=True, init='random', max_iter=300, n_clusters=50, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=1)

In [42]:
"""That's it. After fitting, we can get the clustering information out of the members of 
km. For every vectorized post that has been fit, there is a corresponding integer label 
in km.labels_:"""
km.labels_
km.labels_.shape


(18828L,)

In [43]:
"""We now put everything together and demonstrate our system for the following new 
post that we assign to the variable 'new_post': """

new_post = "Torus is one the best multipurpose templates on the market. It is modern in every way. One the best features is the page builder that comes with it, allowing you to easily create a page without much effort. This feature is also awesome for non-developers. The modular design technique used in Torus, promotes customizability and easy branding. Choose Torus if you want to get your site up and running in as little time as possible."

"""We will first have to vectorize this post before we 
predict its label as follows:"""

new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]

In [44]:
"""Now that we have the clustering, we do not need to compare new_post_vec to all 
post vectors. Instead, we can focus only on the posts of the same cluster. Let us fetch 
their indices in the original dataset:"""

similar_indices = (km.labels_==new_post_label).nonzero()[0]

In [45]:
"""Using similar_indices, we then simply have to build a list of posts together with 
their similarity scores as follows:"""

similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, data.data[i]))
similar = sorted(similar)
print(len(similar))

3349


In [46]:
show_at_1 = similar[0]
show_at_2 = similar[len(similar)/2]
show_at_3 = similar[-1]

In [47]:
print show_at_1




In [48]:
print show_at_2

(1.408517517170303, 'From: jenk@microsoft.com (Jen Kilmer)\nSubject: Re: sex education\n\nIn article <Apr.7.23.20.08.1993.14209@athos.rutgers.edu> mprc@troi.cc.rochester.edu (M. Price) writes:\n>In <Apr.5.23.31.32.1993.23904@athos.rutgers.edu> jenk@microsoft.com (Jen Kilmer) writes:\n>\n>> Method                  Expected         Actual \n>> ------                 Failure Rate    Failure Rate\n>> Abstinence                 0%              0% \n>\n>\n>    These figures don\'t seem to take account of rape. Or is a woman who\n>is raped considered not to have been abstaining?\n\nI no longer have the textbook, but abstinence was defined as something\nlike "no contact between the penis and the vagina, vulva, or area \nimmediately surrounding the vulva, and no transfer of semen to the\nvagina, vulva, or area surrounding the vulva".  \n\nThat is, abstinence wasn\'t discussed as "sex outside of marriage is\nmorally wrong" but as keep  the sperm away from the ovum and conception \nis impossible.

In [49]:
print show_at_3

(1.4142135623730954, 'Subject: Quotation? Lowest bidder...\nFrom: bioccnt@otago.ac.nz\n\n\nCan someone please remind me who said a well known quotation? \n\nHe was sitting atop a rocket awaiting liftoff and afterwards, in answer to\nthe question what he had been thinking about, said (approximately) "half a\nmillion components, each has to work perfectly, each supplied by the lowest\nbidder....." \n\nAttribution and correction of the quote would be much appreciated. \n\nClive Trotman\n\n')
