# Imports

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk

# Open Test Data from file

In [2]:
# Load the passage to summarise; each list entry is one raw line of the
# file, with its trailing newline still attached.
with open('td') as data:
    text = list(data)

In [3]:
text

['Where do rings come from? Did they form together with the planets out of the solar nebula, or are they a more recent phenomenon? An important clue is that the rings lie close to their planet in a region where tidal forces are very strong.\n',
 'Within two to three planetary radii of any planet, the tidal forces tugging apart an object become comparable to the gravitational forces holding it together. (This region is often called the Roche tidal zone.) Only relatively small objects held together by nongravitational forces\xe2\x80\x94such as the electromagnetic forces that hold solid rock, spacecraft, and human beings together can avoid being ripped apart by the strong tidal forces in this region. Because a large moon would be ripped apart by tidal forces in the region of the rings, scientists once suspected that Saturn\xe2\x80\x99s rings formed after a large moon came too close to the planet. However, moons don\xe2\x80\x99t simply \xe2\x80\x9cwander\xe2\x80\x9d away from their orbits,

# Extract Sentences

In [4]:
import re

In [5]:
# Break every line of the passage at sentence-ending punctuation and pool the
# fragments. The terminators are consumed by the split, and whitespace-only
# leftovers (such as a line's trailing '\n') are kept as entries.
sentence_end = re.compile('[.!?]')
sentences = []
for line in text:
    sentences.extend(sentence_end.split(line))

In [6]:
sentences

['Where do rings come from',
 ' Did they form together with the planets out of the solar nebula, or are they a more recent phenomenon',
 ' An important clue is that the rings lie close to their planet in a region where tidal forces are very strong',
 '\n',
 'Within two to three planetary radii of any planet, the tidal forces tugging apart an object become comparable to the gravitational forces holding it together',
 ' (This region is often called the Roche tidal zone',
 ') Only relatively small objects held together by nongravitational forces\xe2\x80\x94such as the electromagnetic forces that hold solid rock, spacecraft, and human beings together can avoid being ripped apart by the strong tidal forces in this region',
 ' Because a large moon would be ripped apart by tidal forces in the region of the rings, scientists once suspected that Saturn\xe2\x80\x99s rings formed after a large moon came too close to the planet',
 ' However, moons don\xe2\x80\x99t simply \xe2\x80\x9cwander\xe2\x80

# Get Training Data

In [7]:
from sklearn.datasets import fetch_20newsgroups

In [8]:
# Fetch the training split of the 20 Newsgroups corpus through scikit-learn's
# dataset loader; its documents supply the vocabulary and term weights that
# the later cells score the passage against.
newsgroups_train = fetch_20newsgroups(subset='train')

# Build term-count and TF-IDF features from the training data

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the newsgroup documents and, in the same call,
# build their document-term count matrix. count_vec is reused later to count
# terms in the passage's sentences.
count_vec = CountVectorizer()
train_counts = count_vec.fit_transform(newsgroups_train.data)

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the TF-IDF weighting on the training counts and re-weight those same
# counts in a single call; tf_transformer keeps the fitted transformer.
tf_transformer = TfidfTransformer()
train_transformed = tf_transformer.fit_transform(train_counts)

# Clean Glossary Data

In [11]:
def remove_numeric(data):
    """Return the run of ASCII letters that `data` starts with.

    The pattern is anchored at the first character, so a string that begins
    with anything other than a letter (a digit, a space, or nothing at all)
    yields None instead of a later alphabetic run.
    """
    import re
    leading_letters = re.match("([a-zA-Z]+)", data)
    if leading_letters is None:
        return None
    # group(0) is the whole matched text, i.e. data[start:end] of the match.
    return leading_letters.group(0)
def convert_glossary_data(glossary_lines):
    """Clean every glossary line with `remove_numeric`.

    Args:
        glossary_lines: iterable of glossary strings.

    Returns:
        A list with one entry per input line, in input order. Each entry is
        whatever `remove_numeric` returned for that line, so positions stay
        aligned with the input.

    The mapped values used to be discarded, which made this function return
    None for every input; they are now returned. A list is built explicitly
    so the result is the same on Python 2 and Python 3 (where `map` is lazy).
    """
    return [remove_numeric(line) for line in glossary_lines]

# Convert the Test data to a usable form

In [12]:
# Wrap the sentence fragments in a pandas Series; later cells pass it to the
# fitted vectorizer and index into it by position when printing the summary.
fast_data = pd.Series(sentences)

In [13]:
fast_data

0                              Where do rings come from
1      Did they form together with the planets out o...
2      An important clue is that the rings lie close...
3                                                    \n
4     Within two to three planetary radii of any pla...
5      (This region is often called the Roche tidal ...
6     ) Only relatively small objects held together ...
7      Because a large moon would be ripped apart by...
8      However, moons don’t simply “wander” away fro...
9              Such encounters should be extremely rare
10     Back in the days when Saturn was the only pla...
11     The discovery of rings around the other jovia...
12                                                   \n
13    If the rings did not originate with the destru...
14     Another idea that once seemed reasonable was ...
15     This would explain why all four jovian planet...
16     However, we now know that the ring particles ...
17     Ring particles are continugally being gro

In [14]:
# Count term occurrences per sentence using the vocabulary already fitted on
# the newsgroup corpus (transform only, no refit), then display the matrix
# shape as (number of sentences, vocabulary size).
count_data = count_vec.transform(fast_data)
count_data.shape

(24, 130107)

In [15]:
# Multiply the sentence counts by the transposed TF-IDF matrix: entry (i, j)
# combines sentence i's term counts with training document j's term weights.
# NOTE(review): assumes both operands are scipy sparse matrices, for which `*`
# is a matrix product rather than element-wise -- confirm for the installed
# scikit-learn / scipy versions.
weighted_scores = count_data * train_transformed.T

In [16]:
# Add up each sentence's scores across all training documents (one total per
# row), then rescale so the best-scoring sentence is exactly 1.
# NOTE(review): if every sentence scored 0 the maximum would be 0 and this
# division would be 0/0 -- there is no guard for that case here.
sentences_scores = weighted_scores.sum(axis=1)
normalized_sentence_scores = sentences_scores / sentences_scores.max()
normalized_sentence_scores

matrix([[ 0.046765  ],
        [ 0.41874977],
        [ 0.37159191],
        [ 0.        ],
        [ 0.49555882],
        [ 0.18629907],
        [ 0.49077952],
        [ 0.63746292],
        [ 0.33292197],
        [ 0.0374728 ],
        [ 0.6885371 ],
        [ 0.56821315],
        [ 0.        ],
        [ 0.5252031 ],
        [ 0.7319301 ],
        [ 0.1771743 ],
        [ 0.6957093 ],
        [ 1.        ],
        [ 0.35288282],
        [ 0.63755349],
        [ 0.09425776],
        [ 0.57024616],
        [ 0.78773926],
        [ 0.        ]])

In [17]:
# Print every sentence whose normalised score clears the cut-off, then report
# what share of the original sentence fragments was kept.
SCORE_THRESHOLD = 0.6  # minimum score, relative to the top sentence, to be kept

summarized_count = 0

# range() and the parenthesised single-argument print() behave identically on
# Python 2 and Python 3; the former xrange / print-statement forms only ran on
# Python 2.
for x in range(normalized_sentence_scores.shape[0]):
    # [x, 0] reads the scalar score from the one-column score matrix instead
    # of comparing a 1x1 matrix against the threshold.
    if normalized_sentence_scores[x, 0] > SCORE_THRESHOLD:
        summarized_count += 1
        print(fast_data[x])

# float() keeps the ratio from truncating under Python 2 integer division.
print("Compression: %s %%" % (float(summarized_count) / len(fast_data) * 100))

 Because a large moon would be ripped apart by tidal forces in the region of the rings, scientists once suspected that Saturn’s rings formed after a large moon came too close to the planet
 Back in the days when Saturn was the only planet known to have rings, it might have seemed possible that such a rare encounter was responsible for the rings
 Another idea that once seemed reasonable was that the ring particles might be leftover chunks of rock and ice that condensed in the disks of gas that orbited each jovian planet when it was young
 However, we now know that the ring particles cannot be leftogvers from the births of the planets, because they could not have survived for billions of years
 Ring particles are continugally being ground down in size, primarily by the impacts of the countless sand-size particles that orbit the Sun—the same types of particles that become meteors in Earth’s atmosphere and cause micrometeorite impacts on the Moon
 The dust particles could not have survived