In [10]:
from os import listdir
import string

# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, encoding='utf-8')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

# split a document into news story and highlights
def split_story(doc):
	# find first highlight
	index = doc.find('@highlight')
	# split into story and highlights
	story, highlights = doc[:index], doc[index:].split('@highlight')
	# strip extra white space around each highlight
	highlights = [h.strip() for h in highlights if len(h) > 0]
	return story, highlights

# load all stories in a directory
def load_stories(directory):
	stories = list()
	for name in listdir(directory):
		filename = directory + '/' + name
		# load document
		doc = load_doc(filename)
		# split into story and highlights
		story, highlights = split_story(doc)
		# store
		stories.append({'story':story, 'highlights':highlights})
	return stories

# clean a list of lines
def clean_lines(lines):
	cleaned = list()
	# prepare a translation table to remove punctuation
	table = str.maketrans('', '', string.punctuation)
	for line in lines:
		# strip source cnn office if it exists
		index = line.find('(CNN) -- ')
		if index > -1:
			line = line[index+len('(CNN)'):]
		# tokenize on white space
		line = line.split()
		# convert to lower case
		line = [word.lower() for word in line]
		# remove punctuation from each token
		line = [w.translate(table) for w in line]
		# remove tokens with numbers in them
		line = [word for word in line if word.isalpha()]
		# store as string
		cleaned.append(' '.join(line))
	# remove empty strings
	cleaned = [c for c in cleaned if len(c) > 0]
	return cleaned

# load stories
directory = 'cnn/stories/'
stories = load_stories(directory)
print('Loaded Stories %d' % len(stories))

# clean stories
for example in stories:
	example['story'] = clean_lines(example['story'].split('\n'))
	example['highlights'] = clean_lines(example['highlights'])

Loaded Stories 92579


In [11]:
print(stories[4]['highlights'])

['louisiana gov bobby jindal decried nogo zones where sovereign governments cede authority to muslims', 'a fox news commentator sparked controversy when he mentioned the idea last week which has been debunked', 'jindal stuck to his speech however and drew praise from conservatives']


In [12]:
print(stories[4]['story'])

['cnnlouisiana gov bobby jindal on monday stood by his criticism of socalled nogo zones in europe where sovereign nations allegedly cede authority to muslim immigrants a controversial idea that many critics say is overblown', 'and the potential republican presidential candidate decried what he called immigrants insistence on nonassimilation the fact that youve got people who want to come to our country but not adopt our values which he called dangerous', 'jindal has the reputation of policy wonk among republicans and boldly told gop officials during a address to the republican national committee that they must stop being the stupid party and needed to stop insulting the intelligence of voters', 'asked by cnns wolf blitzer whether he would walk back his comments on nogo zones made in a speech to the henry jackson society in london earlier that day jindal said not at all', 'msnbc guest wont apologize for jindal remark', 'and im also making a bigger and maybe even more controversial point

In [13]:
stories[0]['highlights']

['usbased scientists say their data points toward the existence of the higgs boson',
 'finding the higgs boson would help explain the origin of mass',
 'but the research at the tevatron collider doesnt provide a conclusive answer',
 'attention now turns to a seminar wednesday on data from the large hadron collider']

In [14]:
# save to file
from pickle import dump
dump(stories, open('cnn_dataset.pkl', 'wb'))

In [15]:
# load from file
stories = load(open('cnn_dataset.pkl', 'rb'))
print('Loaded Stories %d' % len(stories))

NameError: name 'load' is not defined