In [10]:
import numpy as np
import pandas as pd
import nltk
import textwrap
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [8]:
import requests
url = 'https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv'
r = requests.get(url, allow_redirects=True)

from io import StringIO
data = StringIO(r.text)
df=pd.read_csv(data)
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [11]:
labels=set(df['labels'])
labels

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [12]:
label='business'

In [13]:
texts=df[df['labels']==label]['text']
texts.head()

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

In [15]:
probs={}
for doc in texts:
    lines=doc.split("\n")
    for line in lines:
        tokens=word_tokenize(line)
        for i in range(len(tokens)-2):
            t_0=tokens[i]
            t_1=tokens[i+1]
            t_2=tokens[i+2]
            key=(t_0,t_2)
            if key not in probs:
                probs[key]={}
            if t_1 not in probs[key]:
                probs[key][t_1]=1
            else:
                probs[key][t_1]+=1

In [17]:
for key,d in probs.items():
    total=sum(d.values())
    for k,v in d.items():
        d[k]=v/total

In [21]:
texts.iloc[0].split("\n")

['Ad sales boost Time Warner profit',
 '',
 'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.',
 '',
 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.',
 '',
 "Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers 

In [22]:
def spin_documents(doc):
    lines=doc.split("\n")
    output=[]
    for line in lines:
        if line:
            new_line=spin_line(line)
        else:
            new_line=line
        output.append(new_line)
    return "\n".join(output)


In [23]:
detokenizer=TreebankWordDetokenizer()

In [25]:
texts.iloc[0].split("\n")[2]

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.'

In [26]:
detokenizer.detokenize(word_tokenize(texts.iloc[0].split("\n")[2]))

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.'

In [27]:
def sample_word(d):
    p0=np.random.random()
    cumulative=0
    for t,p in d.items():
        cumulative+=p
        if p0<cumulative:
            return t
    assert(False)

In [37]:
def spin_line(line):
    token=word_tokenize(line)
    i=0
    output=[token[0]]
    while i<(len(token)-2):
        t_0=token[i]
        t_1=token[i+1]
        t_2=token[i+2]
        key=(t_0,t_2)
        p_dist=probs[key]
        if len(p_dist)>1 and np.random.random() <0.3:
            middle=sample_word(p_dist)
            output.append(t_1)
            output.append("<"+ middle +">")
            output.append(t_2)
            i+=2
        else:
            output.append(t_1)
            i+=1
    if i ==len(tokens)-2:
        output.append(tokens[-1])
    return detokenizer.detokenize(output)



In [38]:
np.random.seed(1234)

In [39]:
i=np.random.choice(texts.shape[0])
doc=texts.iloc[i]
new_doc=spin_documents(doc)

In [41]:
print(textwrap.fill(new_doc,replace_whitespace=False,fix_sentence_endings=True))

Bombardier chief to leave

Shares in train and plane-making giant
Bombardier have fallen to <to> a 10-year low following <against> the
departure <hands> of its chief executive and two members of the <key>
board

Paul Tellier, who <which> was also Bombardier's president
<epicentre>, left the company amid an ongoing <Â£80m> restructuring .
Laurent Beaudoin, part of the family that controls the Montreal-based
firm, will take on <over> the role of CEO under a newly created
management structure . Analysts said <believe> the resignations seem
to have stemmed from a boardroom dispute . Under Mr Tellier's tenure
at the company <subsidy>, which began in January <July> 2003, plans
<according> to cut the worldwide workforce of 75,000 by almost
<signing> a third <movement> by 2006 were announced . The firm's
snowmobile <auto> division and defence services unit were also sold
and Bombardier started the development <future> of a new aircraft
seating 110 to 135 passengers

Mr Tellier had indicated he