In [8]:
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [9]:
LANGUAGE = "english"
SENTENCES_COUNT = 7
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

In [10]:
if __name__ == "__main__":
    url = "https://www.ndtv.com/india-news/bjp-exploring-possibility-of-simultaneous-polls-in-11-states-in-2019-1899962?pfrom=home-livetv"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    
    counter = 1
    corpus = []
    corpus_new = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(counter , ") ",  sentence)
        corpus.append(str(sentence).split(" "))
        corpus_new.append(str(sentence))
        
        word_tokens = word_tokenize(str(sentence))
        filtered_sentence = [w for w in word_tokens if not w in stop_words]
        filtered_sentence1 = ""
 
        for w in word_tokens:
            if w not in stop_words:
                filtered_sentence1 += str(w) + " "
 
        print(counter , ") ",  filtered_sentence1)
        print("-------")
        
        counter += 1
        

1 )  A top party leader, who spoke on the condition of anonymity, said they are exploring the possibility of deferring assembly elections of some states by a few months and advancing some others to ensure they are held simultaneously with the Lok Sabha polls.
1 )  A top party leader spoke condition anonymity said exploring possibility deferring assembly elections states months advancing others ensure held simultaneously Lok Sabha polls 
-------
2 )  Assembly elections in Andhra Pradesh, Odisha and Telangana are anyway scheduled to be held with the parliamentary polls, he said.
2 )  Assembly elections Andhra Pradesh Odisha Telangana anyway scheduled held parliamentary polls said 
-------
3 )  Another state where talks of advancing assembly elections are doing the rounds is Bihar, where they are due in 2020 end.
3 )  Another state talks advancing assembly elections rounds Bihar due 2020 end 
-------
4 )  There is a view within the BJP that holding assembly elections of as many states as 

In [11]:
print (corpus)

[['A', 'top', 'party', 'leader,', 'who', 'spoke', 'on', 'the', 'condition', 'of', 'anonymity,', 'said', 'they', 'are', 'exploring', 'the', 'possibility', 'of', 'deferring', 'assembly', 'elections', 'of', 'some', 'states', 'by', 'a', 'few', 'months', 'and', 'advancing', 'some', 'others', 'to', 'ensure', 'they', 'are', 'held', 'simultaneously', 'with', 'the', 'Lok', 'Sabha', 'polls.'], ['Assembly', 'elections', 'in', 'Andhra', 'Pradesh,', 'Odisha', 'and', 'Telangana', 'are', 'anyway', 'scheduled', 'to', 'be', 'held', 'with', 'the', 'parliamentary', 'polls,', 'he', 'said.'], ['Another', 'state', 'where', 'talks', 'of', 'advancing', 'assembly', 'elections', 'are', 'doing', 'the', 'rounds', 'is', 'Bihar,', 'where', 'they', 'are', 'due', 'in', '2020', 'end.'], ['There', 'is', 'a', 'view', 'within', 'the', 'BJP', 'that', 'holding', 'assembly', 'elections', 'of', 'as', 'many', 'states', 'as', 'possible', 'with', 'the', 'Lok', 'Sabha', 'polls', 'will', 'be', 'a', 'positive', 'plank', 'in', 'its

In [12]:
from gensim.models import Word2Vec
import multiprocessing 

model = Word2Vec(corpus, size=20, window=10, min_count=1, workers=multiprocessing.cpu_count(), sample = 1e-3, seed=1)             
words = list(model.wv.vocab)
print(words)

['A', 'top', 'party', 'leader,', 'who', 'spoke', 'on', 'the', 'condition', 'of', 'anonymity,', 'said', 'they', 'are', 'exploring', 'possibility', 'deferring', 'assembly', 'elections', 'some', 'states', 'by', 'a', 'few', 'months', 'and', 'advancing', 'others', 'to', 'ensure', 'held', 'simultaneously', 'with', 'Lok', 'Sabha', 'polls.', 'Assembly', 'in', 'Andhra', 'Pradesh,', 'Odisha', 'Telangana', 'anyway', 'scheduled', 'be', 'parliamentary', 'polls,', 'he', 'said.', 'Another', 'state', 'where', 'talks', 'doing', 'rounds', 'is', 'Bihar,', 'due', '2020', 'end.', 'There', 'view', 'within', 'BJP', 'that', 'holding', 'as', 'many', 'possible', 'polls', 'will', 'positive', 'plank', 'its', 'favour', 'Prime', 'Minister', 'Narendra', 'Modi', 'has', 'repeatedly', 'emphasised', 'his', 'support', 'idea.', 'With', 'opposition', 'parties', 'like', 'Congress', 'being', 'against', 'idea,', 'government', 'not', 'position', 'effect', 'constitutional', 'amendment', 'exercise', 'would', 'require.', 'By', 'c

In [13]:
print(model['BJP'])

[ 0.01034851  0.01105697 -0.01922357  0.01441387  0.007324    0.01160606
  0.01114569 -0.0227556  -0.00512031  0.00748788 -0.0094018  -0.02438186
  0.0097628  -0.01747913 -0.01251031  0.02288307  0.01346745 -0.0127386
 -0.00168333 -0.00165065]


  """Entry point for launching an IPython kernel.


In [14]:
model.save('model.bin')
new_model = Word2Vec.load('model.bin')
print(new_model['BJP'])

[ 0.01034851  0.01105697 -0.01922357  0.01441387  0.007324    0.01160606
  0.01114569 -0.0227556  -0.00512031  0.00748788 -0.0094018  -0.02438186
  0.0097628  -0.01747913 -0.01251031  0.02288307  0.01346745 -0.0127386
 -0.00168333 -0.00165065]


  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
from sklearn.decomposition import PCA
from matplotlib import pyplot

X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
	pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

  after removing the cwd from sys.path.


<Figure size 640x480 with 1 Axes>

1) We can only tarin on good news as they are easily available. 
2) If fake news is needed then vectors of good news can be adulterated.
3) Fake news can be generated using GAN (generative adverisal network).

4) Vectorise news - 
    4.1) From google api get current hot news
    4.2) Sumarise them to reduce the input vector size.
    4.3) Vectorise them using google Woed2Vec. (Can train it on my own corpus)
    
5) Assume each news article as one large vector that will go as an input to the LSTM.
6) So, each word will have n-number of features from Word2Vec.
7) Therefor the input ector size wil be sizeof_sentence * n.
8) The input vector size must be constant as sizeof_sentense must be constant.

9) To get one size for multiple documents it can be done rndomly or based on some statistics. 

In [16]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing

In [17]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]
model = Doc2Vec(documents,
                vector_size=20,
                window=5,
                min_count=1,
                workers=multiprocessing.cpu_count(),
                sample=1e-5,seed=1)

In [18]:
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("my_doc2vec_model")
model.save(fname)
model = Doc2Vec.load(fname)  # you can continue training with the loaded model!

In [19]:
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
vector = model.infer_vector(corpus[5])

In [20]:
print(vector)

[ 0.0111726  -0.02167665 -0.00331736 -0.01955061  0.01728925 -0.02252975
  0.00612714  0.00746935 -0.00019651 -0.02375371 -0.01743418  0.02206131
 -0.02464705  0.01257668  0.01478318 -0.00874821 -0.00919097  0.0100348
  0.01369012 -0.02110121]
