In [1]:
from nltk.tokenize import sent_tokenize
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import numpy as np
import language_check
import torch
import math
import json 
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def bertSent_embeding(sentences):

    ## Add sentence head and tail as BERT requested
    marked_sent = ["[CLS] " +item + " [SEP]" for item in sentences]
    
    ## USE Bert tokenizization 
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenized_sent = [tokenizer.tokenize(item) for item in marked_sent]
    
    ## index to BERT vocabulary
    indexed_tokens = [tokenizer.convert_tokens_to_ids(item) for item in tokenized_sent]
    tokens_tensor = [torch.tensor([item]) for item in indexed_tokens]
    
    ## add segment id as BERT requested
    segments_ids = [[1] * len(item) for ind,item in enumerate(tokenized_sent)]
    segments_tensors = [torch.tensor([item]) for item in segments_ids]
    
    ## load BERT base model and set to evaluation mode
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    bert_model.eval()
    
    ## Output 12 layers of latent vector
    assert len(tokens_tensor) == len(segments_tensors)
    encoded_layers_list = []
    for i in range(len(tokens_tensor)):
        with torch.no_grad():
            encoded_layers, _ = bert_model(tokens_tensor[i], segments_tensors[i])
        encoded_layers_list.append(encoded_layers)
    
    ## Use only the last layer vetcor, other choice available
    token_vecs_list = [layers[11][0] for layers in encoded_layers_list]
    
    ## Pooling word vector to sentence vector, use mean pooling, other choice available
    sentence_embedding_list = [torch.mean(vec, dim=0).numpy() for vec in token_vecs_list]
    
    
    
    return sentence_embedding_list

In [3]:
def kmeans_sumIndex(sentence_embedding_list):
    
    n_clusters = np.ceil(len(sentence_embedding_list)**0.5)
    kmeans = KMeans(n_clusters=int(n_clusters))
    kmeans = kmeans.fit(sentence_embedding_list)
    
    sum_index,_ = pairwise_distances_argmin_min(kmeans.cluster_centers_, sentence_embedding_list,metric='euclidean')
    
    sum_index = sorted(sum_index)
    
    return sum_index

In [4]:
def bertSummarize(text):
    
    sentences = sent_tokenize(text)
    #print(sentences)
    sentence_embedding_list = bertSent_embeding(sentences)
    #print(sentence_embedding_list)
    #print("hi")
    sum_index = kmeans_sumIndex(sentence_embedding_list)
    summary = ' '.join([sentences[ind] for ind in sum_index])
    
    return summary

In [5]:
text ="""Hello this is Balaji Pado from the team's parking newbies with my teammates exam Pareek and Kishan Pathani and not topic for project is sentiment analysis of covid-19 tweets visualisation dashpot. So this is the URL of a project which is made using flask and other tools so my want to the first tape that it takes so here person can manually type in the sentiment of any other statements and then there is a prediction of that statement whether it is positive or negative so let's. Let's I am Corona positive patients see the statement is actually negative and it is also the negative symbol is it means that this model is a having a good accuracy and prediction by this modern is of symmetry near 200 which is a good score. So what we are doing in next part is that we are scraping live tweets and then will predict the sentiment of those trips. So how will do that is will search for covid-19 tweets will scrape the twits and Infratech and we can see there are three sections the tweet the time and date and the sentiment of the twits. So here we have also add add one more unique feature that is text to audio so it will convert the result in audio. Let's scraper life that we can see it that way it is script this is a live to it and will when will click on text to audio will use gtts and the same script to it will be converted to audio order food. In terms of the Desolation part it consists of the live graph graph update daily on clicking the refresh button. Once you click the refresh button the code runs in the back and forth in the grass is updated on completely remove wrinkles on the screen the dashboard is displayed of debit card on refreshing as I have clicked this it would show the live updates itself and now you can see that the life graphics. silvarpatti of the negative reactions of the people that is negative weight while the golden path is of depositing and there in this graph the Blue button positive with the red part is for the negative it. In this dashboard Park again this is the mode It is there is the difference of whs dashboard and the billiard sentence about us and it also contains the contact us feature and also our profile of various social media effect. This whole web page is designed using the HTML as well as the part which book particularly is the contribution in our project by that member. So this is all about of sentiment analysis project. Thank you """

In [6]:
def extractive_summary(text) ->str:
    a=bertSummarize(text)
    tool = language_check.LanguageTool('en-US')
    matches1 = tool.check(a)
    return language_check.correct(a, matches1)

In [20]:
def abstractive_summary(text) -> str:
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    device = torch.device('cpu')
    preprocess_text = text.strip().replace("\n","")
    t5_prepared_Text = "summarize: "+preprocess_text
    tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt").to(device)
    # summmarize 
    summary_ids = model.generate(tokenized_text,num_beams=3,no_repeat_ngram_size=1,min_length=50, max_length=100,early_stopping=True)
    tool = language_check.LanguageTool('en-US')
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    matches2 = tool.check(output)
    return language_check.correct(output, matches2)

In [21]:
abstractive=abstractive_summary(text)
extractive =extractive_summary(text)

Some weights of the model checkpoint at t5-small were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


[1, 3, 9, 13]


In [22]:
print("\nExtractive Summary: ")
print(extractive)
print("\nAbstractive Summary: ")
print(abstractive)


Extractive Summary: 
So this is the URL of a project which is made using flask and other tools so my want to the first tape that it takes so here person can manually type in the sentiment of any other statements and then there is a prediction of that statement whether it is positive or negative so lets. So what we are doing in next part is that we are scraping live tweets and then will predict the sentiment of those trips. Silvarpatti of the negative reactions of the people that is negative weight while the golden path is of depositing and there in this graph the Blue button positive with the red part is for the negative it. Thank you

Abstractive Summary: 
Sentiment analysis of covid-19 tweets visualization dash pot is topic for project. So we are scraping live twitter and then will predict that statement, whether it was positive or negative in the next part to be used by people who have been on social media sites since 2000-2001 as an example from our own team with my teammates exam

In [30]:
file1 = open("myfile1.txt","w+",encoding="latin-1") 

In [31]:
l=[]
b=""
j=0
for i in extractive:
    j=j+1
    if j==130:
        b=b+i
        l.append(b)
        l.append("\n")
        b=""
        j=1
    else:
        b=b+i
l.append(b)
file1.writelines(l)
file1.close()

In [32]:
file2 = open("myfile2.txt","w+",encoding="latin-1") 

In [33]:
l=[]
b=""
j=0
for i in abstractive:
    j=j+1
    if j==130:
        b=b+i
        l.append(b)
        l.append("\n")
        b=""
        j=1
    else:
        b=b+i
l.append(b)
file2.writelines(l)
file2.close()

In [34]:
# Python program to convert 
# text file to pdf file 


from fpdf import FPDF 

# save FPDF() class into 
# a variable pdf 
pdf = FPDF() 

# Add a page 
pdf.add_page() 

# set style and size of font 
# that you want in the pdf 
f = open("myfile1.txt", "r",encoding="latin-1") 

pdf.set_font("Arial","U", size = 20) 

# open the text file in read mode 

pdf.set_text_color(255,0,0) 
pdf.cell(200, 10, txt = "Smart Meet", ln = 1, align = 'C') 

pdf.set_text_color(0,0,0) 

pdf.set_font("Times", size = 10)
pdf.cell(200, 10, txt = "----------------------------------------------------------------------------------------------------------", ln = 3, align = 'C') 
pdf.set_font("Arial","B", size = 15)
pdf.cell(200, 10, txt = "Short Summary of your meet !!!", ln = 5, align = 'C') 

# insert the texts in pdf 
pdf.set_font("Times", size = 10)
for x in f: 
    pdf.cell(200, 10, txt = x, ln = 7, align = 'J') 
# save the pdf with name .pdf

pdf.set_font("Times", size = 10)
pdf.cell(200, 10, txt = "-----------------------------------------------------", ln = 9, align = 'C') 
pdf.set_font("Arial","B", size = 15)
pdf.add_page()
pdf.cell(200, 10, txt = "AI based minutes of your meet !!!", ln = 11, align = 'C') 
pdf.set_font("Times", size = 10)

f = open("myfile2.txt", "r",encoding="latin-1") 
for x in f: 
    pdf.cell(200, 10, txt = x, ln = 13, align = 'J')

pdf.cell(200, 10, txt = "-----------------------------------------------------", ln = 15, align = 'C') 
pdf.output("Smart Meet.pdf") 


''

In [None]:
from pdf_mail import sendpdf 
  
# Taking input of following values 
# ex-"abcd@gmail.com"  
sender_email_address = "balajipadamwarp@gmail.com"
  
# ex-"xyz@gmail.com"  
receiver_email_address = input()  
  
# ex-" abcd1412"  
sender_email_password = "**********" 
  
# ex-"Heading of email" 
subject_of_email = "Summary by Smart Meet software"     
   
# ex-" Matter to be sent" 
body_of_email = "Please go through the attachment!" 
   
# ex-"Name of file"  
filename = "Smart Meet"         
  
# ex-"C:/Users / Vasu Gupta/ " 
location_of_file = "C:/Users/Balaji Padamwar"  
  
  
# Create an object of sendpdf function  
k = sendpdf(sender_email_address,  
            receiver_email_address, 
            sender_email_password, 
            subject_of_email, 
            body_of_email, 
            filename, 
            location_of_file) 
  
# sending an email 
k.email_send()