In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load 'bbc_text_cls.csv' (path is relative to the working directory) and
# preview the first rows; the preview shows a 'text' and a 'labels' column.
df=pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
# Keep only the text of the articles labelled 'business'.
# Rows and column are selected in one .loc call (no chained indexing), and the
# index is reset to 0..n-1 so that positional integers such as
# np.random.choice(len(df_business)) are always valid lookups, wherever the
# business rows sit in the CSV.
df_business = df.loc[df['labels'] == 'business', 'text'].reset_index(drop=True)

In [6]:
# Preview the first few business articles.
df_business.head()

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

In [10]:
# NOTE(review): shows the same df_business preview again; the surrounding
# parentheses have no effect on the result.
(df_business.head())

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

In [12]:
# Take the first business article by POSITION (.iloc); plain [0] is a label
# lookup and only works when a row labelled 0 exists in the Series.
document = df_business.iloc[0]
# Split the article into sentences; each sentence is one node of the graph.
sentences = nltk.sent_tokenize(document)
sentences[:4]

['Ad sales boost Time Warner profit\n\nQuarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.',
 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales.',
 'TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.',
 'Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.']

In [14]:
# TF-IDF vectorizer configured with NLTK's English stop-word list.
vectorizer_tfidf=TfidfVectorizer(stop_words=stopwords.words('english'))

In [17]:
# Fit the vectorizer on the sentences of this single document and transform
# them; the resulting matrix has one row per sentence.
sent_vec=vectorizer_tfidf.fit_transform(sentences)
sent_vec.shape

(20, 191)

In [81]:
def cosine_similarity(sent_vec):
    """Return the pairwise cosine similarity between the rows of ``sent_vec``.

    Parameters
    ----------
    sent_vec : sparse matrix or 2-D array-like, shape (n, d)
        One row per sentence. Anything exposing ``toarray()`` (e.g. the
        output of ``TfidfVectorizer.fit_transform``) is densified first;
        dense input is used as is.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        ``out[i, j]`` is ``dot(row_i, row_j) / (|row_i| * |row_j|)``.
        Pairs involving an all-zero row get similarity ``0.0`` instead of
        the ``NaN`` a plain 0/0 division would produce.
    """
    # Accept both sparse matrices and plain arrays.
    if hasattr(sent_vec, "toarray"):
        dense = sent_vec.toarray()
    else:
        dense = sent_vec
    dense = np.asarray(dense, dtype=float)

    # Each row norm is computed once, not once per (i, j) pair.
    norms = np.linalg.norm(dense, axis=1)
    dot_products = dense @ dense.T
    denominators = np.outer(norms, norms)

    # Divide only where the denominator is non-zero; other entries stay 0.
    cosine_mat = np.zeros_like(dot_products)
    np.divide(dot_products, denominators, out=cosine_mat, where=denominators > 0)
    return cosine_mat
# Sentence-by-sentence cosine similarity of the TF-IDF rows.
co_sim_sent=cosine_similarity(sent_vec)
    

In [84]:
# The similarity matrix is square: one row and one column per sentence.
co_sim_sent.shape

(20, 20)

In [98]:
# Scale every row by its own total so that each row sums to 1.
co_sim_sent_norm = co_sim_sent / co_sim_sent.sum(axis=1, keepdims=True)
# Spot check on the first row.
co_sim_sent_norm[0].sum()

0.9999999999999998

In [106]:
# Uniform matrix with the same shape as the normalised similarities: every
# entry equals 1 / (number of rows), so each row sums to 1.
smoothing_vec=np.ones_like(co_sim_sent_norm)/len(co_sim_sent_norm)
# NOTE(review): the label reads 'shape or' - presumably 'shape of'; the string
# is left unchanged here.
print('shape or smoothing vec',smoothing_vec.shape)
print('')

shape or smoothing vec (20, 20)



In [107]:
# Weight given to the uniform matrix; the remainder goes to the similarities.
factor = 0.15
# Convex combination of the two row-normalised matrices.
smoothed_cos_sim_norm = (1 - factor) * co_sim_sent_norm + factor * smoothing_vec

In [116]:
# Check that every row of the smoothed matrix still sums to 1.
print(smoothed_cos_sim_norm.sum(axis=1))

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [144]:
# Eigen-decompose the TRANSPOSE: the eigenvectors np.linalg.eig returns for
# M.T are left eigenvectors of M (they satisfy v @ M = lambda * v).
eigenVal,eigenVec=np.linalg.eig(smoothed_cos_sim_norm.T)

In [145]:
# Inspect the eigenvalues.
eigenVal

array([1.        , 0.76474126, 0.71847649, 0.70769834, 0.29921287,
       0.67447393, 0.63322051, 0.62131277, 0.59857631, 0.34127887,
       0.35557532, 0.37607029, 0.38616546, 0.55030382, 0.53199406,
       0.50093334, 0.43328974, 0.45469954, 0.44154364, 0.44578085])

In [148]:
# Take the first column of eigenVec, i.e. the eigenvector paired with
# eigenVal[0] (which is 1 in this run).
# NOTE(review): np.linalg.eig does not promise any ordering of eigenvalues, so
# index 0 being the eigenvalue-1 vector should be confirmed per run.
eigenVec[:,0]

array([-0.26490394, -0.22255411, -0.25588489, -0.23454707, -0.21909577,
       -0.21162511, -0.21194466, -0.25928149, -0.21885458, -0.22178159,
       -0.23822027, -0.19537585, -0.23731512, -0.20001019, -0.20884387,
       -0.22606415, -0.19622201, -0.19163975, -0.23157572, -0.20728264])

In [149]:

# Multiply the vector by the smoothed matrix; for an eigenvalue-1 left
# eigenvector the product equals the vector itself.
eigenVec[:,0].dot(smoothed_cos_sim_norm)

array([-0.26490394, -0.22255411, -0.25588489, -0.23454707, -0.21909577,
       -0.21162511, -0.21194466, -0.25928149, -0.21885458, -0.22178159,
       -0.23822027, -0.19537585, -0.23731512, -0.20001019, -0.20884387,
       -0.22606415, -0.19622201, -0.19163975, -0.23157572, -0.20728264])

In [162]:
# Rescale the eigenvector so its entries sum to 1; dividing by the (negative)
# sum also turns the all-negative entries seen above into positive ones.
normalized_eigenvec=eigenVec[:,0]/eigenVec[:,0].sum()

In [163]:
# Inspect the normalised scores (one per sentence).
normalized_eigenvec

array([0.05948857, 0.04997821, 0.05746319, 0.05267143, 0.04920158,
       0.04752392, 0.04759568, 0.05822595, 0.04914742, 0.04980473,
       0.05349631, 0.04387488, 0.05329304, 0.0449156 , 0.04689935,
       0.05076645, 0.0440649 , 0.04303588, 0.05200416, 0.04654875])

In [164]:
# argsort on the negated scores gives indices from highest to lowest score.
sort_idx=np.argsort(-normalized_eigenvec)

# Print the five highest-scoring sentences, best first.
for i in sort_idx[:5]:
    print(sentences[i])
    

Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.
However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues.
TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.
Time Warner's fourth quarter profits were slightly better than analysts' expectations.
For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn.


In [180]:

def textrank_summarize(sentences, top_n=5, factor=0.15):
    """Pick the highest-ranked sentences of a document with a TextRank-style walk.

    The sentences are TF-IDF vectorised, their pairwise cosine similarities
    are row-normalised into a transition matrix, the matrix is mixed with a
    uniform matrix, and sentences are scored by the dominant left
    eigenvector of the result.

    Parameters
    ----------
    sentences : sequence of str
        Sentences of one document.
    top_n : int, default 5
        Maximum number of sentences to return.
    factor : float, default 0.15
        Weight of the uniform matrix in the mix; ``1 - factor`` is the
        weight of the similarity-based transitions.

    Returns
    -------
    list of str
        Up to ``top_n`` sentences ordered from highest to lowest score.
        An empty input yields an empty list.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to rank; avoids fitting a vectorizer on an empty corpus.
        return []

    tfidf_vec = TfidfVectorizer(stop_words=stopwords.words('english'))
    sent_vect = tfidf_vec.fit_transform(sentences)

    similarity = cosine_similarity(sent_vect)
    # A sentence whose vector is all zeros can come back as NaN (0/0);
    # treat it as "similar to nothing".
    similarity = np.where(np.isnan(similarity), 0.0, similarity)

    n_sent = similarity.shape[0]
    uniform = np.full_like(similarity, 1.0 / n_sent)

    # Row-normalise; rows without any similarity mass become uniform rows
    # instead of triggering a division by zero.
    row_sums = similarity.sum(axis=1, keepdims=True)
    has_mass = row_sums > 0
    transition = np.where(
        has_mass,
        similarity / np.where(has_mass, row_sums, 1.0),
        uniform,
    )

    smoothed = (1 - factor) * transition + factor * uniform

    eigenval, eigenvec = np.linalg.eig(smoothed.T)
    # np.linalg.eig does not order its eigenvalues, so locate the largest one
    # explicitly; .real drops a zero imaginary part if the result came back
    # with a complex dtype.
    dominant = np.argmax(eigenval.real)
    scores = eigenvec[:, dominant].real
    # Dividing by the sum makes the scores sum to 1 and fixes the sign.
    scores = scores / scores.sum()

    sorted_idx = np.argsort(-scores)
    return [sentences[i] for i in sorted_idx[:top_n]]
    
    
    

In [183]:
# Pick a random POSITION in df_business and fetch that article with .iloc;
# plain df_business[i] is a label lookup and fails when the labels are not 0..n-1.
# NOTE(review): the draw is not seeded, so the chosen article (and the output
# below) changes between runs.
i = np.random.choice(len(df_business))
document = df_business.iloc[i]
sentences = nltk.sent_tokenize(document)
summarized = textrank_summarize(sentences)
summarized

['The plan is likely to give creditors of Parmalat Finanziaria shares worth about 5.7% of the debts they are owed.',
 'Creditors of Parmalat, the main operating company, are likely to see the percentage of debt they receive fall from 7.3% to 6.9%.',
 'Parmalat to return to stockmarket\n\nParmalat, the Italian dairy company which went bust after an accounting scandal, hopes to be back on the Italian stock exchange in July.',
 "As part of the re-listing on the Italian stock exchange, creditors' debts are expected to be converted into shares through two new share issues amounting to more than 2bn euros.",
 'This is lower than the 11.3% creditors previously hoped to receive.']