In [1]:
import pandas as pd
import numpy as np

In [2]:
# dt = np.dtype([('article', np.unicode_), ('highlights', np.unicode_)])
# data = np.fromfile('data/test.bin', dtype=dt)
# df = pd.DataFrame(data)

### Reading the debateall dataset

In [3]:
# Load the debate corpus, then strip the pilcrow (U+00B6) markers from
# both text columns before previewing the first rows.
data = pd.read_csv('data/debateall.csv', encoding='utf-8')
for text_column in ('Full-Document', 'Extract'):
    data[text_column] = data[text_column].str.replace(u"\u00B6", "")
data.head()

Unnamed: 0,Full-Document,Citation,Extract,Abstract,#CharsDocument,#CharsAbstract,#CharsExtract,#WordsDocument,#WordsAbstract,#WordsExtract,AbsCompressionRatio,ExtCompressionRatio,OriginalDebateFileName
0,German Chancellor Angela Merkel demanded on Th...,O’Donnell and Baker ’13 [JOHN O'DONNELL AND LU...,"Merkel demanded the U S strike a ""no-spying"" a...",Text: The United States should offer to accede...,3155,132,1416,506,22,231,0.043478,0.456522,Bulk Data Collection Negative - JDI 2015.html5
1,Judge Leon last week questioned the effectiven...,"Cohen ’13 [ANDREW COHEN, DEC 27, 2013, The Atl...",Pauley asserted The effectiveness of bulk tele...,Unique link- judicial deference specifically o...,367,132,100,52,22,13,0.423077,0.25,Bulk Data Collection Negative - JDI 2015.html5
2,New York Times July 25th 2013\n(POLITICS Rober...,Rosenthal 6/12/15\n(“Government's Secret Surve...,The recent leaks about government spying progr...,Issues of national surveillance are a key issu...,10386,61,2223,1639,10,355,0.006101,0.216595,Bulk Data Collection Negative - JDI 2015.html5
3,Posner 2007\n(“Terror in the Balance: Security...,New York Times July 25th 2013\n(POLITICS Rober...,"When national emergencies strike, the executiv...",Chief Justice Roberts ensures issues of nation...,1598,99,669,226,14,98,0.061947,0.433628,Bulk Data Collection Negative - JDI 2015.html5
4,Goitien April 17th 2015\n(Appointing Democrati...,Posner 2007\n(“Terror in the Balance: Security...,Chief Justice Roberts recently named two new j...,Courts empirically defer instances of national...,7696,56,1482,1201,7,238,0.005828,0.198168,Bulk Data Collection Negative - JDI 2015.html5


In [4]:
# Show the complete source text of the first row.
data.at[0, 'Full-Document']

'German Chancellor Angela Merkel demanded on Thursday that the United States strike a "no-spying" agreement with Berlin and Paris by the end of the year, saying alleged espionage against two of Washington\'s closest EU allies had to be stopped. Speaking after talks with EU leaders that were dominated by allegations that the U.S. National Security Agency had accessed tens of thousands of French phone records and monitored Merkel\'s private mobile phone, the chancellor said she wanted action from President Barack Obama, not just apologetic words. Germany and France would seek a "mutual understanding" with the United States on cooperation between their intelligence agencies, and other EU member states could eventually take part. "That means a framework for cooperation between the relevant (intelligence) services.Germany and France have taken the initiative and other member states will join," she said. In a statement issued after the first day of the summit, the EU\'s 28 leaders said they 

In [5]:
# Show the complete extractive summary of the first row.
data.at[0, 'Extract']

'Merkel demanded the U S strike a "no-spying" agreement with Berlin and Paris saying alleged espionage against EU allies had to be stopped the chancellor said she wanted action from Obama Germany and France would seek a "mutual understanding" with the United States on cooperation between their intelligence agencies, and other EU member states could eventually take part the EU\'s 28 leaders said they supported the Franco-German plan Merkel raised the with Obama but nothing came of it The United States has a "no-spying" deal with Britain, Australia, New Zealand and Canada, an alliance known as "Five Eyes" there has traditionally been a reluctance to make similar arrangements with other allies Merkel said an accord with Washington was long overdue The friendship and partnership between the European member states, includingGermany, and the United States is not a one-way street. We depend on it the United States also needs friends in the w As EU leaders arrived for the two-day summit there 

### Learning how to generate embeddings using BERT

In [6]:
# !pip install simpletransformers

In [7]:
from simpletransformers.language_representation import RepresentationModel

In [8]:
# Keep only the first 1000 source documents (positional slice).
sentences = data['Full-Document'].iloc[:1000]
sentences.head()

0    German Chancellor Angela Merkel demanded on Th...
1    Judge Leon last week questioned the effectiven...
2    New York Times July 25th 2013\n(POLITICS Rober...
3    Posner 2007\n(“Terror in the Balance: Security...
4    Goitien April 17th 2015\n(Appointing Democrati...
Name: Full-Document, dtype: object

In [9]:
# import re


In [10]:
# Keep only the first 1000 reference extracts (positional slice).
summaries = data['Extract'].iloc[:1000]
summaries.head()

0    Merkel demanded the U S strike a "no-spying" a...
1    Pauley asserted The effectiveness of bulk tele...
2    The recent leaks about government spying progr...
3    When national emergencies strike, the executiv...
4    Chief Justice Roberts recently named two new j...
Name: Extract, dtype: object

### Defining a model that produces BERT embeddings (set `use_cuda=True` to run on the GPU)

In [11]:
# sentences = ['Machine Learning and Deep Learning are part of AI', 'Data Science will excel in future']
# Imported here so the cell is self-contained.
import torch

# BERT sentence-representation model. The GPU is used when CUDA is available;
# otherwise the model falls back to the CPU instead of failing on machines
# without a GPU (a hard-coded use_cuda=True cannot run there).
model = RepresentationModel(
        model_type='bert',
        model_name='bert-base-uncased',
        use_cuda=torch.cuda.is_available()
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTextRepresentation: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTextRepresentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTextRepresentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Running K-Means clustering on the embeddings and finding the optimal K

In [24]:
from collections import defaultdict

def findSummary(kMeans, word_vectors, document):
    """Pick one representative sentence per cluster, in document order.

    For every cluster present in ``kMeans.labels_`` the sentence whose vector
    is closest (Euclidean distance) to that cluster's centroid is selected.

    Parameters
    ----------
    kMeans : object
        A fitted clustering model exposing ``labels_`` (one integer label per
        row of ``word_vectors``) and ``cluster_centers_`` (one centroid per
        label).
    word_vectors : array-like
        One embedding per sentence; row ``i`` must correspond to
        ``document[i]``.
    document : sequence
        The sentences that were embedded, in their original order.

    Returns
    -------
    list
        The selected sentences, sorted by their position in ``document``.
        Empty when there are no vectors.
    """
    labels = np.asarray(kMeans.labels_)
    if len(labels) == 0:
        return []
    centers = np.asarray(kMeans.cluster_centers_)
    vectors = np.asarray(word_vectors)

    # Distance of every sentence to the centroid of the cluster it belongs to.
    # (The centroid vector is looked up by label; the label itself must never
    # be used in the distance computation.)
    distances = np.linalg.norm(vectors - centers[labels], axis=1)

    selected = []
    for label in np.unique(labels):
        # Indices into the *document*, not positions inside the cluster.
        members = np.flatnonzero(labels == label)
        # argmin keeps the earliest sentence on ties.
        selected.append(int(members[np.argmin(distances[members])]))

    selected.sort()
    return [document[index] for index in selected]

In [37]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import re
from sklearn.metrics import hamming_loss, accuracy_score, silhouette_score
import torch.nn as nn

# Number of documents processed per run (the original loop stopped after i == 4).
MAX_DOCUMENTS = 5
# Largest number of clusters tried for each document.
MAX_CLUSTERS = 10

# `with` guarantees the output file is closed even if encoding or clustering raises.
with open('predictions.txt', 'w') as predicted_summaries:
    for i in range(min(MAX_DOCUMENTS, len(sentences))):
        print(f'\rRunning for i = {i + 1} document')

        # Split the document into sentences: break on whitespace that follows
        # '.' or '?', except after patterns such as "e.g." or "Mr.".
        document = re.sub(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', '\n', sentences[i]).split('\n')
        word_vectors = model.encode_sentences(document, combine_strategy='mean')
        print(word_vectors.shape)

        # Try every k from 2 up to MAX_CLUSTERS (and below the sentence count),
        # keeping each fitted model so the best one need not be refit.
        silhouette_scores = []
        fitted_models = []
        for j in range(2, min(MAX_CLUSTERS + 1, len(word_vectors))):
            kMeans = KMeans(n_clusters=j, random_state=42).fit(word_vectors)
            score = silhouette_score(word_vectors, kMeans.labels_)
            print(word_vectors.shape, kMeans.labels_.shape, kMeans.labels_, score)
            silhouette_scores.append(score)
            fitted_models.append(kMeans)

        # Documents with too few sentences cannot be clustered; skip them.
        if not silhouette_scores:
            continue

        # The highest silhouette score wins; index 0 corresponds to k = 2.
        best_index = int(np.argmax(silhouette_scores))
        best_k = best_index + 2
        kMeansModel = fitted_models[best_index]
        summary = findSummary(kMeansModel, word_vectors, document)
        print(f'Best k: {best_k}, Length of selected summary: {len(summary)}')

        # Writing of predictions is currently disabled; enable when needed:
        # predicted_summaries.write(str(summary))
        # predicted_summaries.write('\n')
        # predicted_summaries.write(str(summaries[i]))
        # predicted_summaries.write('\n')
        # predicted_summaries.write('\n')

Running for i = 1 document
(19, 768)
(19, 768) (19,) [0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0] 0.4048729
(19, 768) (19,) [1 1 1 0 0 1 1 1 1 0 2 1 0 1 2 0 1 0 1] 0.17226088
(19, 768) (19,) [0 1 0 1 1 0 0 0 0 1 2 3 1 0 2 1 0 1 0] 0.15841855
(19, 768) (19,) [0 1 0 1 1 0 0 0 0 1 2 3 1 0 4 1 0 1 0] 0.13263188
(19, 768) (19,) [1 1 0 3 1 0 0 0 0 3 5 4 1 0 2 3 1 3 1] 0.07901874
(19, 768) (19,) [0 0 0 3 3 0 0 0 0 3 2 4 3 1 6 5 0 5 0] 0.10174414
(19, 768) (19,) [1 1 0 4 1 1 1 6 1 4 2 5 3 0 7 4 1 4 1] 0.11002001
(19, 768) (19,) [1 1 4 5 1 1 1 8 4 5 6 3 0 7 2 5 1 5 1] 0.09663949
(19, 768) (19,) [0 0 6 2 0 0 9 7 0 2 4 3 5 8 1 2 0 2 0] 0.07734692
Best k: 2, Length of selected summary: 2
Running for i = 2 document
(1, 768)
Running for i = 3 document
(70, 768)
(70, 768) (70,) [1 1 1 0 1 1 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 0
 0 1 1 1 0 1 1 1 0 1 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1] 0.45677018
(70, 768) (70,) [2 1 1 2 1 1 1 0 0 1 1 1 1 0 2 0 2 1 1 1 1 1 1 1 1 2 1 1 1 1 