In [1]:
import torch
from transformers import BartForConditionalGeneration
from transformers.modeling_outputs import BaseModelOutput


class BartForMultiConditionalGeneration(BartForConditionalGeneration):

    def multi_encode(
        self,
        input_ids=None,
        attention_mask=None,
        return_dict=None
    ):
        # (B, N, L) -> (B*N, L) -> (B*N, L, D) -> (B, N*L, D)
        # (B, N, L) -> (B*N, L) -> (B, N*L)
        B = input_ids.size(0)  # batch-size
        N = input_ids.size(1)  # num-docs
        L = input_ids.size(2)  # max_len
        if input_ids.size() != attention_mask.size():
            raise ValueError(
                f"Input ids different shape ({input_ids.size()}) than attention mask ({attention_mask.size()})"
            )
        input_ids = input_ids.contiguous().view(B * N, L)
        attention_mask = attention_mask.contiguous().view(B * N, L)
        encoder_outputs = self.model.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=return_dict
        )
        if return_dict:
            hidden_states = encoder_outputs.last_hidden_state
        else:
            hidden_states = encoder_outputs[0]
        # hidden_states: (B * N, L, D)
        D = hidden_states.size(2)
        stacked_source_reps = hidden_states.contiguous().view(B, N * L, D)
        if return_dict:
            encoder_outputs = BaseModelOutput(last_hidden_state=stacked_source_reps)
        else:
            encoder_outputs = (stacked_source_reps,)
        stacked_source_mask = attention_mask.contiguous().view(B, N * L)
        return encoder_outputs, stacked_source_mask

    @torch.no_grad()
    def generate(
        self,
        input_ids=None,
        attention_mask=None,
        **kwargs,
    ):
        encoder_outputs, attention_mask = self.multi_encode(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        return super().generate(
            input_ids=None,
            attention_mask=attention_mask,
            encoder_outputs=encoder_outputs,
            **kwargs
        )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None
    ):

        if input_ids is None:
            if encoder_outputs is None:
                raise ValueError("Encoder outputs is required when no input ids passed")
        else:
            encoder_outputs, attention_mask = self.multi_encode(
                input_ids=input_ids,
                attention_mask=attention_mask,
                return_dict = return_dict
                # encoder_outputs=encoder_outputs
            )

        output = super().forward(
            input_ids=None,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        return output

In [2]:
import re

In [3]:
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)

In [4]:
config = AutoConfig.from_pretrained("segenc-qmsum-16384-512-wikisum-1")

In [5]:
model_class = AutoModelForSeq2SeqLM
model = model_class.from_pretrained("segenc-qmsum-16384-512-wikisum-1",config = config)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("segenc-qmsum-16384-512-wikisum-1",use_auth_token=True)

In [7]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50265, 1024, padding_idx=1)

In [8]:
model.config.decoder_start_token_id

2

In [9]:
key_yr_chap_mp = {'Karamchand_Gandhi_1869': ['1\\chapter1'],
             'Alias_Kaba_Gandhi_1869': ['1\\chapter1'],
             'Uttamchand_Gandhi_1869': ['1\\chapter1'],
             'Faithfulness_1869': ['1\\chapter4'],
             'Smoking_Stumps_1887': ['1\\chapter7'],
             'Cigarettes_1887': ['1\\chapter7'],
             'Cigarette_1887': ['1\\chapter7'],
             'Vaishnava_Faith_1887': ['1\\chapter9'],
             'Ahmedabad_1887': ['1\\chapter10'],
             'Ahmedabad_1915': ['5\\chapter9', '5\\chapter20'],
             'Matriculation_Examination_1887': ['1\\chapter10'],
             'Kathiawad_Students_1887': ['1\\chapter10'],
             'Ringworm_1887': ['1\\chapter13'],
             'Vegetarianism_1887': ['1\\chapter14'],
             'Vegetarianism_1890': ['1\\chapter16'],
             'Vegetarianism_1903': ['4\\chapter6'],
             'Dietary_1887': ['1\\chapter14'],
             'Diet_1887': ['1\\chapter14'],
             'Vegetarian_1890': ['1\\chapter16'],
             'Narayan_Hemchandra_1890': ['1\\chapter21'],
             'Time_Narayan_Hemchandra_1890': ['1\\chapter21'],
             'Narayan_1890': ['1\\chapter21'],
             'Eiffel_Tower_1890': ['1\\chapter22'],
             'Legal_Maxims_1857': ['1\\chapter24'],
             'Briefless_Barristers_1893': ['2\\chapter3'],
             'Barrister_1893': ['2\\chapter3', '2\\chapter4'],
             'Barrister_1885': ['2\\chapter14'],
             'Barrister_1890': ['4\\chapter46'],
             'Barristers_1893': ['2\\chapter3'],
             'Bombay_Commissions_1893': ['2\\chapter4'],
             'Port_Natal_1893': ['2\\chapter7'],
             'Christian_Indians_1924': ['2\\chapter8'],
             'Prayers_1893': ['2\\chapter11'],
             'Prayer_1893': ['2\\chapter11'],
             'Prayer_1885': ['2\\chapter15'],
             'Transvaal_1885': ['2\\chapter13'],
             'Transvaal_1920': ['3\\chapter4'],
             'Transvaal_1903': ['4\\chapter2'],
             'Transvaal_1890': ['4\\chapter46'],
             'Litigation_1885': ['2\\chapter14'],
             'Natal_Legislative_Assembly_1893': ['2\\chapter16'],
             'Durban_1893': ['2\\chapter16'],
             'Sheth_Haji_Muhammad_1893': ['2\\chapter17'],
             'Sheth_Haji_Muhammad_Haji_Dada_1893': ['2\\chapter17'],
             'Financially_Sheth_Abdulla_Haji_1893': ['2\\chapter17'],
             'Bombay_High_Court_1893': ['2\\chapter18'],
             'Court_1893': ['2\\chapter18'],
             'Natal_Indian_Congress_1894': ['2\\chapter19', '2\\chapter20'],
             'Natal_Indians_1860': ['2\\chapter21'],
             'Natal_Zulus_1860': ['2\\chapter21'],
             'Taxation_1860': ['2\\chapter21'],
             'Servant_1896': ['2\\chapter23'],
             'Household_1896': ['2\\chapter23'],
             'Loyalty_1896': ['2\\chapter26'],
             'Sir_Pherozeshah_1896': ['2\\chapter28'],
             'Calcutta_1896': ['2\\chapter29'],
             'Raja_Sir_Pyarimohan_Mukarji_1896': ['2\\chapter29'],
             'Bengal_Club_1896': ['2\\chapter29'],
             'Hindu_Wife_1920': ['3\\chapter1'],
             'Hindu_Husband_1920': ['3\\chapter1'],
             'Quarantine_1920': ['3\\chapter2'],
             'Quarantine_Officer_1920': ['3\\chapter2'],
             'Plague_1920': ['3\\chapter2'],
             'Brahmacharya_Vow_1906': ['3\\chapter7', '3\\chapter8'],
             'Brahmacharya_1906': ['3\\chapter7'],
             'Brahmacharya_1912': ['4\\chapter30'],
             'Vow_1906': ['3\\chapter8'],
             'Washing_Outfit_1906': ['3\\chapter9'],
             'Laundry_1906': ['3\\chapter9'],
             'Washing_1906': ['3\\chapter9'],
             'Boer_War_1899': ['3\\chapter10'],
             'Boer_War_1914': ['4\\chapter39'],
             'Sir_Pherozeshah_Mehta_1901': ['3\\chapter13'],
             'Babu_Bhupendranath_Basu_1901': ['3\\chapter14'],
             'India_Club_1901': ['3\\chapter16'],
             'Professor_Ray_1896': ['3\\chapter17'],
             'Gokhlae_1896': ['3\\chapter18'],
             'Christian_Indian_1896': ['3\\chapter18'],
             'Babu_Kalicharan_Banerji_1896': ['3\\chapter18'],
             'Brahmo_Samaj_1896': ['3\\chapter19'],
             'Adi_Brahmo_Samaj_1896': ['3\\chapter19'],
             'Maharshi_Devendranath_1896': ['3\\chapter19'],
             'Indian_Evidence_Act_1919': ['3\\chapter21'],
             'Insult_1903': ['4\\chapter3'],
             'Hindu_Influence_1903': ['4\\chapter5'],
             'Hinduism_1903': ['4\\chapter5'],
             'Hindu_1903': ['4\\chapter5'],
             'Laxative_Medicines_1903': ['4\\chapter7'],
             'Laxatives_1903': ['4\\chapter7'],
             'Indian_Opinion_1900': ['4\\chapter8'],
             'Indian_Opinion_1904': ['4\\chapter19'],
             'Hindus_1904': ['4\\chapter14'],
             'Zulus_1900': ['4\\chapter25'],
             'Zulu_1900': ['4\\chapter25'],
             'Satyagraha_1900': ['4\\chapter26'],
             'Satyagraha_1919': ['5\\chapter24'],
             'Word_Sadagraha_1900': ['4\\chapter26'],
             'Result_Maganlal_Gandhi_1900': ['4\\chapter26'],
             'Curry_Powder_1914': ['4\\chapter29'],
             'Fasting_1912': ['4\\chapter31'],
             'Tolstoy_Farm_1911': ['4\\chapter35'],
             'Commanding_Officer_1890': ['4\\chapter40'],
             'Commanding_1890': ['4\\chapter40'],
             'Parsi_Rustomji_1890': ['4\\chapter47'],
             'Bombay_Gokhale_1915': ['5\\chapter2'],
             'Kakasaheb_1915': ['5\\chapter4'],
             'Kalelkar_1915': ['5\\chapter4'],
             'Teachers_Family_Names_1915': ['5\\chapter4'],
             'Acharya_Ramadevji_1915': ['5\\chapter8'],
             'Gurukul_1915': ['5\\chapter8'],
             'Gujarat_1915': ['5\\chapter9'],
             'Satyagraha_Ashram_1915': ['5\\chapter9'],
             'Amritlal_Thakkar_1915': ['5\\chapter10'],
             'Ashram_1915': ['5\\chapter10', '5\\chapter21'],
             'Abolition_1894': ['5\\chapter11'],
             'Indigo_1917': ['5\\chapter12'],
             'Professor_Kripalani_1915': ['5\\chapter13'],
             'Maulana_Mazharul_Haq_1915': ['5\\chapter13'],
             'Champaran_Inquiry_1915': ['5\\chapter16'],
             'Champaran_Satyagraha_1915': ['5\\chapter16'],
             'Rajendrababu_1915': ['5\\chapter17'],
             'Janakdharibabu_1915': ['5\\chapter17'],
             'Gangadharrao_Deshpande_1915': ['5\\chapter17'],
             'Sanitation_Work_1915': ['5\\chapter18'],
             'Sanitation_1915': ['5\\chapter18'],
             'Treatment_1915': ['5\\chapter18'],
             'Bihar_Government_1915': ['5\\chapter19'],
             'Ashram_Walls_1915': ['5\\chapter21'],
             'Ashram_Children_1915': ['5\\chapter21'],
             'Kheda_Satyagraha_1919': ['5\\chapter23'],
             'Mohanlal_Pandya_1919': ['5\\chapter23'],
             'Shankarlal_Parikh_1919': ['5\\chapter23'],
             'Satyagrahi_Volunteers_1919': ['5\\chapter24'],
             'Satyagrahi_1919': ['5\\chapter24'],
             'Mamlatdar_1919': ['5\\chapter25'],
             'Hartal_1919': ['5\\chapter31'],
             'Swami_Shraddhanandji_1919': ['5\\chapter31'],
             'Meanwhile_Delhi_1919': ['5\\chapter31'],
             'Kheda_Satyagraha_Struggle_1919': ['5\\chapter33'],
             'Jalianwala_Bagh_Tragedy_1919': ['5\\chapter34'],
             'Punjab_Situation_1919': ['5\\chapter34'],
             'Punjab_Government_1917': ['5\\chapter35', '5\\chapter37'],
             'Jalianwala_Bagh_Massacre_1917': ['5\\chapter35', '5\\chapter38'],
             'Punjab_Leaders_1917': ['5\\chapter35'],
             'Pandit_Motilal_Nehru_1917': ['5\\chapter37'],
             'Pandit_1917': ['5\\chapter37'],
             'Slivers_1917': ['5\\chapter40'],
             'Swadeshi_Agitation_1920': ['5\\chapter41'],
             'Swadeshi_Movement_1920': ['5\\chapter41'],
             'Khilafat_Agitation_1920': ['5\\chapter42']}

In [14]:
filtered_keywords = ['Vegetarianism_1890']
mp = {}
for f in filtered_keywords:
    mp[f] = key_yr_chap_mp[f]
print(mp)

{'Vegetarianism_1890': ['1\\chapter16']}


In [15]:
def chapter_inputs(key):
    file_loc = r"C:\Users\hp\OneDrive\Documents\topic_coherence\SOMEWT\part%s.txt"%key
    text = ""
    with open(file_loc) as f:
        text = f.read()
        text = re.sub(r"I ", "Mahatma Gandhi ",text)
        text = re.sub(r" me ", " Mahatma Gandhi ",text)
        text = re.sub(r" my ", " Mahatma Gandhi's ",text)
        text = re.sub(r" My ", " Mahatma Gandhi's ",text)
        text = re.sub(r" myself ", " Mahatma Gandhi ",text)
        text = re.sub(r" me,", "  Mahatma Gandhi,",text)
        text = re.sub(r" me.", "  Mahatma Gandhi.",text)
        text = re.sub(r"we ","they " ,text)
        text = re.sub(r"We ","They " ,text)
            
    return text

In [24]:
inputs = ['Vegetarianism.'+chapter_inputs(mp['Vegetarianism_1890'][0])]

In [25]:
model_inputs = tokenizer(inputs, max_length=512, truncation=True,return_tensors='pt')

In [26]:
# model(model_inputs['input_ids'])
output = model.generate(input_ids=model_inputs["input_ids"])

In [27]:
question = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(output.detach().squeeze()))

In [28]:
question

'</s><s><s><s>Vegetarianism. The writers on vegetarianism had examined the question very minutely, attacking it in its religious, scientific, practical, practical and Mahatma Gandhi.ical aspects. They had also brought out the truth that man eats not for enjoyment but to live. A vegetarian diet was the least expensive. The practical and economic argument they had demonstrated demonstrated that a vegetarian diet</s>'

In [53]:
inputs = chapter_inputs(mp['Satyagraha_1900'][0])+'Satyagraha.'
model_inputs = tokenizer(inputs, max_length=512, truncation=True,return_tensors='pt')
output = model.generate(input_ids=model_inputs["input_ids"])
qfs = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(output.detach().squeeze()))
print(qfs)

</s><s>The principle called Satyagraha came into being before that name was invented. The principle Sadagraha -Sat: truth, Agraha: firmness - won the prize. The word Satyagraha has since become current in Gujarati as a designation for the struggle. The history of this struggle is for all practical purposes a history of the remainder of Mahatma Gandhi's life in South Africa and especially of the experiments with truth in that sub-continent. The text is a major portion of this history in Yeravda jail and finished it after Mahatm Gandhi was released. It was published in Navajivan and subsequently issued in book form. Mahat</s>


In [54]:
filtered_keywords = ['Alias_Kaba_Gandhi_1869',
 'Barrister_1893',
 'Boer_War_1899',
 'Boer_War_1914',
 'Calcutta_1896',
 'Christian_Indians_1924',
 'Dietary_1887',
 'Indian_Opinion_1900',
 'Jalianwala_Bagh_Massacre_1917',
 'Karamchand_Gandhi_1869',
 'Kheda_Satyagraha_1919',
 'Kheda_Satyagraha_Struggle_1919',
 'Maharshi_Devendranath_1896',
 'Mohanlal_Pandya_1919',
 'Pandit_Motilal_Nehru_1917',
 'Punjab_Government_1917',
 'Punjab_Leaders_1917',
 'Punjab_Situation_1919',
 'Satyagraha_1900',
 'Satyagraha_1919',
 'Satyagraha_Ashram_1915',
 'Satyagrahi_Volunteers_1919',
 'Sir_Pherozeshah_Mehta_1901',
 'Uttamchand_Gandhi_1869',
 'Vegetarianism_1887',
 'Vegetarianism_1890']

In [67]:
def qfs(kw):
    inputs = ""
    for ch in mp[kw]:
        inputs += chapter_inputs(ch)+kw[:kw.rindex('_')]
    model_inputs = tokenizer(inputs, truncation=True,return_tensors='pt')
    output = model.generate(input_ids=model_inputs["input_ids"])
    summary = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(output.detach().squeeze()))
    return summary

In [56]:
mp = {}
for f in filtered_keywords:
    mp[f] = key_yr_chap_mp[f]

In [57]:
mp

{'Alias_Kaba_Gandhi_1869': ['1\\chapter1'],
 'Barrister_1893': ['2\\chapter3', '2\\chapter4'],
 'Boer_War_1899': ['3\\chapter10'],
 'Boer_War_1914': ['4\\chapter39'],
 'Calcutta_1896': ['2\\chapter29'],
 'Christian_Indians_1924': ['2\\chapter8'],
 'Dietary_1887': ['1\\chapter14'],
 'Indian_Opinion_1900': ['4\\chapter8'],
 'Jalianwala_Bagh_Massacre_1917': ['5\\chapter35', '5\\chapter38'],
 'Karamchand_Gandhi_1869': ['1\\chapter1'],
 'Kheda_Satyagraha_1919': ['5\\chapter23'],
 'Kheda_Satyagraha_Struggle_1919': ['5\\chapter33'],
 'Maharshi_Devendranath_1896': ['3\\chapter19'],
 'Mohanlal_Pandya_1919': ['5\\chapter23'],
 'Pandit_Motilal_Nehru_1917': ['5\\chapter37'],
 'Punjab_Government_1917': ['5\\chapter35', '5\\chapter37'],
 'Punjab_Leaders_1917': ['5\\chapter35'],
 'Punjab_Situation_1919': ['5\\chapter34'],
 'Satyagraha_1900': ['4\\chapter26'],
 'Satyagraha_1919': ['5\\chapter24'],
 'Satyagraha_Ashram_1915': ['5\\chapter9'],
 'Satyagrahi_Volunteers_1919': ['5\\chapter24'],
 'Sir_Pheroz

In [68]:
mp_doc = {}
for f in filtered_keywords:
    mp_doc[f] = qfs(f)
    print(f"{f}  ->   {mp_doc[f] }")

Alias_Kaba_Gandhi_1869  ->   </s><s>The Gandhis belong to the Bania caste and seem to have been originally grocers. But for three generations, from Mahatma Gandhi's grandfather, they have been Prime Ministers in several Kathiawad States. Uttamchand Gandhi, alias Ota Gandhi, the grandfather, must have been a man of principle. State intrigues compelled him to leave Porbandar, where he was Diwan, and to seek refuge in Junagadh. There he saluted the Nawab with the left hand. Someone, noticing the apparent discourtesy, asked for an explanation, which was thus given: 'The right hand is already pledged to Porbandsar. The left hand is pledged</s>
Barrister_1893  ->   </s><s><s><s>While in Bombay, Mahatma Gandhi began, on the one hand, MahATma Gandhi's study of Indian law and, on another, experiments in dietetics in which Virchand Gandhi, a friend, joined me. The Civil Procedure Code and Evidence Act could in no way get on with. Not so however with the Evidence Act and Mayne's Hindu Law with de

Mohanlal_Pandya_1919  ->   </s><s>Sjts. Mohanlal Pandya and Shankarlal Parikh had also thrown themselves into the fight, and had set up an agitation in the Bombay Legislative Council through Sjt. Vithalbhai Patel and the late Sir Gokuldas Kahandas Parekh. SJts. Vallabhbhai, Shankarl Lal Banker, Shrimati Anasuyabehn, Indulal Yajnik, Mahadev Desai and others had joined the struggle.</s>
Pandit_Motilal_Nehru_1917  ->   </s><s>Pandit Motilal Nehru, who, at the sacrifice of his splendid practice, had made the Punjab his headquarters and had done great service, was the President of the Congress; the late Swami Shraddhanandji was the Chairman of the Reception Committee. The release of the Ali Brothers and other prisoners too seemed to Mahatma Gandhi to be an auspicious sign. There had to be dealings with Government in that matter. Then similarly there was the Khilafat question. The King's announcement on the new reforms had just been issued and was unsatisfactory to everyone else, but the ref

In [64]:
import json
with open("mp_doc.json", "w") as outfile:
    json.dump(mp_doc, outfile)

In [69]:
import json
with open("mp_doc2.json", "w") as outfile:
    json.dump(mp_doc, outfile)

In [65]:
mp_doc

{'Alias_Kaba_Gandhi_1869': "</s><s>The Gandhis belong to the Bania caste and seem to have been originally grocers. But for three generations, from Mahatma Gandhi's grandfather, they have been Prime Ministers in several Kathiawad States. Uttamchand Gandhi, alias Ota Gandhi, the grandfather, must have been a man of principle. His loyalty to the state was well known. State intrigues compelled him to leave Porbandar, where he was Diwan, and to seek refuge in Junagadh. There he saluted the Nawab with the left hand. Someone, noticing the apparent discourtesy, asked for an explanation, which was thus given: 'The right hand is already pledged to</s>",
 'Barrister_1893': "</s><s><s><s>While in Bombay, Mahatma Gandhi began, on the one hand, studying of Indian law and experiments in dietetics in which Virchand Gandhi, a friend, joined me. On the other hand, there were two activities simultaneously taking place in Bombay. On one hand there was the one-hand study of Indian Law and, on another, MahA