In [2]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
from collections import Counter
import spacy
from heapq import nlargest
import nltk
from transformers import BartForConditionalGeneration, BartTokenizer
sns.set()

In [3]:
df = pd.read_csv('/kaggle/input/text-train/text-train.csv')
df.head()

Unnamed: 0,id,dialogue,summary,topic
0,train_0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",get a check-up
1,train_1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,vaccines
2,train_2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,find keys
3,train_3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,have a girlfriend
4,train_4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,dance


In [4]:
df.shape

(12460, 4)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12460 entries, 0 to 12459
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        12460 non-null  object
 1   dialogue  12460 non-null  object
 2   summary   12460 non-null  object
 3   topic     12460 non-null  object
dtypes: object(4)
memory usage: 389.5+ KB


In [6]:
df.apply(pd.isnull).sum()

id          0
dialogue    0
summary     0
topic       0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
df = df[['dialogue', 'topic']]

In [9]:
df.head()

Unnamed: 0,dialogue,topic
0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...",get a check-up
1,"#Person1#: Hello Mrs. Parker, how have you bee...",vaccines
2,"#Person1#: Excuse me, did you see a set of key...",find keys
3,#Person1#: Why didn't you tell me you had a gi...,have a girlfriend
4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",dance


In [10]:
df = df.sample(100)

In [11]:
df.head()

Unnamed: 0,dialogue,topic
3095,"#Person1#: Hi, Mike. I am going to move on the...",move information
3489,"#Person1#: Hello, this is Francis from XYZ Com...",make an appointment
11817,#Person1#: It's lucky that we rode our bike he...,see the match
3747,#Person1#: I hate spring in this city. It's al...,seasons
10155,#Person1#: Why do you want to join us?\n#Perso...,job interview


In [12]:
df.shape

(100, 2)

In [13]:
sw = stopwords.words('english')

In [15]:
df.reset_index(drop = True, inplace = True)

In [16]:
df.head()

Unnamed: 0,dialogue,topic
0,"#Person1#: Hi, Mike. I am going to move on the...",move information
1,"#Person1#: Hello, this is Francis from XYZ Com...",make an appointment
2,#Person1#: It's lucky that we rode our bike he...,see the match
3,#Person1#: I hate spring in this city. It's al...,seasons
4,#Person1#: Why do you want to join us?\n#Perso...,job interview


In [17]:
df['dialogue'][0]

"#Person1#: Hi, Mike. I am going to move on the first of next month.\n#Person2#: Congratulations! So, have you started planning your move?\n#Person1#: Yes, I am trying to find good movers now. Do you know any?\n#Person2#: Not really.\n#Person1#: What did you do for your last move?\n#Person2#: Since I did not have much staff, I asked my cousin to help me out.\n#Person1#: Oh. that's very nice of him.\n#Person2#: Yeah, it was. But I still have some information that I collected last time, and if you want, I will give it to you.\n#Person1#: Yes, please."

In [18]:
df['topic'].value_counts()

topic
daily casual talk         2
music                     2
book a room               2
daily conversation        2
parting gift              1
                         ..
promotion                 1
bicycle                   1
gossip                    1
admiration                1
curriculum institution    1
Name: count, Length: 96, dtype: int64

In [19]:
df['topic'].nunique()

96

In [20]:
def text_cleaning(text):
    text = text.lower()
    text = re.sub('\n', ' ', text)
    text = re.sub('[^a-z0-9]', ' ', text)
    text = re.sub('\s+[a-z]\s+', ' ', text)
    text = re.sub('\s+', ' ', text)
    words = [word for word in text.split() if word not in sw]
    return words

In [21]:
def word_counting(words):
    word_freq = Counter(words)
    max_freq = max(word_freq.values())
    for word, freq in word_freq.items():
        word_freq[word] = freq / max_freq
    return word_freq

In [22]:
def text_summarization(text, num_sents):
    words = text_cleaning(text)
    word_freq = word_counting(words)
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    sentences = [sentence.text for sentence in doc.sents]
    sents_freq = dict()
    for sentence in sentences:
        for word in sentence.split():
            if word in word_freq.keys():
                if sentence not in sents_freq.keys():
                    sents_freq[sentence] = word_freq[word]
                else:
                    sents_freq[sentence] += word_freq[word]
    n_sentences = nlargest(num_sents, sents_freq, key = sents_freq.get)
    final_sentences = []
    for i in sentences:
        for j in n_sentences:
            if i == j:
                final_sentences.append(i)
    return ' '.join(final_sentences)

In [23]:
df['dialogue_summarized'] = df['dialogue'].apply(lambda x: text_summarization(x, 1))

In [24]:
df.head()

Unnamed: 0,dialogue,topic,dialogue_summarized
0,"#Person1#: Hi, Mike. I am going to move on the...",move information,I am going to move on the first of next month.\n
1,"#Person1#: Hello, this is Francis from XYZ Com...",make an appointment,Lets me check my agenda\n#Person1#: Ok.\n#Pers...
2,#Person1#: It's lucky that we rode our bike he...,see the match,#Person1#: It's lucky that we rode our bike he...
3,#Person1#: I hate spring in this city. It's al...,seasons,I like summer very much.\n
4,#Person1#: Why do you want to join us?\n#Perso...,job interview,#Person1#: Do you have any particular conditio...


In [25]:
df['dialogue_summarized'][1]

'Lets me check my agenda\n#Person1#: Ok.\n#Person2#: I am free on Tuesday afternoon from 2 PM to 4 PM, is it convenient for you?\n#Person1#: It is ok with me.'

In [26]:
df['dialogue'][1]

'#Person1#: Hello, this is Francis from XYZ Company. I would like to discuss with you about a new project some time next week. Do you have time to meet?\n#Person2#: Well, I am not sure. Lets me check my agenda\n#Person1#: Ok.\n#Person2#: I am free on Tuesday afternoon from 2 PM to 4 PM, is it convenient for you?\n#Person1#: It is ok with me. So I will be arriving at your office around 2 fifteen PM. Do you have a projector in your room? I would like to show you some related charts\n#Person2#: Yes, there is. See you then.'

In [27]:
df['dialogue_summarized'][3]

'I like summer very much.\n'

In [28]:
df['dialogue'][3]

"#Person1#: I hate spring in this city. It's always raining. We hardly get any sunshine.\n#Person2#: You're right. It's terrible.\n#Person1#: Summer will soon be here. I like summer very much.\n#Person2#: Summer? I can't stand it. It's too hot, especially if you have to take the bus to work like I do.\n#Person1#: Well. I can't imagine that, but I really hate the rain.\n#Person2#: I don't mind it really.\n#Person1#: But summer in Qingdao is lovely, especially the beach. I usually spend the summer there. Peter and I go there every summer. You can come with us next time, Brian. I'm sure you will enjoy lying on the beach very much.\n#Person2#: No, thanks. I prefer a more active holiday.\n#Person1#: For example?\n#Person2#: Well, I was thinking of climbing mountain Huang.\n#Person1#: Climb the mountain? Oh, after a day of climbing, you will be very tired and won't want like to do anything.\n#Person2#: Perhaps, but I don't mind."

In [29]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [30]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') 

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [31]:
def tokenized_text(text):
    return tokenizer(text, max_length = 1024, return_tensors = 'pt', truncation = True)

In [32]:
token_text = df['dialogue'].map(tokenized_text)

In [33]:
token_text[0]

{'input_ids': tensor([[    0, 10431, 41761,   134, 10431,    35, 12289,     6,  1483,     4,
            38,   524,   164,     7,   517,    15,     5,    78,     9,   220,
           353,     4, 50118, 10431, 41761,   176, 10431,    35, 24953,   328,
           407,     6,    33,    47,   554,  1884,   110,   517,   116, 50118,
         10431, 41761,   134, 10431,    35,  3216,     6,    38,   524,   667,
             7,   465,   205,  7458,  3697,   122,     4,  1832,    47,   216,
           143,   116, 50118, 10431, 41761,   176, 10431,    35,  1491,   269,
             4, 50118, 10431, 41761,   134, 10431,    35,   653,   222,    47,
           109,    13,   110,    94,   517,   116, 50118, 10431, 41761,   176,
         10431,    35,  1773,    38,   222,    45,    33,   203,   813,     6,
            38,   553,   127, 11204,     7,   244,   162,    66,     4, 50118,
         10431, 41761,   134, 10431,    35,  5534,     4,    14,    18,   182,
          2579,     9,   123,     4, 5

In [34]:
def ids_ext(item):
    return item['input_ids']

In [35]:
in_ids = token_text.map(ids_ext)

In [36]:
in_ids[0]

tensor([[    0, 10431, 41761,   134, 10431,    35, 12289,     6,  1483,     4,
            38,   524,   164,     7,   517,    15,     5,    78,     9,   220,
           353,     4, 50118, 10431, 41761,   176, 10431,    35, 24953,   328,
           407,     6,    33,    47,   554,  1884,   110,   517,   116, 50118,
         10431, 41761,   134, 10431,    35,  3216,     6,    38,   524,   667,
             7,   465,   205,  7458,  3697,   122,     4,  1832,    47,   216,
           143,   116, 50118, 10431, 41761,   176, 10431,    35,  1491,   269,
             4, 50118, 10431, 41761,   134, 10431,    35,   653,   222,    47,
           109,    13,   110,    94,   517,   116, 50118, 10431, 41761,   176,
         10431,    35,  1773,    38,   222,    45,    33,   203,   813,     6,
            38,   553,   127, 11204,     7,   244,   162,    66,     4, 50118,
         10431, 41761,   134, 10431,    35,  5534,     4,    14,    18,   182,
          2579,     9,   123,     4, 50118, 10431, 4

In [37]:
summary_ids = model.generate(in_ids[0], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

In [38]:
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [39]:
print(summary)

I am going to move on the first of next month. I am trying to find good movers now. Do you know any? Not really. But I still have some information that I collected last time, and if you want, I will give it to you.


In [40]:
df['dialogue'][0]

"#Person1#: Hi, Mike. I am going to move on the first of next month.\n#Person2#: Congratulations! So, have you started planning your move?\n#Person1#: Yes, I am trying to find good movers now. Do you know any?\n#Person2#: Not really.\n#Person1#: What did you do for your last move?\n#Person2#: Since I did not have much staff, I asked my cousin to help me out.\n#Person1#: Oh. that's very nice of him.\n#Person2#: Yeah, it was. But I still have some information that I collected last time, and if you want, I will give it to you.\n#Person1#: Yes, please."

In [41]:
in_ids[0]

tensor([[    0, 10431, 41761,   134, 10431,    35, 12289,     6,  1483,     4,
            38,   524,   164,     7,   517,    15,     5,    78,     9,   220,
           353,     4, 50118, 10431, 41761,   176, 10431,    35, 24953,   328,
           407,     6,    33,    47,   554,  1884,   110,   517,   116, 50118,
         10431, 41761,   134, 10431,    35,  3216,     6,    38,   524,   667,
             7,   465,   205,  7458,  3697,   122,     4,  1832,    47,   216,
           143,   116, 50118, 10431, 41761,   176, 10431,    35,  1491,   269,
             4, 50118, 10431, 41761,   134, 10431,    35,   653,   222,    47,
           109,    13,   110,    94,   517,   116, 50118, 10431, 41761,   176,
         10431,    35,  1773,    38,   222,    45,    33,   203,   813,     6,
            38,   553,   127, 11204,     7,   244,   162,    66,     4, 50118,
         10431, 41761,   134, 10431,    35,  5534,     4,    14,    18,   182,
          2579,     9,   123,     4, 50118, 10431, 4

In [42]:
in_ids

0     [[tensor(0), tensor(10431), tensor(41761), ten...
1     [[tensor(0), tensor(10431), tensor(41761), ten...
2     [[tensor(0), tensor(10431), tensor(41761), ten...
3     [[tensor(0), tensor(10431), tensor(41761), ten...
4     [[tensor(0), tensor(10431), tensor(41761), ten...
                            ...                        
95    [[tensor(0), tensor(10431), tensor(41761), ten...
96    [[tensor(0), tensor(10431), tensor(41761), ten...
97    [[tensor(0), tensor(10431), tensor(41761), ten...
98    [[tensor(0), tensor(10431), tensor(41761), ten...
99    [[tensor(0), tensor(10431), tensor(41761), ten...
Name: dialogue, Length: 100, dtype: object

In [44]:
def summarization(item):
    summary_ids = model.generate(item, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    final_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return final_summary

In [47]:
summarization(in_ids[1])

'#Person1#: Hello, this is Francis from XYZ Company. I would like to discuss with you about a new project some time next week. Do you have time to meet?#Person2#: Well, I am not sure. Lets me check my agenda.'

In [48]:
df['dialogue'][1]

'#Person1#: Hello, this is Francis from XYZ Company. I would like to discuss with you about a new project some time next week. Do you have time to meet?\n#Person2#: Well, I am not sure. Lets me check my agenda\n#Person1#: Ok.\n#Person2#: I am free on Tuesday afternoon from 2 PM to 4 PM, is it convenient for you?\n#Person1#: It is ok with me. So I will be arriving at your office around 2 fifteen PM. Do you have a projector in your room? I would like to show you some related charts\n#Person2#: Yes, there is. See you then.'

In [50]:
summarization(in_ids[2])

"#Person1#: We go in over there. Gate B. Peter said they're pretty good tickets. #Person2#: It's a good job that we got here early. They'll never get in."

In [51]:
df['dialogue'][2]

"#Person1#: It's lucky that we rode our bike here instead of driving.\n#Person2#: It's a good job that we got here early. Look at all those cars there. They'll never get in.\n#Person1#: You'd better follow me closely. I don't want to lose you.\n#Person2#: Don't worry. I'll keep up.\n#Person1#: We go in over there. Gate B. Peter said they're pretty good tickets.\n#Person2#: Where are they?\n#Person1#: They're right behind the goal.\n#Person2#: Oh,do we have to stand up all the time?\n#Person1#: That's right.\n#Person2#: I hope we can see the match clearly.\n#Person1#: That's why we've come early. The earlier, the better."