In [None]:
! pip install datasets transformers accelerate peft

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.5.0-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.4 M

## Datasets

- Text Classification - dair-ai/emotion
- Question Answering - mlqa.hi.en
- Question Generation - squad_v2
- Paraphrasing - paws
- Summarization - samsum
- Text Generation - sadFaceEmoji/english-poems
- Semantic Similarity - paws

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd

# Text Classification

In [None]:
# Text classification source: the held-out test split of dair-ai/emotion
# (configuration "split").
text_classification_data = load_dataset(
    path="dair-ai/emotion",
    name="split",
    split="test",
)
text_classification_data

Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 2000
})

In [None]:
# Convert the Hugging Face split into a pandas DataFrame so the columns can be
# reshaped into the shared source/target layout.
text_classification_df = text_classification_data.to_pandas()
text_classification_df

Unnamed: 0,text,label
0,im feeling rather rotten so im not very ambiti...,0
1,im updating my blog because i feel shitty,0
2,i never make her separate from me because i do...,0
3,i left with my bouquet of red and yellow tulip...,1
4,i was feeling a little vain when i did this one,0
...,...,...
1995,i just keep feeling like someone is being unki...,3
1996,im feeling a little cranky negative after this...,3
1997,i feel that i am useful to my people and that ...,1
1998,im feeling more comfortable with derby i feel ...,1


In [None]:
# Integer class id -> emotion name, in id order (0..5).
map_emotions = dict(
    enumerate(["sadness", "joy", "love", "anger", "fear", "surprise"])
)

# target: the emotion name; source: the sentence behind the task prefix.
text_classification_df["target"] = [
    map_emotions[label_id] for label_id in text_classification_df["label"]
]
text_classification_df["source"] = [
    "text-classification-emotion: " + sentence
    for sentence in text_classification_df["text"]
]

# Only the two shared columns are carried forward.
text_classification_df = text_classification_df.loc[:, ["source", "target"]]
text_classification_df

Unnamed: 0,source,target
0,text-classification-emotion: im feeling rather...,sadness
1,text-classification-emotion: im updating my bl...,sadness
2,text-classification-emotion: i never make her ...,sadness
3,text-classification-emotion: i left with my bo...,joy
4,text-classification-emotion: i was feeling a l...,sadness
...,...,...
1995,text-classification-emotion: i just keep feeli...,anger
1996,text-classification-emotion: im feeling a litt...,anger
1997,text-classification-emotion: i feel that i am ...,joy
1998,text-classification-emotion: im feeling more c...,joy


In [None]:
# Keep 500 randomly chosen rows. The fixed random_state (same seed as the
# train/test split later in the notebook) makes the subset identical on every run.
text_classification_df = text_classification_df.sample(n=500, random_state=0)
text_classification_df

Unnamed: 0,source,target
1488,text-classification-emotion: i managed however...,love
726,text-classification-emotion: i lift different ...,fear
325,text-classification-emotion: i bought it at ur...,sadness
1961,text-classification-emotion: i remember wantin...,love
998,text-classification-emotion: i feel like he mo...,joy
...,...,...
1250,text-classification-emotion: i did feel for hi...,sadness
236,text-classification-emotion: i feel like some ...,sadness
1027,text-classification-emotion: i think were on a...,fear
1241,text-classification-emotion: i could maybe get...,sadness


# Question Answering

In [None]:
# Question answering source: MLQA "mlqa.hi.en" configuration, validation split
# (Hindi contexts paired with English questions, as the preview below shows).
data = load_dataset(path="mlqa", name="mlqa.hi.en", split="validation")

Downloading builder script:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/114k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/34.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.7M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4918 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/507 [00:00<?, ? examples/s]

In [None]:
# Materialise the MLQA validation split as a DataFrame; `answers` stays a dict
# holding parallel 'answer_start' and 'text' lists.
qa_df = data.to_pandas()
qa_df

Unnamed: 0,context,question,answers,id
0,पैरेनकाइमा कोशिकाएं वे जीवित कोशिकाएं हैं जो स...,What is the name of a type of string that cons...,"{'answer_start': [2267], 'text': ['कोलेन्काइमा']}",60ee75c50c8472be7cce1a24ee2cd7409ee4dd52
1,पैरेनकाइमा कोशिकाएं वे जीवित कोशिकाएं हैं जो स...,What are made up almost entirely of parenchyma...,"{'answer_start': [158], 'text': ['पत्तियां']}",561971f7978f678c3d1ba2a946036cdc131c4d49
2,जाइलेम कोशिकाएं कोशिका भित्तियों की लिग्निकृत ...,What kind of plants possess xylem?,"{'answer_start': [292], 'text': ['ट्रेकियोफाइट...",18a5c05701b7359fcd32a379b2ac4a9a5d7544de
3,फ्लोएम उच्चतर पौधों में आहार का संवहन करने वाल...,What is the leptome?,"{'answer_start': [969], 'text': ['समान कार्य क...",694ce8b90f6854bcd6c3a505767df90f8b659d06
4,"सभी हवाई अवयवों की बाह्यत्वचा, जड़ों को छोड़कर...",Epidermal cells can do what?,"{'answer_start': [433], 'text': ['क्यटिन का सं...",25be1db3766bfbb5f64b3a8f4bf4af57470eb651
...,...,...,...,...
502,स्पार्टाकस नाम वैसे काले सागर क्षेत्र में साक्...,which King?,"{'answer_start': [89], 'text': ['थ्रेसियन राजव...",c555e59e9cf74869aaf94cbeaf932cd1fd89e55e
503,इस समय तक पोम्पे की टुकड़ी स्पेन लौट चुकी थी औ...,What happened to captured soldiers?,"{'answer_start': [1094], 'text': ['क्रूस पर चढ...",1f79f28190b12342fc7a39cea0202d33af7488c9
504,"आर्गन-39, 269 वर्ष की एक आधा जीवन के साथ, आवेद...",What is the half-life of Argon 39?,"{'answer_start': [10], 'text': ['269 वर्ष']}",cc81fb916f547d7d51730e0f726891179b361ebd
505,जॉनस्टन का जन्म व बचपन ओंटारियो में बीता। वहाँ...,What role in government did Stephen Harper have?,"{'answer_start': [669], 'text': ['प्रधानमंत्री']}",244b552d2f8e850eb0b88d4c7440c575639fa911


In [None]:
# source: task prefix + context + question; target: the first listed answer text.
# NOTE(review): the question is appended after the context, and preprocess_function
# truncates inputs to 256 tokens — for long contexts the question may be cut off.
# Confirm whether the prompt order should change (any inference code must match).
qa_df["source"] = [
    "context-question-answering: context: " + passage + " question: " + query
    for passage, query in zip(qa_df["context"], qa_df["question"])
]
qa_df["target"] = [answer["text"][0] for answer in qa_df["answers"]]

qa_df = qa_df.loc[:, ["source", "target"]]
qa_df

Unnamed: 0,source,target
0,context-question-answering: context: पैरेनकाइम...,कोलेन्काइमा
1,context-question-answering: context: पैरेनकाइम...,पत्तियां
2,context-question-answering: context: जाइलेम को...,ट्रेकियोफाइटों
3,context-question-answering: context: फ्लोएम उच...,समान कार्य करने वाला एक अधिक सरल ऊतक
4,context-question-answering: context: सभी हवाई ...,क्यटिन का संश्लेषण
...,...,...
502,context-question-answering: context: स्पार्टाक...,थ्रेसियन राजवंश के राजा
503,context-question-answering: context: इस समय तक...,क्रूस पर चढ़ा दिया गया।
504,"context-question-answering: context: आर्गन-39,...",269 वर्ष
505,context-question-answering: context: जॉनस्टन क...,प्रधानमंत्री


# Question Generation

In [None]:
# Question generation source: SQuAD v2 validation split.
# NOTE(review): the preview below shows rows with empty `answers` (unanswerable
# questions); confirm they are wanted as question-generation targets.
data = load_dataset("squad_v2", split="validation")
qgen_df = data.to_pandas()
# Fixed seed so the same 500 rows are drawn on every run.
qgen_df = qgen_df.sample(500, random_state=0)

Downloading builder script:   0%|          | 0.00/5.28k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.02k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/801k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
# Preview the sampled SQuAD v2 rows before building the source/target columns.
qgen_df

Unnamed: 0,id,title,context,question,answers
4592,5a669b3df038b7001ab0c04e,Packet_switching,Internet2 is a not-for-profit United States co...,What was the first Internet2 Network created w...,"{'text': [], 'answer_start': []}"
11385,5733fd66d058e614000b6737,French_and_Indian_War,The new British command was not in place until...,"Where did Moncalm slip away to attack, left la...","{'text': ['Oswego', 'Oswego', 'Oswego', 'Osweg..."
8998,5729a3716aef05140015506d,Prime_number,Prime numbers have influenced many artists and...,In which etude of Neumes rythmiques do the pri...,"{'text': ['the third étude', 'third', 'third',..."
3728,572646655951b619008f6ec0,Ctenophora,Ctenophores form an animal phylum that is more...,What do ctenophores have that no other animals...,"{'text': ['colloblasts', 'colloblasts', 'collo..."
1952,5ad3e8f6604f3c001a3ff66b,Steam_engine,The most useful instrument for analyzing the p...,What instrument is used to examine diagram per...,"{'text': [], 'answer_start': []}"
...,...,...,...,...,...
9178,5ad27abcd7d075001a42962d,Rhine,The Rhine is the longest river in Germany. It ...,How much water does the Rhine discharge at the...,"{'text': [], 'answer_start': []}"
3745,5a835778e60761001a2eb5dd,Ctenophora,"Like sponges and cnidarians, ctenophores have ...",How many layers of cells with a sandwiched jel...,"{'text': [], 'answer_start': []}"
5229,5a6cb7ce4eec6b001a80a639,Pharmacy,A Pharmacy Technician in the UK is considered ...,With what body must a pharmacy not be allowed ...,"{'text': [], 'answer_start': []}"
4012,5a7b070121c2de001afe9cee,"Fresno,_California","In September 1958, Bank of America launched a ...",The BankAmericard could be used across many me...,"{'text': [], 'answer_start': []}"


In [None]:
# The passage (behind the task prefix) is the input; its question is the output.
qgen_df["source"] = [
    "context-question-generation: " + passage for passage in qgen_df["context"]
]
qgen_df["target"] = qgen_df["question"]

In [None]:
# Reduce to the two columns shared by every task frame.
qgen_df = qgen_df.loc[:, ["source", "target"]]
qgen_df

Unnamed: 0,source,target
4592,context-question-generation: Internet2 is a no...,What was the first Internet2 Network created w...
11385,context-question-generation: The new British c...,"Where did Moncalm slip away to attack, left la..."
8998,context-question-generation: Prime numbers hav...,In which etude of Neumes rythmiques do the pri...
3728,context-question-generation: Ctenophores form ...,What do ctenophores have that no other animals...
1952,context-question-generation: The most useful i...,What instrument is used to examine diagram per...
...,...,...
9178,context-question-generation: The Rhine is the ...,How much water does the Rhine discharge at the...
3745,context-question-generation: Like sponges and ...,How many layers of cells with a sandwiched jel...
5229,context-question-generation: A Pharmacy Techni...,With what body must a pharmacy not be allowed ...
4012,context-question-generation: In September 1958...,The BankAmericard could be used across many me...


# Paraphrasing

In [None]:
# Paraphrasing source: PAWS "labeled_final" validation split.
data = load_dataset("paws", "labeled_final", split="validation")
trans_df = data.to_pandas()
# Draw 1000 candidate pairs with a fixed seed so the subset is reproducible;
# only the label == 1 pairs are kept in the next step.
trans_df = trans_df.sample(1000, random_state=0)
trans_df

Downloading builder script:   0%|          | 0.00/8.43k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.52k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.34k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Unnamed: 0,id,sentence1,sentence2,label
7559,7560,The only other Jennings township is in Putnam ...,"Statewide , the only other Jennings Township i...",1
2840,2841,"Pete Sampras won 6 -- 7 , 6 -- 4 , 7 -- 6 agai...",Tim Tim Henman won against Pete Sampras in the...,0
1096,1097,"It is based in Luxembourg City , to the south ...","It is based in Mondercange , south of Luxembou...",0
6607,6608,This festival also gives the participants an o...,This festival also gives participants the oppo...,0
5382,5383,The river Slatina is a tributary of the Bresni...,The Slatina River is a tributary of the Bresni...,1
...,...,...,...,...
5946,5947,"After the first round of the duel , Heine Totl...",After the first round of duels Heine Totland b...,1
5341,5342,"In 2002 , the song was covered by British prod...","In 2002 , the song was released by British pro...",0
353,354,"On 21 June 2016 , Cristina Ferrare replaced De...","On June 21 , 2016 , Cristina Ferrare replaced ...",0
1627,1628,The BBC College of Journalism was opened as an...,"In June 2005 , the BBC College of Journalism w...",1


In [None]:
# Keep only the pairs labelled 1 and just the two sentence columns.
trans_df = trans_df.loc[trans_df["label"].eq(1), ["sentence1", "sentence2"]]
trans_df

Unnamed: 0,sentence1,sentence2
7559,The only other Jennings township is in Putnam ...,"Statewide , the only other Jennings Township i..."
5382,The river Slatina is a tributary of the Bresni...,The Slatina River is a tributary of the Bresni...
4433,It is well known from the United States ( Main...,"It is known from the United States ( Maine , O..."
1222,Georgetown is also a source of drinking water ...,Georgetown is also a source of drinking water ...
2227,"In 1997 , she appeared as Sheila Dixon in `` C...",Hardwick appeared in `` Coronation Street '' i...
...,...,...
2756,"de Ruiter , who was born in Leiden , played fo...",de Ruiter was born in Leiden and played for RK...
3354,The Azusa Campus at Azusa Pacific University i...,Azusa Pacific University 's Azusa campus is lo...
5946,"After the first round of the duel , Heine Totl...",After the first round of duels Heine Totland b...
1627,The BBC College of Journalism was opened as an...,"In June 2005 , the BBC College of Journalism w..."


In [None]:
# Build both columns with assign(): trans_df is a filtered slice at this point, so
# writing columns in place would trigger pandas' SettingWithCopyWarning. assign()
# returns a new frame instead (column order stays sentence1, sentence2, source, target).
trans_df = trans_df.assign(
    source="paraphrase: " + trans_df["sentence1"],
    target=trans_df["sentence2"],
)
trans_df

Unnamed: 0,sentence1,sentence2,source,target
7559,The only other Jennings township is in Putnam ...,"Statewide , the only other Jennings Township i...",paraphrase: The only other Jennings township i...,"Statewide , the only other Jennings Township i..."
5382,The river Slatina is a tributary of the Bresni...,The Slatina River is a tributary of the Bresni...,paraphrase: The river Slatina is a tributary o...,The Slatina River is a tributary of the Bresni...
4433,It is well known from the United States ( Main...,"It is known from the United States ( Maine , O...",paraphrase: It is well known from the United S...,"It is known from the United States ( Maine , O..."
1222,Georgetown is also a source of drinking water ...,Georgetown is also a source of drinking water ...,paraphrase: Georgetown is also a source of dri...,Georgetown is also a source of drinking water ...
2227,"In 1997 , she appeared as Sheila Dixon in `` C...",Hardwick appeared in `` Coronation Street '' i...,"paraphrase: In 1997 , she appeared as Sheila D...",Hardwick appeared in `` Coronation Street '' i...
...,...,...,...,...
2756,"de Ruiter , who was born in Leiden , played fo...",de Ruiter was born in Leiden and played for RK...,"paraphrase: de Ruiter , who was born in Leiden...",de Ruiter was born in Leiden and played for RK...
3354,The Azusa Campus at Azusa Pacific University i...,Azusa Pacific University 's Azusa campus is lo...,paraphrase: The Azusa Campus at Azusa Pacific ...,Azusa Pacific University 's Azusa campus is lo...
5946,"After the first round of the duel , Heine Totl...",After the first round of duels Heine Totland b...,paraphrase: After the first round of the duel ...,After the first round of duels Heine Totland b...
1627,The BBC College of Journalism was opened as an...,"In June 2005 , the BBC College of Journalism w...",paraphrase: The BBC College of Journalism was ...,"In June 2005 , the BBC College of Journalism w..."


In [None]:
# Reduce to the two columns shared by every task frame.
trans_df = trans_df.loc[:, ["source", "target"]]
trans_df

Unnamed: 0,source,target
7559,paraphrase: The only other Jennings township i...,"Statewide , the only other Jennings Township i..."
5382,paraphrase: The river Slatina is a tributary o...,The Slatina River is a tributary of the Bresni...
4433,paraphrase: It is well known from the United S...,"It is known from the United States ( Maine , O..."
1222,paraphrase: Georgetown is also a source of dri...,Georgetown is also a source of drinking water ...
2227,"paraphrase: In 1997 , she appeared as Sheila D...",Hardwick appeared in `` Coronation Street '' i...
...,...,...
2756,"paraphrase: de Ruiter , who was born in Leiden...",de Ruiter was born in Leiden and played for RK...
3354,paraphrase: The Azusa Campus at Azusa Pacific ...,Azusa Pacific University 's Azusa campus is lo...
5946,paraphrase: After the first round of the duel ...,After the first round of duels Heine Totland b...
1627,paraphrase: The BBC College of Journalism was ...,"In June 2005 , the BBC College of Journalism w..."


# Summarization

In [None]:
! pip install py7zr

Collecting py7zr
  Downloading py7zr-0.20.6-py3-none-any.whl (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Collecting pycryptodomex>=3.6.6 (from py7zr)
  Downloading pycryptodomex-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyzstd>=0.14.4 (from py7zr)
  Downloading pyzstd-0.15.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (412 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.3/412.3 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyppmd<1.1.0,>=0.18.1 (from py7zr)
  Downloading pyp

In [None]:
# Summarization source: SAMSum test split; only the first 500 dialogues are used.
data = load_dataset(path="samsum", split="test")
summ_df = data.to_pandas()
summ_df = summ_df.iloc[:500]
summ_df

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Unnamed: 0,id,dialogue,summary
0,13862856,"Hannah: Hey, do you have Betty's number?\nAman...",Hannah needs Betty's number but Amanda doesn't...
1,13729565,Eric: MACHINE!\r\nRob: That's so gr8!\r\nEric:...,Eric and Rob are going to watch a stand-up on ...
2,13680171,"Lenny: Babe, can you help me with something?\r...",Lenny can't decide which trousers to buy. Bob ...
3,13729438,"Will: hey babe, what do you want for dinner to...",Emma will be home soon and she will let Will k...
4,13828600,"Ollie: Hi , are you in Warsaw\r\nJane: yes, ju...",Jane is in Warsaw. Ollie and Jane has a party....
...,...,...,...
495,13828505,"Amber: Hi Erin, guess what, John and Annie are...",John and Annie are moving to London because he...
496,13716957,Vicky: Is everyone still at town sq? I might h...,"They are meeting at Fratellis, upstairs."
497,13829569,Kaylin: <file_gif> \r\nKaylin: that's what app...,Kaylin has set her alarm for tomorrow.
498,13729275,Keira: How come it started with wildangel3 the...,Keira wonders why wildangel plays in the wrong...


In [None]:
# summ_df is a row slice of the full frame, so in-place column writes would raise
# pandas' SettingWithCopyWarning; assign() builds a fresh frame instead.
# source: the dialogue behind the task prefix; target: its reference summary.
summ_df = summ_df.assign(
    source="conversation-summarization: " + summ_df["dialogue"],
    target=summ_df["summary"],
)[["source", "target"]]
summ_df

Unnamed: 0,source,target
0,"conversation-summarization: Hannah: Hey, do yo...",Hannah needs Betty's number but Amanda doesn't...
1,conversation-summarization: Eric: MACHINE!\r\n...,Eric and Rob are going to watch a stand-up on ...
2,"conversation-summarization: Lenny: Babe, can y...",Lenny can't decide which trousers to buy. Bob ...
3,"conversation-summarization: Will: hey babe, wh...",Emma will be home soon and she will let Will k...
4,"conversation-summarization: Ollie: Hi , are yo...",Jane is in Warsaw. Ollie and Jane has a party....
...,...,...
495,"conversation-summarization: Amber: Hi Erin, gu...",John and Annie are moving to London because he...
496,conversation-summarization: Vicky: Is everyone...,"They are meeting at Fratellis, upstairs."
497,conversation-summarization: Kaylin: <file_gif>...,Kaylin has set her alarm for tomorrow.
498,conversation-summarization: Keira: How come it...,Keira wonders why wildangel plays in the wrong...


# Text Generation

In [None]:
# Text generation source: sadFaceEmoji/english-poems, train split.
data = load_dataset("sadFaceEmoji/english-poems", split="train")
text_gen_df = data.to_pandas()
# Drop incomplete rows *before* sampling so the subset really has 500 usable poems,
# and fix the seed so the same poems are drawn on every run.
text_gen_df = text_gen_df.dropna().sample(500, random_state=0)

Downloading readme:   0%|          | 0.00/131 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/20.3M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Preview the sampled poems (columns: id, poem).
text_gen_df

Unnamed: 0,id,poem
57682,57682,he sighing paused\nlo then the serpent hissed\...
36650,36650,beneath my palm trees by the river side\ni sat...
52942,52942,my heart is wildly beating\ncome bridegroom co...
17258,17258,the true strains rise only from\nthe rich red ...
12957,12957,like the lizard in the furious noon\nthat drop...
...,...,...
51820,51820,but when he came to denver in that fall uv\nhi...
54647,54647,the skiff was like a crescent ghost of some mo...
7558,7558,and if so\nthe poetry of fall\nis the dog of m...
6641,6641,for breakfast a man must break an egg\nthen no...


In [None]:
# Remove rows with any missing value so every remaining poem is present as text.
text_gen_df = text_gen_df.dropna()
text_gen_df

Unnamed: 0,id,poem
57682,57682,he sighing paused\nlo then the serpent hissed\...
36650,36650,beneath my palm trees by the river side\ni sat...
52942,52942,my heart is wildly beating\ncome bridegroom co...
17258,17258,the true strains rise only from\nthe rich red ...
12957,12957,like the lizard in the furious noon\nthat drop...
...,...,...
51820,51820,but when he came to denver in that fall uv\nhi...
54647,54647,the skiff was like a crescent ghost of some mo...
7558,7558,and if so\nthe poetry of fall\nis the dog of m...
6641,6641,for breakfast a man must break an egg\nthen no...


In [None]:
# Scratch check: split the first poem on single spaces (line breaks stay inside tokens).
a = text_gen_df.iloc[0]["poem"].split(" ")

In [None]:
# Scratch check: joining the tokens with single spaces gives back the poem text.
" ".join(a)

'he sighing paused\nlo then the serpent hissed\nin impotent rage depart\nand how depart'

In [None]:
import random

def poem_splitter(text, rng=None):
    """Split a poem into a prompt prefix and the continuation to generate.

    The text is stripped of leading/trailing spaces and split on single spaces
    (line breaks stay attached to their neighbouring words). A cut position is
    then drawn from the first half of the poem; the words before the cut form
    the prompt and the remaining words form the completion.

    Args:
        text: The poem text.
        rng: Optional object exposing ``randint(a, b)`` (for example
            ``random.Random(0)``) used to pick the cut position. Defaults to
            the module-level ``random`` generator.

    Returns:
        A two-item list ``[source, target]``. For a poem with at least two
        space-separated words, ``source`` holds between one word and half of
        the words, and ``source + " " + target`` equals the stripped text.
        Input that cannot be split (non-string, empty, or a single word)
        returns the placeholder ``["drop", "drop"]`` so the caller can filter it.
    """
    if rng is None:
        rng = random

    if not isinstance(text, str):
        return ["drop", "drop"]

    word_list = text.strip(" ").split(" ")

    # The previous guard (`len(word_list) >= 0`) was always true, so the
    # placeholder branch could never run. Two words is the minimum that yields
    # a non-empty prompt and a non-empty completion.
    if len(word_list) < 2:
        return ["drop", "drop"]

    # Lower bound is 1 (previously 0) so the prompt is never empty.
    num_words = rng.randint(1, len(word_list) // 2)

    source = " ".join(word_list[:num_words])
    target = " ".join(word_list[num_words:])

    return [source, target]



In [None]:
# Seed the module-level generator used by poem_splitter so the random cut points
# (and therefore the prompts) are identical on every run.
random.seed(0)

# Split every poem once; each entry is a [prompt, completion] pair.
poem_splits = text_gen_df["poem"].apply(poem_splitter)

# Keep only rows where both halves contain text, and skip the ["drop", "drop"]
# placeholder that poem_splitter reserves for poems it cannot split.
is_usable = poem_splits.apply(
    lambda pair: bool(pair[0].strip())
    and bool(pair[1].strip())
    and list(pair) != ["drop", "drop"]
)
poem_splits = poem_splits[is_usable]

# assign() builds the final columns directly instead of parking the pair list in
# the `source` column and overwriting it afterwards.
text_gen_df = text_gen_df.loc[is_usable].assign(
    source=poem_splits.apply(lambda pair: "poem-completion: " + pair[0]),
    target=poem_splits.apply(lambda pair: pair[1]),
)

In [None]:
# Inspect the prompt/completion columns next to the original poems.
text_gen_df

Unnamed: 0,id,poem,source,target
57682,57682,he sighing paused\nlo then the serpent hissed\...,poem-completion: he sighing paused\nlo,then the serpent hissed\nin impotent rage depa...
36650,36650,beneath my palm trees by the river side\ni sat...,poem-completion: beneath my palm trees by the ...,a weeping: in the whole world wide\nthere was ...
52942,52942,my heart is wildly beating\ncome bridegroom co...,poem-completion:,my heart is wildly beating\ncome bridegroom co...
17258,17258,the true strains rise only from\nthe rich red ...,poem-completion: the true strains rise only fr...,and singers together or\napart beyond friendsh...
12957,12957,like the lizard in the furious noon\nthat drop...,poem-completion: like the lizard,in the furious noon\nthat drops his legs and c...
...,...,...,...,...
51820,51820,but when he came to denver in that fall uv\nhi...,poem-completion: but when he came to denver in...,spree\nthe very thought uv seein' dana worked ...
54647,54647,the skiff was like a crescent ghost of some mo...,poem-completion: the skiff was like a crescent,ghost of some moon departed\nfrail white she r...
7558,7558,and if so\nthe poetry of fall\nis the dog of m...,poem-completion: and if so\nthe poetry of fall...,untied\nat last from the rope\nof the world\nd...
6641,6641,for breakfast a man must break an egg\nthen no...,poem-completion: for breakfast a man must brea...,all the king's men can do very much about it\n...


In [None]:
# Reduce to the two columns shared by every task frame.
text_gen_df = text_gen_df.loc[:, ["source", "target"]]
text_gen_df

Unnamed: 0,source,target
57682,poem-completion: he sighing paused\nlo,then the serpent hissed\nin impotent rage depa...
36650,poem-completion: beneath my palm trees by the ...,a weeping: in the whole world wide\nthere was ...
52942,poem-completion:,my heart is wildly beating\ncome bridegroom co...
17258,poem-completion: the true strains rise only fr...,and singers together or\napart beyond friendsh...
12957,poem-completion: like the lizard,in the furious noon\nthat drops his legs and c...
...,...,...
51820,poem-completion: but when he came to denver in...,spree\nthe very thought uv seein' dana worked ...
54647,poem-completion: the skiff was like a crescent,ghost of some moon departed\nfrail white she r...
7558,poem-completion: and if so\nthe poetry of fall...,untied\nat last from the rope\nof the world\nd...
6641,poem-completion: for breakfast a man must brea...,all the king's men can do very much about it\n...


# NLI (Semantic Similarity on PAWS)

In [None]:
# Sentence-pair source: PAWS "labeled_final" test split (the paraphrasing task
# above uses the validation split of the same dataset).
data = load_dataset("paws", "labeled_final", split="test")
nli_df = data.to_pandas()
# Fixed seed so the same 500 pairs are drawn on every run.
nli_df = nli_df.sample(500, random_state=0)

nli_df

Unnamed: 0,id,sentence1,sentence2,label
4331,4332,Liv meets Tyler to ask him to leave her brothe...,Tyler meets Liv to ask him to leave her brothe...,0
4014,4015,At first he recruited singer Ali Azmat of the ...,"First , he recruited singer Nusrat Hussain fro...",0
3889,3890,They do not follow the pattern of caldera grow...,They do not follow the pattern of caldera - gr...,1
2857,2858,Both electromagnetic brakes and eddy current b...,Both electromagnetic brakes and eddy current b...,0
223,224,He returned to South Africa and went to the Ki...,He returned to South Africa and went to King E...,1
...,...,...,...,...
1128,1129,Misc Music is a compilation album by Shifty Tr...,Music is a compilation album by Unbelievable T...,0
126,127,"Oecomys rutilus , also known as the red arbore...","Oecomys rutilus , also known as the reddish oe...",0
6360,6361,Grace Grace Elliott also appears as a major ch...,Hallie Rubenhold also appears as a major chara...,0
6587,6588,"Meanwhile , Ben begins to connect with Norman ...","Meanwhile , Ben starts to connect with Norman ...",1


In [None]:
# Both sentences go into a single prompt behind the task prefix.
nli_df["source"] = [
    "nli-hypothesis-check: sentence1: " + first + " sentence2: " + second
    for first, second in zip(nli_df["sentence1"], nli_df["sentence2"])
]
nli_df

Unnamed: 0,id,sentence1,sentence2,label,source
4331,4332,Liv meets Tyler to ask him to leave her brothe...,Tyler meets Liv to ask him to leave her brothe...,0,nli-hypothesis-check: sentence1: Liv meets Tyl...
4014,4015,At first he recruited singer Ali Azmat of the ...,"First , he recruited singer Nusrat Hussain fro...",0,nli-hypothesis-check: sentence1: At first he r...
3889,3890,They do not follow the pattern of caldera grow...,They do not follow the pattern of caldera - gr...,1,nli-hypothesis-check: sentence1: They do not f...
2857,2858,Both electromagnetic brakes and eddy current b...,Both electromagnetic brakes and eddy current b...,0,nli-hypothesis-check: sentence1: Both electrom...
223,224,He returned to South Africa and went to the Ki...,He returned to South Africa and went to King E...,1,nli-hypothesis-check: sentence1: He returned t...
...,...,...,...,...,...
1128,1129,Misc Music is a compilation album by Shifty Tr...,Music is a compilation album by Unbelievable T...,0,nli-hypothesis-check: sentence1: Misc Music is...
126,127,"Oecomys rutilus , also known as the red arbore...","Oecomys rutilus , also known as the reddish oe...",0,nli-hypothesis-check: sentence1: Oecomys rutil...
6360,6361,Grace Grace Elliott also appears as a major ch...,Hallie Rubenhold also appears as a major chara...,0,nli-hypothesis-check: sentence1: Grace Grace E...
6587,6588,"Meanwhile , Ben begins to connect with Norman ...","Meanwhile , Ben starts to connect with Norman ...",1,"nli-hypothesis-check: sentence1: Meanwhile , B..."


In [None]:
# Text label emitted for each PAWS label value.
# NOTE(review): PAWS labels look like paraphrase flags (the paraphrasing section
# keeps label == 1 pairs), so "contradiction" for 0 may overstate the relation —
# confirm the intended wording.
hypo_dict = {0: "contradiction", 1: "entailment"}

nli_df["target"] = [hypo_dict[flag] for flag in nli_df["label"]]
nli_df = nli_df.loc[:, ["source", "target"]]
nli_df

Unnamed: 0,source,target
4331,nli-hypothesis-check: sentence1: Liv meets Tyl...,contradiction
4014,nli-hypothesis-check: sentence1: At first he r...,contradiction
3889,nli-hypothesis-check: sentence1: They do not f...,entailment
2857,nli-hypothesis-check: sentence1: Both electrom...,contradiction
223,nli-hypothesis-check: sentence1: He returned t...,entailment
...,...,...
1128,nli-hypothesis-check: sentence1: Misc Music is...,contradiction
126,nli-hypothesis-check: sentence1: Oecomys rutil...,contradiction
6360,nli-hypothesis-check: sentence1: Grace Grace E...,contradiction
6587,"nli-hypothesis-check: sentence1: Meanwhile , B...",entailment


# Final Data

In [None]:
# Stack all task frames into one training table. Each frame still carries the row
# labels of its own source dataset, so the labels collide across frames;
# ignore_index=True gives the combined table a clean 0..n-1 index.
list_of_dfs = [text_classification_df, qgen_df, qa_df, nli_df, text_gen_df, summ_df, trans_df]

final_df = pd.concat(list_of_dfs, axis=0, ignore_index=True)
final_df

Unnamed: 0,source,target
1488,text-classification-emotion: i managed however...,love
726,text-classification-emotion: i lift different ...,fear
325,text-classification-emotion: i bought it at ur...,sadness
1961,text-classification-emotion: i remember wantin...,love
998,text-classification-emotion: i feel like he mo...,joy
...,...,...
2756,"paraphrase: de Ruiter , who was born in Leiden...",de Ruiter was born in Leiden and played for RK...
3354,paraphrase: The Azusa Campus at Azusa Pacific ...,Azusa Pacific University 's Azusa campus is lo...
5946,paraphrase: After the first round of the duel ...,After the first round of duels Heine Totland b...
1627,paraphrase: The BBC College of Journalism was ...,"In June 2005 , the BBC College of Journalism w..."


# Finetune

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows for evaluation (seeded shuffle for reproducibility).
train_df, test_df = train_test_split(final_df, test_size=0.1, random_state=0, shuffle=True)
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra "__index_level_0__" column in the Hugging Face datasets.
train_data = Dataset.from_pandas(train_df, preserve_index=False)
test_data = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Checkpoint to fine-tune; the tokenizer and the seq2seq model are both loaded
# from this id.
model_id="google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def preprocess_function(sample, padding="max_length", max_source_length=256, max_target_length=128):
    """Tokenize a batch of source/target text pairs for seq2seq training.

    Uses the notebook-level ``tokenizer``.

    Args:
        sample: Mapping with "source" and "target" entries holding the input
            and expected-output texts (lists of strings when called through
            ``Dataset.map(..., batched=True)``).
        padding: Padding strategy forwarded unchanged to the tokenizer.
        max_source_length: Truncation (and, for "max_length" padding, pad)
            length for the source texts.
        max_target_length: Truncation (and, for "max_length" padding, pad)
            length for the target texts.

    Returns:
        The tokenizer output for the sources with an extra "labels" entry
        containing the target token ids, where padding positions are
        replaced by -100.
    """
    model_inputs = tokenizer(
        sample["source"], max_length=max_source_length, padding=padding, truncation=True
    )
    labels = tokenizer(
        sample["target"], max_length=max_target_length, padding=padding, truncation=True
    )

    label_ids = labels["input_ids"]
    # Mask pad positions for every strategy that actually pads (not only
    # "max_length"); otherwise padding=True / "longest" would leave real pad
    # ids in the labels. -100 is the same sentinel the data collator uses.
    if padding and padding != "do_not_pad":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [(token if token != pad_id else -100) for token in label] for label in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

In [None]:
# Tokenize both splits in batches; the original columns are dropped so that
# only the fields produced by preprocess_function remain.
train_tokenized_dataset = train_data.map(
    preprocess_function,
    batched=True,
    remove_columns=train_data.column_names,
)
test_tokenized_dataset = test_data.map(
    preprocess_function,
    batched=True,
    remove_columns=test_data.column_names,
)
print(f"Keys of tokenized dataset: {list(train_tokenized_dataset.features)}")

Map:   0%|          | 0/3080 [00:00<?, ? examples/s]

Map:   0%|          | 0/343 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [None]:
# LoRA adapter settings for the seq2seq model: rank-8 adapters are attached
# to the modules named "q" and "v", with no bias parameters trained.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q", "v"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

In [None]:
# Show the module tree of the loaded model (before LoRA wrapping) to see the
# names of the attention projections targeted by lora_config.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [None]:
# Wrap the loaded model with the LoRA adapters described by lora_config and
# report how many parameters remain trainable.
# NOTE(review): `model` is rebound here, so re-running this cell would pass an
# already-wrapped model back into get_peft_model.
model = get_peft_model(model=model, peft_config=lora_config)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 785,509,376 || trainable%: 0.30035236651331837


In [None]:
# Show the wrapped model to confirm the LoRA layers were injected.
model

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 1024)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 1024)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): Linear(
                    in_features=1024, out_features=1024, bias=False
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_

In [None]:
# Label padding value used by the collator; preprocess_function writes the
# same -100 sentinel over padded label positions.
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

In [None]:
# Interactive Hugging Face Hub login widget. Later cells pass push_to_hub=True,
# which presumably relies on the credentials stored here.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Training run configuration: one epoch, with logging and checkpointing once
# per epoch, TensorBoard logs under the output directory, and the result
# pushed to the Hub.
output_dir = "flant5-large-aio"
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=6,
    learning_rate=2e-4,
    logging_strategy="epoch",
    logging_dir=f"{output_dir}/logs",
    save_strategy="epoch",
    report_to="tensorboard",
    push_to_hub=True,
)

In [None]:
# Turn off the use_cache flag on the model config before training.
# NOTE(review): presumably because the generation cache is not needed during
# training — confirm, and re-enable it before running inference.
model.config.use_cache = False

In [None]:
# Build the trainer from the LoRA-wrapped model, the training arguments and
# the seq2seq collator.
# The held-out split is tokenized earlier in the notebook but was never given
# to the trainer; passing it as eval_dataset makes `trainer.evaluate()` usable.
# Training itself is unchanged because training_args sets no evaluation
# strategy.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
)

In [None]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
514,2.1415


TrainOutput(global_step=514, training_loss=2.141537202471425, metrics={'train_runtime': 695.2118, 'train_samples_per_second': 4.43, 'train_steps_per_second': 0.739, 'total_flos': 3560504190566400.0, 'train_loss': 2.141537202471425, 'epoch': 1.0})

In [None]:
# Save and push, in order, the PEFT-wrapped model, the tokenizer and the
# wrapped base model, all under the same local directory / repo id.
# NOTE(review): `trainer.model.base_model` is the LoRA-wrapped module shown in
# the PeftModelForSeq2SeqLM repr, not a merged model; saving it into the same
# directory as the adapter looks unintended — confirm.
peft_save_model_id = "flant5-large-tuned"
for artifact in (trainer.model, tokenizer, trainer.model.base_model):
    artifact.save_pretrained(peft_save_model_id, push_to_hub=True)