# [Text2Text](https://github.com/artitw/text2text): Crosslingual NLP/G toolkit

## How Crosslingual Models Work (click to watch)
[![Crosslingual Models](http://img.youtube.com/vi/caZLVcJqsqo/0.jpg)](https://youtu.be/caZLVcJqsqo "Crosslingual Models")

In [None]:
%%bash
pip install -qq -U text2text

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.3/75.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import text2text as t2t

In [None]:
### Assistant
# Free private open source alternative to commercial LLMs.
# Commercial LLMs are costly, collect your data, impose quotas and rate limits that hinder development.
# Run at no cost on Google Colab free tier, so you don't even need your own device.
# For more examples, see https://colab.research.google.com/drive/1K6Kk80w9vjFZ7PL9dPRgVuOPuaWcY4ae
# To add a knowledge base, see https://colab.research.google.com/drive/1hkNgpSmmUA-mzUibqz25xq-E8KYOLuVx?usp=sharing

asst = t2t.Assistant()

# Streaming example
# Prior turns of the conversation, ending with the latest user message.
chat_history = [
    {"role": "user",  "content": "Hi"},
    {"role": "assistant", "content": "Hello, how are you?"},
    {"role": "user", "content": "What should I do today?"}
]

# With stream=True the result is iterated below; each chunk's text is read from chunk['message']['content']
# and printed without a newline so the reply appears incrementally.
# Example reply, as originally noted in single-message form
# (NOTE(review): presumably this is the non-streaming shape -- confirm):
# {'role': 'assistant', 'content': '1. Make a list of things to be grateful for.\n2. Go outside and take a walk in nature.\n3. Practice mindfulness meditation.\n4. Connect with a loved one or friend.\n5. Do something kind for someone else.\n6. Engage in a creative activity like drawing or writing.\n7. Read an uplifting book or listen to motivational podcasts.'}
result = asst.chat_completion(chat_history, stream=True)
for chunk in result:
  print(chunk['message']['content'], end='', flush=True)

In [None]:
# clear memory for other models
# The assistant was bound to `asst` above; deleting that name drops the notebook's reference to it.
del asst

In [None]:
### Tokenizer
# Split each input line into sub-word tokens; the call returns one token list per input line.
# In the sample output, '▁' appears on tokens that begin a new word.
tokenizer = t2t.Tokenizer()
tokenizer.transform(["Hello, World!"])
#[['▁Hello', ',', '▁World', '!']]

Better speed can be achieved with apex installed.


Downloading (…)okenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

[['▁Hello', ',', '▁World', '!']]

In [None]:
# Drop the notebook's reference to the tokenizer so its memory can be reclaimed before the next model is loaded.
del tokenizer

In [None]:
### Vectorizer
# Embed the input text; the call returns a 2-D float32 array (a single row for the single input line here).
# The sample values below are from the recorded run of this cell.
vectorizer = t2t.Vectorizer()
vectorizer.transform(["Hello, World!"]) #array([[0.0125036 , 0.00377429, 0.01060265, ..., 0.03131489, 0.04224026, 0.0292194 ]], dtype=float32)

Downloading:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

array([[0.0125036 , 0.00377429, 0.01060265, ..., 0.03131489, 0.04224026,
        0.0292194 ]], dtype=float32)

In [None]:
# Drop the notebook's reference to the vectorizer so its memory can be reclaimed before the next model is loaded.
del vectorizer

In [None]:
### TF-IDF
# Weight the sub-word tokens of the input; the call returns a list of {token: weight} dicts.
tfidfer = t2t.Tfidfer()
tfidfer.transform(["Hello, World!"]) #[{'!': 0.5, ',': 0.5, '▁Hello': 0.5, '▁World': 0.5}]

[{'▁Hello': 0.5, ',': 0.5, '▁World': 0.5, '!': 0.5}]

In [None]:
# Drop the notebook's reference to the TF-IDF transformer so its memory can be reclaimed before the next model is loaded.
del tfidfer

In [None]:
### BM25
# Weight the sub-word tokens of the input with BM25; the call returns a list of {token: weight} dicts.
# The sample values below are from the recorded run of this cell.
bm25er = t2t.Bm25er()
bm25er.transform(["Hello, World!"]) #[{'▁Hello': 0.3068528194400547, ',': 0.3068528194400547, '▁World': 0.3068528194400547, '!': 0.3068528194400547}]

[{'▁Hello': 0.3068528194400547,
  ',': 0.3068528194400547,
  '▁World': 0.3068528194400547,
  '!': 0.3068528194400547}]

In [None]:
# Drop the notebook's reference to the BM25 transformer so its memory can be reclaimed before the next model is loaded.
del bm25er

In [None]:
### Index for information retrieval
# [STF-IDF Demo] https://colab.research.google.com/drive/1RaWj5SqWvyC2SsCTGg8IAVcl9G5hOB50?usp=sharing

# Build an index over three documents (one English, two Korean).
index = t2t.Indexer().transform([
  "Let's go hiking tomorrow, let's go!",
  "안녕하세요.",
  "돼지꿈을 꾸세요~~",
])

# Query with a short Korean term and keep only the single best match (k=1).
index.retrieve(["돼지"], k=1) #[['"돼지꿈을 꾸세요~~"']]

# Add documents incrementing on ids if none specified
index.add(["Hello, World! 你好,世界!"])

# Remove by ids
# NOTE(review): if auto-assigned ids start at 0, id 2 is "돼지꿈을 꾸세요~~" rather than "안녕하세요." --
# confirm the id scheme and which document this call actually removes.
index.remove([2]) #Removes "안녕하세요."

# Retrieve k results per query sorted by distance
# As the last expression in the cell, this is the call whose value is displayed.
index.retrieve(["你好, World"], k=3)

# To learn more, see [STF-IDF](https://arxiv.org/abs/2209.14281).

Creating index with 128104 dimensions.


(array([[0.7752551],
        [0.8452994],
        [0.4347524]], dtype=float32), array([[102],
        [101],
        [100]]))

In [None]:
# Drop the notebook's reference to the index so its memory can be reclaimed before the next model is loaded.
del index

In [None]:
### Translator
# Translate the input lines from src_lang to tgt_lang; returns one translated string per input line.
# NOTE(review): the language codes are presumably the keys of t2t.Transformer.LANGUAGES -- confirm.
translator = t2t.Translator()
translator.transform(["Hello, World!"], src_lang="en", tgt_lang="zh") #['你好,世界!']



['你好,世界!']

In [None]:
# Drop the notebook's reference to the translator so its memory can be reclaimed before the next model is loaded.
del translator

In [None]:
### Variation
# Useful for augmenting training data
# Returns a list of alternative phrasings for the input; the recorded run shows that the list
# can contain repeated entries and some phrasings unrelated to the input.
variator = t2t.Variator()
variator.transform(["Hello, World!"], src_lang="en") #['Hello the world!', 'Welcome to the world.', 'Hello to the world!',...

['Hello the world!',
 'My brother and sister!',
 'Welcome to the world!',
 'Hello, the world',
 '“No to the world”',
 'It is clearly!',
 'Welcome to the light!',
 'Hello to the world!',
 'Hello to the world!',
 'Hello, it’s a good thing!',
 'Congratulations to all!',
 'Hello to the world!',
 'Hello to the world!',
 'Good Morning, the World',
 'Welcome to the World!',
 'Hello the world!',
 'Hello the world!',
 'Good morning to the world!',
 'Hello, World!',
 'Hello the world!',
 'Fortunately the world!',
 'Hello to the world!',
 'We do it!',
 'Welcome to the world!',
 'Hello to the world!',
 'Go to the world!',
 'Member of the Board of Directors of the Board of Directors of the Board of Directors of the Board of Directors',
 'The world.',
 'Hello to the world!',
 'and it is!',
 'Hello, for the world!',
 'Hello to the world!',
 'Hello to the world!',
 'Hello to the world!',
 'Hello to the world!',
 'Hello to the world!',
 'Welcome to the world!',
 'Congratulations to the world!',
 'Hello

In [None]:
# Drop the notebook's reference to the variator so its memory can be reclaimed before the next model is loaded.
del variator

In [None]:
### Levenshtein Sub-word Edit Distance
measurer = t2t.Measurer()
# Each input line presumably holds the two texts to compare, separated by " [SEP] " -- confirm against the library docs.
# This first call's value is not displayed because it is not the last expression in the cell.
measurer.transform(["Hello, World! [SEP] Hello, what?"]) #[2]
# Batch form with the metric named explicitly; as the last expression, this is the call whose value the cell displays.
measurer.transform([
  "Hello, World! [SEP] Hello, what?",
  "안녕하세요. [SEP] 돼지꿈을 꾸세요~~"
], metric="levenshtein_distance")

[2]

In [None]:
# Drop the notebook's reference to the measurer so its memory can be reclaimed before the next model is loaded.
del measurer

In [None]:
### Server
# Start the text2text server bound to all interfaces (0.0.0.0) on port 80.
# NOTE(review): binding port 80 usually requires elevated privileges outside hosted notebooks -- confirm for your environment.
server = t2t.Server(host='0.0.0.0', port=80)

import time
# Brief pause, presumably to give the server time to start before the next cell sends requests.
time.sleep(1)

Creating index with 128104 dimensions.
Serving at http://172.28.0.12/
 * Serving Flask app "text2text.server" (lazy loading)


<text2text.server.Server at 0x7f2cad2988e0>

 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


In [None]:
import socket
import requests

# Resolve this machine's address; the requests below go to port 80 (the HTTP default) on it.
hostname = socket.gethostname()
address = socket.gethostbyname(socket.getfqdn(hostname))
url = f"http://{address}"
transformer = "Translator"
payload = dict(
  input_lines=["hello", "world"],
  src_lang="en",
  tgt_lang="ko",
)

# POST the same payload to the transformer route and then to each Indexer route, in this order,
# printing every JSON response as it arrives.
for route in (transformer, "Indexer/add", "Indexer/size", "Indexer/search", "Indexer/retrieve"):
  r = requests.post(f"{url}/{route}", json=payload)
  print(r.json())

# The remove call is sent last, after the ids to drop have been added to the payload.
payload["ids"] = [0,1]
r = requests.post(f"{url}/Indexer/remove", json=payload)
print(r.json())

INFO:werkzeug: * Running on http://0.0.0.0:80/ (Press CTRL+C to quit)
INFO:werkzeug:172.28.0.12 - - [15/Dec/2022 17:29:39] "[37mPOST /translate HTTP/1.1[0m" 200 -


{'result': ['안녕하세요', '세계']}


INFO:werkzeug:172.28.0.12 - - [15/Dec/2022 17:29:40] "[37mPOST /index/add HTTP/1.1[0m" 200 -
INFO:werkzeug:172.28.0.12 - - [15/Dec/2022 17:29:40] "[37mPOST /index/size HTTP/1.1[0m" 200 -


{'result': 'index/add performed'}
{'result': 2}


INFO:werkzeug:172.28.0.12 - - [15/Dec/2022 17:29:41] "[37mPOST /index/search HTTP/1.1[0m" 200 -
INFO:werkzeug:172.28.0.12 - - [15/Dec/2022 17:29:41] "[37mPOST /index/remove HTTP/1.1[0m" 200 -


{'result': [[[0.0, 2.0, 3.4028234663852886e+38], [0.0, 2.0, 3.4028234663852886e+38]], [[1.0, 2.0, -1.0], [2.0, 1.0, -1.0]]]}
{'result': 'index/remove performed'}


In [None]:
# Drop the notebook's reference to the server object.
# NOTE(review): whether this also stops the background server is not shown here -- confirm.
del server

In [None]:
### Languages Available
# Mapping of language code to language name (for example 'en': 'English').
t2t.Transformer.LANGUAGES

{'af': 'Afrikaans',
 'am': 'Amharic',
 'ar': 'Arabic',
 'ast': 'Asturian',
 'az': 'Azerbaijani',
 'ba': 'Bashkir',
 'be': 'Belarusian',
 'bg': 'Bulgarian',
 'bn': 'Bengali',
 'br': 'Breton',
 'bs': 'Bosnian',
 'ca': 'Catalan_Valencian',
 'ceb': 'Cebuano',
 'cs': 'Czech',
 'cy': 'Welsh',
 'da': 'Danish',
 'de': 'German',
 'el': 'Greeek',
 'en': 'English',
 'es': 'Spanish',
 'et': 'Estonian',
 'fa': 'Persian',
 'ff': 'Fulah',
 'fi': 'Finnish',
 'fr': 'French',
 'fy': 'Western_Frisian',
 'ga': 'Irish',
 'gd': 'Gaelic_Scottish_Gaelic',
 'gl': 'Galician',
 'gu': 'Gujarati',
 'ha': 'Hausa',
 'he': 'Hebrew',
 'hi': 'Hindi',
 'hr': 'Croatian',
 'ht': 'Haitian_Haitian_Creole',
 'hu': 'Hungarian',
 'hy': 'Armenian',
 'id': 'Indonesian',
 'ig': 'Igbo',
 'ilo': 'Iloko',
 'is': 'Icelandic',
 'it': 'Italian',
 'ja': 'Japanese',
 'jv': 'Javanese',
 'ka': 'Georgian',
 'kk': 'Kazakh',
 'km': 'Central_Khmer',
 'kn': 'Kannada',
 'ko': 'Korean',
 'lb': 'Luxembourgish_Letzeburgesch',
 'lg': 'Ganda',
 'ln':