<a href="https://colab.research.google.com/github/amaslov455/nlp_project/blob/main/sst_tocsv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pytreebank

Collecting pytreebank
  Downloading https://files.pythonhosted.org/packages/e0/12/626ead6f6c0a0a9617396796b965961e9dfa5e78b36c17a81ea4c43554b1/pytreebank-0.2.7.tar.gz
Building wheels for collected packages: pytreebank
  Building wheel for pytreebank (setup.py) ... [?25l[?25hdone
  Created wheel for pytreebank: filename=pytreebank-0.2.7-cp36-none-any.whl size=37070 sha256=d2fdc664dec0b57a4957c1ea76f40a9827920040881e9d00ae82deb7777219b4
  Stored in directory: /root/.cache/pip/wheels/e0/b6/91/e9edcdbf464f623628d5c3aa9de28888c726e270b9a29f2368
Successfully built pytreebank
Installing collected packages: pytreebank
Successfully installed pytreebank-0.2.7


In [2]:
import pytreebank
import pandas as pd

In [3]:
dataset = pytreebank.load_sst()

In [4]:
dataset.keys()

dict_keys(['train', 'test', 'dev'])

In [5]:
dataset['train'][0]

<pytreebank.labeled_trees.LabeledTree at 0x7f4c5545aac8>

In [6]:
def create_df_from_treebank(input_dataset):
  dict_ = {}
  dict_['sentence'] = []
  dict_['santiment'] = []

  list_santiments = ["very_negative", "negative", "neutral", "positive", "very_positive"]

  for part in input_dataset:
    label, sentence = part.to_labeled_lines()[0]

    dict_['sentence'].append(sentence)
    dict_['santiment'].append(list_santiments[label])

  df = pd.DataFrame.from_dict(dict_)
  return df

In [7]:
df_train = create_df_from_treebank(dataset['train'])
df_test = create_df_from_treebank(dataset['test'])

In [8]:
df_train

Unnamed: 0,sentence,santiment
0,The Rock is destined to be the 21st Century 's...,positive
1,The gorgeously elaborate continuation of `` Th...,very_positive
2,Singer/composer Bryan Adams contributes a slew...,positive
3,You 'd think by now America would have had eno...,neutral
4,Yet the act is still charming here .,positive
...,...,...
8539,A real snooze .,very_negative
8540,No surprises .,negative
8541,We 've seen the hippie-turned-yuppie plot befo...,positive
8542,Her fans walked out muttering words like `` ho...,very_negative


In [None]:
df_train.to_csv('/content/drive/MyDrive/diplom_project/train.csv', index = False)
df_test.to_csv('/content/drive/MyDrive/diplom_project/test.csv', index = False)

In [9]:
import nltk
nltk.download('punkt')

joined_sen = ' '.join(df_train['sentence'])

tokens = nltk.word_tokenize(joined_sen)
print('count of all tokens: ', len(tokens))

unique_tokens = list(set(tokens))
print('count of unique tokens: ', len(unique_tokens))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
count of all tokens:  163642
count of unique tokens:  18270


In [10]:
def write_sents_to_txt(list_of_sents, filename):
    with open(filename, 'w',encoding='utf-8') as f:
        for text in list_of_sents:
            f.write(text + "\n")

In [11]:
DIR_TXT_FILE = '/content/drive/MyDrive/diplom_project/train_sents1.txt'

write_sents_to_txt(list(df_train.sentence.values), DIR_TXT_FILE)

In [15]:
!pip install sentencepiece
import sentencepiece as spm

vocab_size_totrain = 10000
spm.SentencePieceTrainer.train('--input={} --model_prefix=m --vocab_size={}'.format(DIR_TXT_FILE, vocab_size_totrain))

sp = spm.SentencePieceProcessor()
sp.load('m.model')

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 5.4MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.94


In [None]:
df_train['joined_nltk'] = df_train['sentence'].apply(lambda x: ' '.join(nltk.word_tokenize(x)))
df_train['joined_sentencepiece'] = df_train['sentence'].apply(lambda x: ' '.join(sp.encode_as_pieces(x)))

In [None]:
df_train

Unnamed: 0,sentence,santiment,joined_nltk,joined_sentencepiece
0,The Rock is destined to be the 21st Century 's...,positive,The Rock is destined to be the 21st Century 's...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...
1,The gorgeously elaborate continuation of `` Th...,very_positive,The gorgeously elaborate continuation of `` Th...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...
2,Singer/composer Bryan Adams contributes a slew...,positive,Singer/composer Bryan Adams contributes a slew...,▁S ing er / compos er ▁Br yan ▁Adam s ▁contrib...
3,You 'd think by now America would have had eno...,neutral,You 'd think by now America would have had eno...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...
4,Yet the act is still charming here .,positive,Yet the act is still charming here .,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.
...,...,...,...,...
8539,A real snooze .,very_negative,A real snooze .,▁A ▁real ▁snooze ▁.
8540,No surprises .,negative,No surprises .,▁No ▁surprise s ▁.
8541,We 've seen the hippie-turned-yuppie plot befo...,positive,We 've seen the hippie-turned-yuppie plot befo...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y upp...
8542,Her fans walked out muttering words like `` ho...,very_negative,Her fans walked out muttering words like `` ho...,▁Her ▁fan s ▁walk ed ▁out ▁mut tering ▁word s ...


In [19]:
df_train_2 = df_train[['sentence','santiment']].copy()

In [14]:
len(unique_tokens)//500

36

In [20]:
batch_size = 500
number_of_parts = len(unique_tokens)//batch_size

for i in range(number_of_parts):
  vocab_size_totrain = (i+1) * batch_size

  spm.SentencePieceTrainer.train('--input={} --model_prefix=m --vocab_size={}'.\
                                 format(DIR_TXT_FILE, vocab_size_totrain))
  sp = spm.SentencePieceProcessor()
  sp.load('m.model')
  df_train_2['joined_sentencepiece_{}'.format(vocab_size_totrain)] = df_train_2['sentence'].apply(lambda x: ' '.join(sp.encode_as_pieces(x)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


RuntimeError: ignored

In [21]:
df_train_2

Unnamed: 0,sentence,santiment,joined_sentencepiece_500,joined_sentencepiece_1000,joined_sentencepiece_1500,joined_sentencepiece_2000,joined_sentencepiece_2500,joined_sentencepiece_3000,joined_sentencepiece_3500,joined_sentencepiece_4000,joined_sentencepiece_4500,joined_sentencepiece_5000,joined_sentencepiece_5500,joined_sentencepiece_6000,joined_sentencepiece_6500,joined_sentencepiece_7000,joined_sentencepiece_7500,joined_sentencepiece_8000,joined_sentencepiece_8500,joined_sentencepiece_9000,joined_sentencepiece_9500,joined_sentencepiece_10000,joined_sentencepiece_10500,joined_sentencepiece_11000
0,The Rock is destined to be the 21st Century 's...,positive,▁The ▁R o ck ▁is ▁de s t in ed ▁to ▁be ▁the ▁ ...,▁The ▁Ro ck ▁is ▁de s t in ed ▁to ▁be ▁the ▁ 2...,▁The ▁Ro ck ▁is ▁de s t in ed ▁to ▁be ▁the ▁ 2...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁2 1 s ...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁2 1 s ...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁2 1 s ...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...,▁The ▁Rock ▁is ▁destin ed ▁to ▁be ▁the ▁21 s t...
1,The gorgeously elaborate continuation of `` Th...,very_positive,▁The ▁g or ge ous ly ▁ el ab or ate ▁con t in ...,▁The ▁gorgeous ly ▁ el ab or ate ▁con t in u a...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...,▁The ▁gorgeous ly ▁ e laborat e ▁continu ation...
2,Singer/composer Bryan Adams contributes a slew...,positive,▁S ing er / com p o s er ▁B ry an ▁A d am s ▁c...,▁S ing er / com p o s er ▁Br y an ▁A d am s ▁c...,▁S ing er / com p o s er ▁Br y an ▁A d am s ▁c...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / com p o s er ▁Br y an ▁Adam s ▁con...,▁S ing er / compos er ▁Br y an ▁Adam s ▁contri...,▁S ing er / compos er ▁Br yan ▁Adam s ▁contrib...,▁S ing er / compos er ▁Br yan ▁Adam s ▁contrib...,▁S ing er / compos er ▁Br yan ▁Adam s ▁contrib...,▁S ing er / com po s er ▁Br yan ▁Adam s ▁contr...,▁S ing er / com po s er ▁Br yan ▁Adam s ▁contr...
3,You 'd think by now America would have had eno...,neutral,▁You ▁' d ▁think ▁by ▁no w ▁A mer ic a ▁would ...,▁You ▁' d ▁think ▁by ▁no w ▁A mer ic a ▁would ...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...,▁You ▁' d ▁think ▁by ▁now ▁America ▁would ▁ ha...
4,Yet the act is still charming here .,positive,▁ Y e t ▁the ▁ act ▁is ▁still ▁charm ing ▁her ...,▁ Y e t ▁the ▁ act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.,▁Ye t ▁the ▁act ▁is ▁still ▁charm ing ▁here ▁.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8539,A real snooze .,very_negative,▁A ▁real ▁ s n oo z e ▁.,▁A ▁real ▁ s n oo z e ▁.,▁A ▁real ▁ s n oo ze ▁.,▁A ▁real ▁ s n oo ze ▁.,▁A ▁real ▁ s n oo ze ▁.,▁A ▁real ▁ s n oo ze ▁.,▁A ▁real ▁ s n oo ze ▁.,▁A ▁real ▁ s n oo ze ▁.,▁A ▁real ▁ s n oo ze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.,▁A ▁real ▁snooze ▁.
8540,No surprises .,negative,▁N o ▁ s ur p ri s es ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.,▁No ▁surprise s ▁.
8541,We 've seen the hippie-turned-yuppie plot befo...,positive,▁W e ▁' ve ▁see n ▁the ▁ h i pp ie - t ur n ed...,▁We ▁' ve ▁see n ▁the ▁ h i pp ie - t ur n ed ...,▁We ▁' ve ▁see n ▁the ▁hip p ie - t ur n ed - ...,▁We ▁' ve ▁see n ▁the ▁hip p ie - t ur n ed - ...,▁We ▁' ve ▁see n ▁the ▁hip p ie - t ur ned - y...,▁We ▁' ve ▁see n ▁the ▁hip p ie - t ur ned - y...,▁We ▁' ve ▁see n ▁the ▁hip p ie - t ur ned - y...,▁We ▁' ve ▁see n ▁the ▁hip p ie - turned - y u...,▁We ▁' ve ▁see n ▁the ▁hip p ie - turned - y u...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y up ...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y up ...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y up ...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y up ...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y up ...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y up ...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y up ...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y up ...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y upp...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y upp...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y upp...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y upp...,▁We ▁' ve ▁see n ▁the ▁hippie - turned - y upp...
8542,Her fans walked out muttering words like `` ho...,very_negative,▁H er ▁fa n s ▁w al k ed ▁out ▁ m ut ter ing ▁...,▁H er ▁fan s ▁w al k ed ▁out ▁ m ut ter ing ▁w...,▁H er ▁fan s ▁walk ed ▁out ▁ m ut ter ing ▁wor...,▁Her ▁fan s ▁walk ed ▁out ▁ m ut ter ing ▁word...,▁Her ▁fan s ▁walk ed ▁out ▁ m ut ter ing ▁word...,▁Her ▁fan s ▁walk ed ▁out ▁ m ut ter ing ▁word...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut ter ing ▁word s...,▁Her ▁fan s ▁walk ed ▁out ▁mut tering ▁word s ...,▁Her ▁fan s ▁walk ed ▁out ▁mut tering ▁word s ...,▁Her ▁fan s ▁walk ed ▁out ▁mut tering ▁word s ...,▁Her ▁fan s ▁walk ed ▁out ▁mut tering ▁word s ...,▁Her ▁fan s ▁walk ed ▁out ▁mut tering ▁word s ...


In [27]:
len(set(' '.join(list(df_train_2['joined_sentencepiece_500'])).split()))

540

In [30]:
len(set(' '.join(list(df_train_2['joined_sentencepiece_10000'])).split()))

9853