<a href="https://colab.research.google.com/github/amaslov455/nlp_project/blob/main/sst_tocsv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pytreebank

Collecting pytreebank
  Downloading https://files.pythonhosted.org/packages/e0/12/626ead6f6c0a0a9617396796b965961e9dfa5e78b36c17a81ea4c43554b1/pytreebank-0.2.7.tar.gz
Building wheels for collected packages: pytreebank
  Building wheel for pytreebank (setup.py) ... [?25l[?25hdone
  Created wheel for pytreebank: filename=pytreebank-0.2.7-cp37-none-any.whl size=37070 sha256=8881648f3d8eadba675f91d524ce07f52e5699ab92c94b0ebc74a8092d75db36
  Stored in directory: /root/.cache/pip/wheels/e0/b6/91/e9edcdbf464f623628d5c3aa9de28888c726e270b9a29f2368
Successfully built pytreebank
Installing collected packages: pytreebank
Successfully installed pytreebank-0.2.7


In [None]:
import pytreebank
import pandas as pd

In [None]:
# Load the Stanford Sentiment Treebank; the result is keyed by split name
# ('train', 'test', 'dev' -- see the keys printed by the next cell).
dataset = pytreebank.load_sst()
# The SST-5 -> SST-2 conversion further down runs only when this flag is True.
# NOTE(review): that reads inverted relative to the flag's name ("fine-grained"
# usually denotes the 5-class variant) -- confirm the intended meaning.
fine_grained = False

In [None]:
# Show which splits the loaded treebank provides.
dataset.keys()

dict_keys(['train', 'test', 'dev'])

In [None]:
def create_df_from_treebank(input_dataset):
  """Flatten one treebank split into a two-column DataFrame.

  For every tree in `input_dataset` only the first entry returned by
  `to_labeled_lines()` is used (presumably the root, i.e. the full sentence --
  TODO confirm against pytreebank). Its integer label is mapped to a class name.

  Args:
    input_dataset: iterable of objects exposing `to_labeled_lines()`, which
      returns a sequence of `(label, text)` pairs.

  Returns:
    pandas.DataFrame with the columns 'sentence' and 'santiment'. The column
    spelling is kept as-is because other cells of the notebook refer to it.
  """
  # Position in this tuple == integer label.
  class_names = ("very_negative", "negative", "neutral", "positive", "very_positive")

  texts, classes = [], []
  for tree in input_dataset:
    root_label, root_text = tree.to_labeled_lines()[0]
    texts.append(root_text)
    classes.append(class_names[root_label])

  return pd.DataFrame({'sentence': texts, 'santiment': classes})

In [None]:
# Build one DataFrame per treebank split; the 'dev' split is kept as df_valid.
df_train, df_test, df_valid = (
    create_df_from_treebank(dataset[split_key])
    for split_key in ('train', 'test', 'dev')
)

In [None]:
# Create SST-2 from SST-5
# Use only if needed

def no_fine_grainedSST(df):
  """Return a binary-sentiment version of a five-class frame.

  Rows labelled 'neutral' are dropped, and the 'very_positive' /
  'very_negative' labels are merged into 'positive' / 'negative'.
  The input frame is not modified; the surviving rows keep their index.

  Args:
    df: DataFrame with a 'santiment' column holding the class names.

  Returns:
    A new DataFrame containing only 'positive' / 'negative' rows.
  """
  binary_map = {'very_positive': 'positive', 'very_negative': 'negative'}
  is_polar = df['santiment'] != 'neutral'
  polar_rows = df.copy().loc[is_polar]
  return polar_rows.replace({'santiment': binary_map})

# Collapse all three splits to the binary label set when the flag is set.
# NOTE(review): the guard fires when `fine_grained` is True, so the 5-class
# labels are kept while the flag is False. That looks inverted relative to the
# flag's name; confirm intent before changing it, because later cells export
# these frames under SST5_* file names.
if fine_grained:
  df_train = no_fine_grainedSST(df_train)
  df_test = no_fine_grainedSST(df_test)
  df_valid = no_fine_grainedSST(df_valid)

In [None]:
# Sanity check: list the distinct labels present in the training split.
print(set(df_train['santiment']))

{'very_positive', 'very_negative', 'positive', 'neutral', 'negative'}


In [None]:
# Write the three splits as CSV files without the index column.
# NOTE(review): the targets are absolute paths under /content/drive --
# presumably a Google Drive mounted in Colab. No mount step is visible in this
# notebook, so confirm the directory exists before running this cell.
df_train.to_csv('/content/drive/MyDrive/diplom_project/train.csv', index = False)
df_test.to_csv('/content/drive/MyDrive/diplom_project/test.csv', index = False)
df_valid.to_csv('/content/drive/MyDrive/diplom_project/valid.csv', index = False)

In [None]:
import nltk
# Fetch the 'punkt' tokenizer data (presumably what nltk.word_tokenize below relies on).
nltk.download('punkt')

# Concatenate every training sentence so the whole split is tokenized in one call.
joined_sen = ' '.join(df_train['sentence'])

tokens = nltk.word_tokenize(joined_sen)
print('count of all tokens: ', len(tokens))

# Distinct token types; this count is reused later to bound the SentencePiece
# vocabulary-size sweep.
unique_tokens = list(set(tokens))
print('count of unique tokens: ', len(unique_tokens))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
count of all tokens:  163642
count of unique tokens:  18270


In [None]:
def write_sents_to_txt(list_of_sents, filename):
    """Write every sentence on its own line to a UTF-8 text file.

    The file at `filename` is created or truncated; each item of
    `list_of_sents` is written followed by a newline character.

    Args:
        list_of_sents: iterable of strings to write, in order.
        filename: path of the text file to (over)write.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        out_file.writelines(sentence + "\n" for sentence in list_of_sents)

In [None]:
# Export df_train['sentence'] to a text file, one sentence per line.
# The file is the training corpus for the SentencePiece models trained below.
# NOTE(review): despite the DIR_ prefix this constant is a file path, not a directory.
DIR_TXT_FILE = '/content/drive/MyDrive/diplom_project/sents_to_trainSPM.txt'

write_sents_to_txt(list(df_train['sentence'].values), DIR_TXT_FILE)

In [None]:
# Smoke test: check that sentencepiece works end to end.
# Use only to test the library.
# NOTE(review): this cell adds the 'joined_nltk' and 'joined_sentencepiece'
# columns to df_train itself. If it is run, those columns end up in the train
# CSV exported at the end of the notebook, while df_test / df_valid never get
# them -- confirm that this asymmetry is intended.
!pip install sentencepiece
import sentencepiece as spm

vocab_size_totrain = 10000
# Train on the exported sentence file; the model is then loaded from 'm.model'.
spm.SentencePieceTrainer.train('--input={} --model_prefix=m --vocab_size={}'.format(DIR_TXT_FILE, vocab_size_totrain))

sp = spm.SentencePieceProcessor()
sp.load('m.model')

# Store both segmentations as space-joined strings so they can be compared side by side.
df_train['joined_nltk'] = df_train['sentence'].apply(lambda x: ' '.join(nltk.word_tokenize(x)))
df_train['joined_sentencepiece'] = df_train['sentence'].apply(lambda x: ' '.join(sp.encode_as_pieces(x)))

print(df_train)

In [None]:
# Number of full 500-token steps that fit into the NLTK unique-token count.
len(unique_tokens)//500

36

In [None]:
# Number of distinct NLTK tokens found in the training sentences.
len(unique_tokens)

18270

In [None]:
# Divide all datasets to custom number of tokens

!pip install sentencepiece
import sentencepiece as spm
from tqdm import tqdm

step_size = 500
number_of_parts = len(unique_tokens)//step_size

for i in tqdm(range(number_of_parts)):
  vocab_size_totrain = (i+1) * step_size

  # spm.SentencePieceTrainer.train('--input={} --model_prefix=m --vocab_size={}'.\
  #                                format(DIR_TXT_FILE, vocab_size_totrain))
  spm.SentencePieceTrainer.train(input=DIR_TXT_FILE,
                                 model_prefix='m',
                                 vocab_size=vocab_size_totrain)
  sp = spm.SentencePieceProcessor()
  sp.load('m.model')

  df_train['joined_spm_{}'.format(vocab_size_totrain)] = df_train['sentence'].apply(lambda x: ' '.join(sp.encode_as_pieces(x)))
  df_test['joined_spm_{}'.format(vocab_size_totrain)] = df_test['sentence'].apply(lambda x: ' '.join(sp.encode_as_pieces(x)))
  df_valid['joined_spm_{}'.format(vocab_size_totrain)] = df_valid['sentence'].apply(lambda x: ' '.join(sp.encode_as_pieces(x)))

Collecting sentencepiece
  Using cached https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95


 61%|██████    | 22/36 [00:43<00:18,  1.34s/it]

RuntimeError: ignored

In [None]:
# Count the distinct whitespace-separated pieces in the test split's
# 'joined_spm_1000' column.
len({piece for segmented in df_test['joined_spm_1000'] for piece in segmented.split()})

1018

In [None]:
# Export train/test/validation datasets to csv (index column omitted).
# The frames are written with whatever columns earlier cells have added to them.
# NOTE(review): if the sentencepiece smoke-test cell was run, df_train also
# carries 'joined_nltk' and 'joined_sentencepiece', which the test / validation
# frames do not have -- confirm the train CSV is meant to include them.
# NOTE(review): absolute paths under /content/drive -- presumably a mounted
# Google Drive; no mount step is visible in this notebook.

df_train.to_csv('/content/drive/MyDrive/diplom_project/SST5_SPM_train.csv', index = False)
df_test.to_csv('/content/drive/MyDrive/diplom_project/SST5_SPM_test.csv', index = False)
df_valid.to_csv('/content/drive/MyDrive/diplom_project/SST5_SPM_valid.csv', index = False)