In [None]:
import os
import pickle
import random
from collections import Counter

import nltk
import numpy as np
import torch
import torch.cuda as cuda
from nltk.tokenize import word_tokenize
# Download the NLTK 'punkt' tokenizer package (presumably what word_tokenize
# relies on in the cells below).
nltk.download('punkt')

# TensorFlow is imported here only to list the devices visible to this runtime.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices()) # Show the GPU & CPU Specifications

# Mount Google Drive at /content/drive; every data path below lives under it.
from google.colab import drive
drive.mount('/content/drive')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3924550064263071265
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 15505193728
locality {
  bus_id: 1
  links {
  }
}
incarnation: 323020358180229692
physical_device_desc: "device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:04.0, compute capability: 7.0"
]
Mounted at /content/drive


In [None]:
### division ###
# Shuffle the raw training file and write two disjoint subsets of it:
# the first TRAIN_SIZE shuffled lines become the training set and the next
# (TRAIN_PLUS_VALIDATION_SIZE - TRAIN_SIZE) lines become the validation set.

train_samples_path = "/content/drive/MyDrive/sentiment/train.ft.txt"

SPLIT_SEED = 0                        # fixed seed -> the same split on every run
TRAIN_SIZE = 780000                   # number of lines written to train_data.txt
TRAIN_PLUS_VALIDATION_SIZE = 1000000  # validation gets lines [TRAIN_SIZE, this)

# `with` guarantees the file is closed even if reading fails.
with open(train_samples_path, "r") as file:
    train = file.readlines()

# A last line without a trailing newline would be glued to the following line
# once the order is shuffled, so make every line newline-terminated first.
train = [line if line.endswith("\n") else line + "\n" for line in train]

# Use a private, seeded generator: the split is reproducible and the global
# `random` state used elsewhere is left untouched.
random.Random(SPLIT_SEED).shuffle(train)

with open("/content/drive/MyDrive/sentiment/train_data.txt", "w") as output:
    output.writelines(train[:TRAIN_SIZE])
with open("/content/drive/MyDrive/sentiment/validation_data.txt", "w") as output:
    output.writelines(train[TRAIN_SIZE:TRAIN_PLUS_VALIDATION_SIZE])


In [None]:
train_samples_path = "/content/drive/MyDrive/sentiment/train_data.txt"
validation_samples_path = "/content/drive/MyDrive/sentiment/validation_data.txt"
test_samples_path = "/content/drive/MyDrive/sentiment/test.ft.txt"


def load_samples(path):
    """Read one labelled text file and return ``(data, labels)``.

    Every non-empty line is parsed positionally: the character at index 9 is
    read as the label digit and everything from index 11 on is the text
    (presumably a fastText-style ``__label__N <text>`` line - verify against
    the data files).

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(data, labels)`` where ``data`` is a list of token lists
        (lower-cased text, with every "." surrounded by spaces, split by
        ``word_tokenize``) and ``labels`` is a list of ints equal to the
        label digit minus one.

    Raises:
        IndexError / ValueError: if a non-empty line is too short or the
            character at index 9 is not a digit.
    """
    data = []
    labels = []
    with open(path, "r") as file:  # closed automatically, even on error
        for line in file:
            # Strip only the newline, so a final line that has no trailing
            # newline does not lose its last real character.
            line = line.rstrip("\n")
            if not line:
                continue  # blank lines carry no sample
            text = line[11:].lower().replace(".", " . ")
            data.append(word_tokenize(text))
            labels.append(int(line[9]) - 1)
    return data, labels


train_data, train_label = load_samples(train_samples_path)
validation_data, validation_label = load_samples(validation_samples_path)
test_data, test_label = load_samples(test_samples_path)

#### Train data is 2D array and train label is 1D array

print("Sample 0 of train data : {}".format(train_data[0]))
print("Sample 0 of train label : {}".format(train_label[0]))

#### Validation data is 2D array and validation label is 1D array

print("Sample 0 of validation data : {}".format(validation_data[0]))
print("Sample 0 of validation label : {}".format(validation_label[0]))

#### Test data is 2D array and test label is 1D array

print("Sample 0 of test data : {}".format(test_data[0]))
print("Sample 0 of test label : {}".format(test_label[0]), end = "\n")


print("Lenght of train data is {}".format(len(train_data)))
print("Lenght of validation data is {}".format(len(validation_data)))
print("Lenght of test data is {}".format(len(test_data)))

In [None]:
### Creating Vocabulary from training data
# Result of this cell:
#   vocab         - list of tokens: 'UNK', 'PAD', then every kept word in order
#                   of first appearance in train_data
#   vocab_repeat  - occurrence count of each vocab entry (same positions)
#   vocab_remove  - words dropped for being too rare
#   train_data    - modified in place: every dropped word becomes 'UNK'

SPECIAL_TOKENS = ('UNK', 'PAD')

#### Vocabs with less than 10 repetitions will be replaced with UNK.
Minimum_of_repetition = 10

# One pass over the corpus with a hash-based counter. The list-based
# `word in vocab` / `vocab.index(word)` lookups this replaces rescanned the
# whole vocabulary for every single token.
# Counter keeps keys in first-seen order, which fixes the order of `vocab`.
word_counts = Counter(word for sample in train_data for word in sample)

print('done')

vocab = list(SPECIAL_TOKENS)
# Special tokens start from whatever count they have in the data (normally 0).
vocab_repeat = [word_counts.get(token, 0) for token in SPECIAL_TOKENS]
vocab_remove = []

for word, repetitions in word_counts.items():
    if word in SPECIAL_TOKENS:
        continue  # already in vocab; special tokens are never pruned as "rare"
    if repetitions < Minimum_of_repetition:
        vocab_remove.append(word)
    else:
        vocab.append(word)
        vocab_repeat.append(repetitions)

# Replace the rare words in the training samples (in place) with UNK.
rare_words = set(vocab_remove)
for sample in train_data:
    for index_word, word in enumerate(sample):
        if word in rare_words:
            sample[index_word] = 'UNK'

print("The size of the vocabulary is : {}".format(len(vocab)))
print("The vocab is : \n{}\n".format(vocab))
print("The vocab repeat is : \n{}\n".format(vocab_repeat))




In [None]:
## Evaluate sizes - calculating MIN, MAX, MEAN, STD of the train sample lengths
# The results are stored in max_sample_length / min_sample_length rather than
# in `max` / `min`, so the Python builtins stay usable in every later cell.

if not train_data:
    raise ValueError("train_data is empty; cannot compute length statistics")

# Number of tokens in every training sample.
sample_lengths = np.array([len(sample) for sample in train_data])

max_sample_length = int(sample_lengths.max())
min_sample_length = int(sample_lengths.min())
mean_size_of_samples = float(sample_lengths.mean())
# ndarray.std() defaults to the population standard deviation (divides by N).
std_of_samples = float(sample_lengths.std())

print("Train:  Max sample length is : {}".format(max_sample_length))
print("Train: Min sample length is : {}".format(min_sample_length))
print("Train: Mean sample length is : {}".format(mean_size_of_samples))
print("Train: Std of lenghts is : {}".format(std_of_samples))

# Upper bound on sample length: mean plus three standard deviations.
sU = mean_size_of_samples + 3*std_of_samples

print("Mean + 3 * Std  is : {}".format(sU))


### Check : How many train samples have length <= sU
count = int((sample_lengths <= int(sU)).sum())
print("It is {}% of samples".format(count/len(train_data) * 100 ))


Train:  Max sample length is : 501
Train: Min sample length is : 12
Train: Mean sample length is : 92.70222692307692
Train: Std of lenghts is : 50.23249259158384
Mean + 3 * Std  is : 243.39970469782844
It is 99.96358974358974% of samples


In [None]:
## ADD Padding and Cutting the long samples
seq_lenght = int(sU) + 1
print("The sequence lenght is {}".format(seq_lenght))


def pad_or_truncate(samples, length, pad_token='PAD'):
    """Force every sample in ``samples`` to contain exactly ``length`` tokens.

    ``samples`` is modified in place: a sample shorter than ``length`` gets
    ``pad_token`` appended until it reaches ``length``; any other sample is
    replaced by its first ``length`` tokens.

    Args:
        samples: List of token lists.
        length: Target number of tokens per sample.
        pad_token: Token appended to short samples.
    """
    for idx_sample, sample in enumerate(samples):
        missing = length - len(sample)
        if missing > 0:
            # Extend the existing list once instead of appending token by token.
            sample.extend([pad_token] * missing)
        else:
            samples[idx_sample] = sample[0:length]


# The same rule is applied to all three splits.
for dataset in (train_data, validation_data, test_data):
    pad_or_truncate(dataset, seq_lenght)


The sequence lenght is 244


In [None]:
## Convert Vocabs to their indexes in Vocab.
# Every token in the three splits is replaced, in place, by its position in
# `vocab`.

# Build the word -> index table once. `vocab.index(word)` rescans the whole
# vocabulary list for every token; a dict lookup does not.
# setdefault keeps the first position of a word, exactly like list.index.
word_to_index = {}
for position, token in enumerate(vocab):
    word_to_index.setdefault(token, position)

# Index used for words that are not in the vocabulary.
UNK_INDEX = word_to_index.get('UNK', 0)

# Train: every token is expected to be in the vocabulary, so the lookup is
# strict and raises KeyError on an unexpected token (for example when this
# cell is run a second time on already-converted data).
for sample in train_data:
    for idx_word, word in enumerate(sample):
        sample[idx_word] = word_to_index[word]

# Validation / test: UNK for vocabs that are not in the Vocab.
for dataset in (validation_data, test_data):
    for sample in dataset:
        for idx_word, word in enumerate(sample):
            sample[idx_word] = word_to_index.get(word, UNK_INDEX)



In [None]:
# Preview the first sample and its label for each of the three splits.
# Every *_data variable is a list of per-sample lists (2D); every *_label
# variable is a flat list (1D).

# Train split
print(
    "Sample 0 of train data : {}".format(train_data[0]),
    "Sample 0 of train label : {}".format(train_label[0]),
    sep="\n",
)

# Validation split
print(
    "Sample 0 of validation data : {}".format(validation_data[0]),
    "Sample 0 of validation label : {}".format(validation_label[0]),
    sep="\n",
)

# Test split
print(
    "Sample 0 of test data : {}".format(test_data[0]),
    "Sample 0 of test label : {}".format(test_label[0]),
    sep="\n",
)


In [None]:
# Persist the vocabulary, the three data splits and their labels with pickle.
# NOTE: the files hold binary pickle data despite the ".txt" extension, and
# the "train_lebel_new.txt" spelling is deliberately left as is so the output
# path does not change.
artifacts = [
    ("/content/drive/MyDrive/sentiment/vocab_new.txt", vocab),
    ("/content/drive/MyDrive/sentiment/train_data_indexed_new.txt", train_data),
    ("/content/drive/MyDrive/sentiment/validation_data_indexed_new.txt", validation_data),
    ("/content/drive/MyDrive/sentiment/test_data_indexed_new.txt", test_data),
    ("/content/drive/MyDrive/sentiment/train_lebel_new.txt", train_label),
    ("/content/drive/MyDrive/sentiment/validation_label_new.txt", validation_label),
    ("/content/drive/MyDrive/sentiment/test_label_new.txt", test_label),
]

# Written one after another, in the order listed above.
for target_path, payload in artifacts:
    with open(target_path, "wb") as fp:
        pickle.dump(payload, fp)

print("Done !!!")


Done !!!
