# Pre-process all the data

In [1]:
# Corpus reader:
import numpy as np
import random

import os
root = './Confs_newline/Conf1/'
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader(root, r'.*\.txt', cat_pattern=r'(\w+)/*', encoding='utf-8')

In [2]:
print(reader.categories())
print(reader.fileids())

['kiz', 'kork', 'mutlu']
['kiz.txt', 'kork.txt', 'mutlu.txt']


In [3]:
### First, tokenize Punctuation: 
# create a token dictionary:
punc_dict= {'.':'||PERIOD||', ',': '||COMMA||', '"': '||QUOTATION_MARK||', ';': '||SEMICOLON||',
                '!': '||EXCLAMATION_MARK||', '?': '||QUESTION_MARK||', '(': '||LEFT_PAREN||',
                ')': '||RIGHT_PAREN||', '?': '||QUESTION_MARK||', 
                '\n': '||NEW_LINE||', '-': '||DASH||'}

In [4]:
def sent_tokenize_whole_tweets(text): # raw text --> whole tweets file content
    for key, token in punc_dict.items():
        text = text.replace(key, ' {} '.format(token))

    sentences= []
    for line in text.split('||NEW_LINE||'):
        line= line.strip()
        sentences.append(line)
    return sentences

In [5]:
all_text=[]
labels= []

In [6]:
for label,file_name in zip(reader.categories(), reader.fileids()):
    sentences= sent_tokenize_whole_tweets(reader.raw(file_name)) # --> this should return a list of contents
    labels.extend([label for i in sentences])
    all_text.extend([i.lower() for i in sentences])

# Now, we have all tweets in all_text list!

## Transforming Text into Numbers

In [7]:
word_counts={}
for i in range(len(all_text)):
    for word in all_text[i].split(" "):
        word_counts[word] = word_counts.get(word,0) +1

vocab = set(word_counts.keys())
vocab_size = len(vocab)

print("Number of unique words: {} ".format(vocab_size))
print("Number of tweets: {}".format(len(all_text)))
assert len(all_text)== len(labels), "Each tweet should have a label."

sorted_word_counts= sorted(word_counts, key= word_counts.get, reverse= True)

int_to_vocab= {ii: word for ii,word in enumerate(sorted_word_counts)}
vocab_to_int= {word: ii for ii, word in int_to_vocab.items()}

Number of unique words: 7426 
Number of tweets: 2170


In [8]:
all_text[2]

'hayır çok severim diziyi yani aşırı kızdım'

In [9]:
int_to_vocab[0]

''

# Reducing Noise in the Input Data

## Subsampling

Words that show up often such as "the", "of", and "for" don't provide much context to the nearby words. If we discard some of them, we can remove some of the noise from our data and in return get faster training and better representations. This process is called subsampling by Mikolov. For each word $w_i$ in the training set, we'll discard it with probability given by 

$$ P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}} $$

where $t$ is a threshold parameter and $f(w_i)$ is the frequency of word $w_i$ in the total dataset.

$$ P(0) = 1 - \sqrt{\frac{1*10^{-5}}{1*10^6/16*10^6}} = 0.98735 $$

In [10]:
int_words = [vocab_to_int[word] for word in vocab]
print(int_words[:5])

[0, 4175, 2857, 2991, 4747]


In [11]:
threshold = 1e-4

word_counts_intwords = {vocab_to_int[word]:count for word,count in word_counts.items()}
total_count = vocab_size
freqs = {word: count/total_count for word, count in word_counts_intwords.items()}
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts_intwords}
# discard some frequent words, according to the subsampling equation
# create a new list of words for training

selected_words = [word for word in word_counts_intwords if random.random() < (1 - p_drop[word])]
print(selected_words[:30])

[2139, 2140, 142, 2143, 97, 2144, 1225, 2145, 2146, 842, 632, 1226, 172, 2148, 18, 2149, 406, 2150, 844, 1229, 2152, 2153, 636, 2154, 2155, 1231, 2156, 2157, 63, 2159]


In [12]:
len(selected_words)

5552

In [13]:
all_text[0]

'çok kızdım ne kadar ayıp şey'

In [14]:
for i in range(len(all_text)):
    line_split= all_text[i].split(" ")
    line_split_reduced= [word for word in line_split if vocab_to_int[word] in selected_words]
    all_text[i]= ' '.join(line_split_reduced)


# Delete empty tweets:

In [15]:
len(all_text)

2170

In [17]:
all_text= [line for line in all_text if len(line)>0]
labels= [labels[i] for i in range(len(all_text)) if len(all_text[i])>0]

print("Number of tweets after reducing some noise: {}".format(len(all_text)))
assert len(all_text)== len(labels), "Each tweet should have a label."

Number of tweets after reducing some noise: 2059


# Encoding the words
The embedding lookup requires that we pass in integers to our network. The easiest way to do this is to create dictionaries that map the words in the vocabulary to integers. Then we can convert each of our reviews into integers so they can be passed into the network.

In [18]:
word_counts={}
for i in range(len(all_text)):
    for word in all_text[i].split(" "):
        word_counts[word] = word_counts.get(word,0) +1

vocab = set(word_counts.keys())
vocab_size = len(vocab)
print("Number of unique words: {} ".format(vocab_size))

sorted_word_counts= sorted(word_counts, key= word_counts.get, reverse= True)

int_to_vocab= {ii: word for ii,word in enumerate(sorted_word_counts, 1)} #start from 1.
vocab_to_int= {word: ii for ii, word in int_to_vocab.items()}


Number of unique words: 5552 


In [19]:
## use the dict to tokenize each review in reviews_split
## store the tokenized reviews in reviews_ints
encoded_tweets = []
for tweet in all_text:
    encoded_tweets.append([vocab_to_int[word] for word in tweet.split(" ")])

# Encoding the labels

In [24]:
def map_label(l):
    if l== 'kiz':
        return 0
    elif l== 'kork':
        return 1
    return 2  

In [25]:
encoded_labels = np.array([map_label(label) for label in labels])

# Padding sequences

To deal with both short and very long reviews, we'll pad or truncate all our reviews to a specific length. For reviews shorter than some seq_length, we'll pad with 0s. For reviews longer than seq_length, we can truncate them to the first seq_length words. A good seq_length, in this case, is 200.

    Exercise: Define a function that returns an array features that contains the padded data, of a standard size, that we'll pass to the network.

        The data should come from review_ints, since we want to feed integers to the network.
        Each row should be seq_length elements long.
        For reviews shorter than seq_length words, left pad with 0s. That is, if the review is ['best', 'movie', 'ever'], [117, 18, 128] as integers, the row will look like [0, 0, 0, ..., 0, 117, 18, 128].
        For reviews longer than seq_length, use only the first seq_length words as the feature vector.

As a small example, if the seq_length=10 and an input review is:

[117, 18, 128]

The resultant, padded sequence should be:

[0, 0, 0, 0, 0, 0, 0, 117, 18, 128]

Your final features array should be a 2D array, with as many rows as there are reviews, and as many columns as the specified seq_length.

This isn't trivial and there are a bunch of ways to do this. But, if you're going to be building your own deep learning networks, you're going to have to get used to preparing your data.

In [29]:
max_lenght= max([len(i) for i in encoded_tweets])
max_lenght

14

In [30]:
min_lenght= min([len(i) for i in encoded_tweets])
min_lenght

1

In [31]:
def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    # getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    # for each review, I grab that review and 
    for i, row in enumerate(reviews_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

In [32]:
# Test your implementation!

seq_length = 10

features = pad_features(encoded_tweets, seq_length=seq_length)

## test statements - do not change - ##
assert len(features)==len(encoded_tweets), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

In [40]:
features.shape[0]

2059

In [41]:
features.shape[1]

10