# Pre-process all the data

Running the code cell below will pre-process all the data and save it to file. You're encouraged to lok at the code for `preprocess_and_save_data` in the `helpers.py` file to see what it's doing in detail, but you do not need to change this code.

In [60]:
# Corpus reader:
import os
root = './Confs_newline/Conf2/'
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
reader = CategorizedPlaintextCorpusReader(root, r'.*\.txt', cat_pattern=r'(\w+)/*', encoding='utf-8')

In [61]:
print(reader.categories())
print(reader.fileids())

['kiz', 'kork', 'mutlu', 'notr', 'uzul']
['kiz.txt', 'kork.txt', 'mutlu.txt', 'notr.txt', 'uzul.txt']


In [62]:
### First, tokenize Punctuation: 
# create a token dictionary:
punc_dict= {'.':'||PERIOD||', ',': '||COMMA||', '"': '||QUOTATION_MARK||', ';': '||SEMICOLON||',
                '!': '||EXCLAMATION_MARK||', '?': '||QUESTION_MARK||', '(': '||LEFT_PAREN||',
                ')': '||RIGHT_PAREN||', '?': '||QUESTION_MARK||', 
                '\n': '||NEW_LINE||', '-': '||DASH||'}

In [63]:
def sent_tokenize_whole_tweets(text): # raw text --> whole tweets file content
    for key, token in punc_dict.items():
        text = text.replace(key, ' {} '.format(token))

    sentences= []
    for line in text.split('||NEW_LINE||'):
        line= line.strip()
        sentences.append(line)
    return sentences

In [64]:
all_text=[]
labels= []

In [65]:
for label,file_name in zip(reader.categories(), reader.fileids()):
    sentences= sent_tokenize_whole_tweets(reader.raw(file_name)) # --> this should return a list of contents
    labels.extend([label for i in sentences])
    all_text.extend([i.lower() for i in sentences])
print(len(labels))
print(len(all_text))
# Now, we have all tweets in all_text list!

3317
3317


## Transforming Text into Numbers

In [66]:
word_counts={}
for i in range(len(all_text)):
    for word in all_text[i].split(" "):
        word_counts[word] = word_counts.get(word,0) +1

vocab = set(word_counts.keys())
vocab_size = len(vocab)
print("Number of unique words: {} ".format(vocab_size))

sorted_word_counts= sorted(word_counts, key= word_counts.get, reverse= True)

int_to_vocab= {ii: word for ii,word in enumerate(sorted_word_counts)}
vocab_to_int= {word: ii for ii, word in int_to_vocab.items()}


Number of unique words: 3704 


In [67]:
all_text[0]

'çok kız ||exclamation_mark||  ne kadar ayıp şey'

In [68]:
labels[0]

'kiz'