In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import math
import os
import torch

In [3]:
def dip_load_data(f_path, f_name):
    '''Read a UTF-8 text file and return its whole content in lower case.

    Input
    f_path: string - directory that contains the file
    f_name: string - file name
    Output
    raw text: string - entire file content, lower-cased
    Raises
    OSError (e.g. FileNotFoundError) if the file cannot be opened
    UnicodeDecodeError if the file is not valid UTF-8
    '''

    # os.path.join copes with a trailing separator and with an empty
    # directory, where plain '+' concatenation would point at the root.
    file = os.path.join(f_path, f_name)

    # The context manager closes the handle even when read() raises;
    # a bare open(...).read() leaves closing to the garbage collector.
    with open(file, 'r', encoding='utf-8') as handle:
        raw_text = handle.read().lower()
    return raw_text

# Demo / notebook driver: loads the corpus into the global `data`, which the
# following cells consume.
if __name__ == '__main__':
    # NOTE(review): absolute, machine-specific path - this only runs on the
    # author's machine; consider a configurable data directory.
    data = dip_load_data('/Users/danielboda', 'wonderland.txt')
    # Show only the first 100 characters as a sanity check of the load.
    print(data[:100])

alice’s adventures in wonderland

by lewis carroll

chapter i.
down the rabbit-hole


alice was begi


In [4]:
def dip_chars_dict(text):
    '''Build the sorted character vocabulary of a text plus both lookup tables.

    Input
    text: string - raw text
    Output
    chars: list - unique characters of the text in sorted order
    char_to_int: dict - character -> position in chars
    int_to_char: dict - position in chars -> character
    '''

    vocabulary = list(set(text))
    vocabulary.sort()

    # Fill both directions in a single pass so they are inverse by construction.
    forward = {}
    backward = {}
    for index, symbol in enumerate(vocabulary):
        forward[symbol] = index
        backward[index] = symbol

    return vocabulary, forward, backward

# Demo / notebook driver: derives the vocabulary and both lookup tables from
# the global `data` and binds them as globals for the following cells.
if __name__ == '__main__':
    chars, char_to_int, int_to_char = dip_chars_dict(data)
    # sep='\n' puts the list and each dict on its own output line.
    print( chars, char_to_int, int_to_char, sep = '\n')

['\n', ' ', '!', '(', ')', '*', ',', '-', '.', ':', ';', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ù', '—', '‘', '’', '“', '”']
{'\n': 0, ' ': 1, '!': 2, '(': 3, ')': 4, '*': 5, ',': 6, '-': 7, '.': 8, ':': 9, ';': 10, '?': 11, '[': 12, ']': 13, '_': 14, 'a': 15, 'b': 16, 'c': 17, 'd': 18, 'e': 19, 'f': 20, 'g': 21, 'h': 22, 'i': 23, 'j': 24, 'k': 25, 'l': 26, 'm': 27, 'n': 28, 'o': 29, 'p': 30, 'q': 31, 'r': 32, 's': 33, 't': 34, 'u': 35, 'v': 36, 'w': 37, 'x': 38, 'y': 39, 'z': 40, 'ù': 41, '—': 42, '‘': 43, '’': 44, '“': 45, '”': 46}
{0: '\n', 1: ' ', 2: '!', 3: '(', 4: ')', 5: '*', 6: ',', 7: '-', 8: '.', 9: ':', 10: ';', 11: '?', 12: '[', 13: ']', 14: '_', 15: 'a', 16: 'b', 17: 'c', 18: 'd', 19: 'e', 20: 'f', 21: 'g', 22: 'h', 23: 'i', 24: 'j', 25: 'k', 26: 'l', 27: 'm', 28: 'n', 29: 'o', 30: 'p', 31: 'q', 32: 'r', 33: 's', 34: 't', 35: 'u', 36: 'v', 37: 'w', 38: 'x', 39: '

In [5]:
def dip_create_data(text, chars, char_to_int, seq_length = 100, verbose = False):
    '''Slice a text into fixed-length integer windows and their next-character labels.

    Window i holds the codes of text[i:i + seq_length]; its label is the code
    of text[i + seq_length]. Windows advance one character at a time.

    Input
    text: string - raw text
    chars: list - vocabulary characters (only its length is used)
    char_to_int: dict - mapping from character to integer
    seq_length: int - sequence length (example: XXXY/First three training/fourth label => seq_length = 3)
    verbose: bool - print the character, vocabulary and pattern counts
    Output
    X: 2d list - training samples, one list of seq_length integers per pattern
    y: list - label vector, one integer per pattern
    n_patterns: int - number of (window, label) pairs; 0 if the text is not longer than seq_length
    n_vocab: int - len(chars)
    Raises
    ValueError if seq_length is negative
    KeyError if a character of text is missing from char_to_int
    '''

    # A negative length would silently yield meaningless windows through
    # negative indexing, so reject it explicitly.
    if seq_length < 0:
        raise ValueError('seq_length must be non-negative, got {}'.format(seq_length))

    n_chars = len(text)
    n_vocab = len(chars)
    n_patterns = max(n_chars - seq_length, 0)

    # Encode the text once and slice the integer list, instead of looking up
    # every character again for each window it falls in. Skipped when no
    # window fits, so a too-short text is never looked up at all.
    encoded = [char_to_int[char] for char in text] if n_patterns else []

    X = [encoded[i:i + seq_length] for i in range(n_patterns)]
    # The label of window i is the character right after it, i.e. encoded[i + seq_length].
    y = encoded[seq_length:]

    if verbose:
        print("\nTotal Characters: ", n_chars, "Total Vocab: ", n_vocab, "Total Patterns: ", n_patterns, '\n', sep = '\n')

    return X, y, n_patterns, n_vocab

# Demo / notebook driver: builds the training windows from the global `data`.
# seq_length is not passed, so the function default applies.
if __name__ == '__main__':
    X, y, n_patterns, n_vocab = dip_create_data(data, chars, char_to_int, verbose = True)   


Total Characters: 
144059
Total Vocab: 
47
Total Patterns: 
143959




In [6]:
def dip_normalization_reshape(X, y, n_patterns, n_vocab, seq_length, verbose = False):
    '''Turn the integer windows and labels into tensors and scale X by the vocabulary size.

    Input
    X: 2d list - training data, n_patterns rows of seq_length integers
    y: list - label vector
    n_patterns: int - number of rows in X
    n_vocab: int - number of vocab, used as the divisor for X
    seq_length: int - sequence length (example: XXXY/First three training/fourth label => seq_length = 3)
    verbose: bool - print the resulting tensor shapes
    Output
    X: 3d float32 tensor - training data of shape (n_patterns, seq_length, 1), divided by n_vocab
    y: 1d tensor - label vector
    '''

    target_shape = (n_patterns, seq_length, 1)
    scale = float(n_vocab)

    # The trailing axis of size 1 is the per-step feature dimension.
    features = torch.reshape(torch.tensor(X, dtype=torch.float32), target_shape)
    features = features / scale
    labels = torch.tensor(y)

    if verbose:
        print('\nSize of X:', features.shape, 'Size of y:', labels.shape, '\n')

    return features, labels

# Demo / notebook driver: converts the windowed data to tensors.
# NOTE(review): the literal 100 must equal the seq_length used when X was
# built (the dip_create_data default); consider a shared constant.
# NOTE(review): X and y are rebound from lists to tensors, so re-running this
# cell without re-running the previous one passes already-normalized tensors
# back in and divides X by n_vocab a second time.
if __name__ == '__main__':
    X, y = dip_normalization_reshape(X, y, n_patterns, n_vocab, 100, verbose = True) 


Size of X: torch.Size([143959, 100, 1]) Size of y: torch.Size([143959]) 

