# Word-level one-hot encoding

In [1]:
from pathlib import Path

# Input location, relative to the current working directory.
data = Path('data')
movie_lines_file = Path(data, '100lines.txt')

In [2]:
# Slurp the whole file into a single string (text mode, default encoding);
# read_text() opens and closes the file handle itself.
movie_lines_raw = movie_lines_file.read_text()

In [3]:
# Eyeball the raw text: lines are separated by '\n' and still carry punctuation.
movie_lines_raw

'They do not!\nThey do to!\nI hope so.\nShe okay?\nLet\'s go.\nWow\nOkay -- you\'re gonna need to learn how to lie.\nNo\nI\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\nLike my fear of wearing pastels?\nThe "real you".\nWhat good stuff?\nI figured you\'d get to the good stuff eventually.\nThank God!  If I had to hear one more story about your coiffure...\nMe.  This endless ...blonde babble. I\'m like, boring myself.\nWhat crap?\ndo you listen to this crap?\nNo...\nThen Guillermo says, "If you go any lighter, you\'re gonna look like an extra on 90210."\nYou always been this selfish?\nBut\nThen that\'s all you had to say.\nWell, no...\nYou never wanted to go out with \'me, did you?\nI was?\nI looked for you back at the party, but you always seemed to be "occupied".\nTons\nHave fun tonight?\nI believe we share an art instructor\nYou know Chastity?\nLooks like things worked out tonight, huh?\nHi.\nWho knows?  All I\'ve ever heard her

# Processing

In [4]:
import string
import re

# Translation table that *deletes* every character in string.punctuation
# (ASCII punctuation). Despite the name, it removes punctuation rather than
# selecting alphabetic characters; digits and non-ASCII symbols are kept.
alpha_characters = str.maketrans('', '', string.punctuation)

def clean_tokenize(text):
    """Lower-case ``text``, drop ASCII punctuation and split it into words.

    Line breaks act as ordinary word separators. (An earlier version replaced
    each newline with a ``'*** '`` marker, but the punctuation table removed
    the asterisks again, so the marker never reached the output.)

    Apostrophes are punctuation too, so contractions collapse into a single
    token: ``"you're"`` becomes ``'youre'``.

    Parameters
    ----------
    text : str
        Raw text, possibly spanning several lines.

    Returns
    -------
    list of str
        The words in their original order. The list never contains empty
        strings; empty or whitespace/punctuation-only input yields ``[]``.
    """
    text = text.lower().translate(alpha_characters)
    # split() with no argument splits on any run of whitespace (spaces,
    # newlines, tabs, '\r') and ignores leading/trailing whitespace. Splitting
    # on a literal ' ' instead would emit '' tokens for a trailing newline or
    # leading blanks, and those would become a bogus vocabulary entry.
    return text.split()

# Tokenize the whole file at once into one flat list of words.
movie_lines = clean_tokenize(movie_lines_raw)

In [5]:
# Inspect the tokens: lower-cased, punctuation removed, in original order.
movie_lines

['they',
 'do',
 'not',
 'they',
 'do',
 'to',
 'i',
 'hope',
 'so',
 'she',
 'okay',
 'lets',
 'go',
 'wow',
 'okay',
 'youre',
 'gonna',
 'need',
 'to',
 'learn',
 'how',
 'to',
 'lie',
 'no',
 'im',
 'kidding',
 'you',
 'know',
 'how',
 'sometimes',
 'you',
 'just',
 'become',
 'this',
 'persona',
 'and',
 'you',
 'dont',
 'know',
 'how',
 'to',
 'quit',
 'like',
 'my',
 'fear',
 'of',
 'wearing',
 'pastels',
 'the',
 'real',
 'you',
 'what',
 'good',
 'stuff',
 'i',
 'figured',
 'youd',
 'get',
 'to',
 'the',
 'good',
 'stuff',
 'eventually',
 'thank',
 'god',
 'if',
 'i',
 'had',
 'to',
 'hear',
 'one',
 'more',
 'story',
 'about',
 'your',
 'coiffure',
 'me',
 'this',
 'endless',
 'blonde',
 'babble',
 'im',
 'like',
 'boring',
 'myself',
 'what',
 'crap',
 'do',
 'you',
 'listen',
 'to',
 'this',
 'crap',
 'no',
 'then',
 'guillermo',
 'says',
 'if',
 'you',
 'go',
 'any',
 'lighter',
 'youre',
 'gonna',
 'look',
 'like',
 'an',
 'extra',
 'on',
 '90210',
 'you',
 'always',
 'be

In [6]:
import numpy as np

# Arrange the tokens as a column vector: one row per token, a single column.
movie_line_array = np.array(movie_lines).reshape(-1, 1)
movie_line_array.shape

(834, 1)

In [7]:
# Column-vector view of the tokens (one single-element row per token).
movie_line_array

array([['they'],
       ['do'],
       ['not'],
       ['they'],
       ['do'],
       ['to'],
       ['i'],
       ['hope'],
       ['so'],
       ['she'],
       ['okay'],
       ['lets'],
       ['go'],
       ['wow'],
       ['okay'],
       ['youre'],
       ['gonna'],
       ['need'],
       ['to'],
       ['learn'],
       ['how'],
       ['to'],
       ['lie'],
       ['no'],
       ['im'],
       ['kidding'],
       ['you'],
       ['know'],
       ['how'],
       ['sometimes'],
       ['you'],
       ['just'],
       ['become'],
       ['this'],
       ['persona'],
       ['and'],
       ['you'],
       ['dont'],
       ['know'],
       ['how'],
       ['to'],
       ['quit'],
       ['like'],
       ['my'],
       ['fear'],
       ['of'],
       ['wearing'],
       ['pastels'],
       ['the'],
       ['real'],
       ['you'],
       ['what'],
       ['good'],
       ['stuff'],
       ['i'],
       ['figured'],
       ['youd'],
       ['get'],
       ['to'],
       ['the'],
 

In [8]:
from sklearn import preprocessing

In [10]:
# Map every distinct word to an integer id.
# LabelEncoder is defined for 1-D input. Handing it the (n, 1) column makes
# scikit-learn flatten it implicitly and emit a DataConversionWarning, so the
# column is flattened explicitly here; the resulting labels are unchanged.
labelEncoder = preprocessing.LabelEncoder()
movie_line_labels = labelEncoder.fit_transform(movie_line_array.ravel())


In [11]:
# Each token is now an integer id; repeated words share the same id.
movie_line_labels

array([313,  77, 219, 313,  77, 322, 157, 151, 285, 278, 224, 184, 113,
       361, 224, 367, 118, 213, 322, 180, 154, 322, 185, 218, 161, 175,
       364, 177, 154, 290, 364, 173,  26, 317, 236,  10, 364,  80, 177,
       154, 322, 255, 188, 208,  99, 222, 342, 235, 309, 259, 364, 348,
       119, 297, 157, 100, 365, 109, 322, 309, 119, 297,  91, 306, 114,
       159, 157, 128, 322, 137, 226, 206, 296,   2, 366,  53, 200, 317,
        89,  31,  21, 161, 188,  35, 209, 348,  62,  77, 364, 191, 322,
       317,  62, 218, 310, 123, 268, 159, 364, 113,  12, 187, 367, 118,
       193, 188,   9,  96, 225,   0, 364,   8,  27, 317, 275,  40, 310,
       308,   7, 364, 128, 322, 267, 345, 218, 364, 214, 339, 322, 113,
       231, 354, 200,  72, 364, 157, 340, 157, 194, 102, 364,  22,  18,
       309, 234,  40, 364,   8, 273, 322,  24, 221, 324, 133, 106, 323,
       157,  30, 341, 277,   9,  15, 165, 364, 177,  48, 195, 188, 315,
       358, 231, 323, 156, 145, 352, 178,   7, 170,  92, 138, 14

In [12]:
# Check which container type the label encoder returned.
type(movie_line_labels)

numpy.ndarray

In [13]:
# The labels form a flat 1-D vector with one entry per token.
movie_line_labels.shape

(834,)

In [14]:
# OneHotEncoder takes 2-D (n_samples, n_features) input, so the flat label
# vector is viewed as a single column before encoding.
label_column = movie_line_labels[:, None]

wordOneHotEncoder = preprocessing.OneHotEncoder()
line_onehot = wordOneHotEncoder.fit_transform(label_column)

In [16]:
# Rows correspond to tokens, columns to distinct label values.
line_onehot.shape

(834, 368)

In [17]:
# The encoder's output is a SciPy sparse matrix, not a dense ndarray.
type(line_onehot)

scipy.sparse.csr.csr_matrix

In [19]:
# Converting to a dense array keeps the same shape.
line_onehot.toarray().shape

(834, 368)

In [20]:
# Dense view of the one-hot matrix; the abbreviated repr shows only zeros.
line_onehot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])