In [45]:
#!g1.1
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

import string
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
import annoy
from gensim.models import Word2Vec, FastText
import pickle
from tqdm import tqdm_notebook
import gensim.downloader as api

from collections import Counter

import tensorflow as tf

import os
import math

In [46]:
#!g1.1
# Load the eight per-season transcript CSVs. Every file is decoded with the
# same 'unicode_escape' codec and bound to its own s<N> name.
s1, s2, s3, s4, s5, s6, s7, s8 = (
    pd.read_csv(csv_path, encoding='unicode_escape')
    for csv_path in (
        'House MD/season1.csv',
        'House MD/season2.csv',
        'House MD/season3.csv',
        'House MD/season4.csv',
        'House MD/season5.csv',
        'House MD/season6.csv',
        'House MD/season7.csv',
        'House MD/season8.csv',
    )
)

In [47]:
#!g1.1
# Stack the per-season frames vertically, in season order. Each frame keeps
# its own row labels, so index values repeat from one season to the next.
all_transcripts = [s1, s2, s3, s4, s5, s6, s7, s8]
house_md_df = pd.concat(objs=all_transcripts, axis=0)

In [48]:
#!g1.1
# Preview the combined transcript frame (pandas elides the middle rows).
house_md_df

Unnamed: 0,name,line
0,Melanie,Why are you late?
1,Rebecca,You're not going to like the answer.
2,Melanie,I already know the answer.
3,Rebecca,I missed the bus.
4,Melanie,"I don't doubt it, no bus stops near Brad's. Y..."
...,...,...
8034,House,Just switched the dental records.
8035,Wilson,You're destroying your entire life. You can't...
8036,House,"I'm dead, Wilson. How do you want to spend yo..."
8037,Wilson,When the cancer starts getting really bad!


In [49]:
#!g1.1
# Text-preprocessing helpers: the ASCII punctuation characters and the English
# stop-word list (both as sets), plus a pymorphy2 morphological analyzer.
# NOTE(review): none of these is referenced by the cells visible here.
exclude = set(string.punctuation)
sw = set(get_stop_words("en"))
morpher = MorphAnalyzer()

In [50]:
#!g1.1
# Glue every string-valued `line` cell into one long training text.
# Non-string cells (presumably NaN from empty rows) are skipped, and no
# separator is inserted between consecutive lines.
corpus = ''.join(
    utterance for utterance in house_md_df.line if isinstance(utterance, str)
)

In [51]:
#!g1.1
class CharDataset:
    """Character-level dataset that cuts a text into (input, target) windows.

    The text is walked with a stride of ``block_size`` characters. Each window
    holds ``block_size + 1`` characters: the first ``block_size`` form the
    input and the same span shifted right by one forms the target, so
    neighbouring windows share exactly one character.

    Calling or iterating an instance returns a fresh generator over the
    pre-built ``(x, y)`` tensor pairs, which lets the instance itself be
    handed to ``tf.data.Dataset.from_generator``.

    Attributes:
        stoi: mapping from character to integer id (ids follow sorted order).
        itos: inverse mapping from integer id to character.
        block_size: number of characters in each input/target sequence.
        vocab_size: number of distinct characters in ``data``.
        data: the raw text.
        tensors: list of ``(x, y)`` int32 tensor pairs, each of length
            ``block_size``.
    """

    def __init__(self, data, block_size):
        """Build the vocabulary and materialise every training window.

        Args:
            data: text to learn from.
            block_size: length of each input/target sequence; must be >= 1.

        Raises:
            ValueError: if ``block_size`` is smaller than 1.
        """
        if block_size < 1:
            raise ValueError('block_size must be a positive integer')

        chars = sorted(set(data))
        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data

        self.tensors = []
        for n in range(len(self)):
            start = n * self.block_size
            # One extra character so the target can be the input shifted by one.
            chunk = self.data[start:start + self.block_size + 1]
            dix = [self.stoi[s] for s in chunk]
            x = tf.convert_to_tensor(dix[:-1], dtype=tf.int32)
            y = tf.convert_to_tensor(dix[1:], dtype=tf.int32)
            self.tensors.append((x, y))

    def convert_text(self, text):
        """Encode ``text`` as a 1-D int32 tensor of character ids.

        Raises:
            KeyError: if ``text`` contains a character absent from the
                training text.
        """
        dix = [self.stoi[s] for s in text]

        return tf.convert_to_tensor(dix, dtype=tf.int32)

    def __len__(self):
        """Return the number of complete windows available in the text.

        Window ``n`` spans ``data[n*block_size : n*block_size + block_size + 1]``,
        so it is complete only while ``n*block_size + block_size + 1 <= len(data)``.
        Counting with the stride (rather than the window length) uses the
        whole text and never produces a truncated final window.
        """
        return max(0, (len(self.data) - 1) // self.block_size)

    def __iter__(self):
        """Yield the pre-built ``(x, y)`` pairs in text order."""
        yield from self.tensors

    # from_generator needs a callable that returns an iterator.
    __call__ = __iter__

In [52]:
#!g1.1
# Every training sequence is 128 characters long. CharDataset builds the
# vocabulary and all (input, target) windows from the corpus up front.
block_size = 128
train_dataset_gen = CharDataset(data=corpus, block_size=block_size)

data has 5248653 characters, 148 unique.


2023-06-06 13:06:56.567054: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-06 13:06:59.796748: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30995 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:8c:00.0, compute capability: 7.0


In [53]:
#!g1.1
# Wrap the pre-built windows in a tf.data pipeline. `train_dataset_gen` is
# callable (CharDataset.__call__ is its __iter__), which is what
# from_generator expects. Each element is an (input, target) pair of int32
# vectors of length `block_size`.
# The shapes are spelled as one-element tuples: `(block_size)` without the
# trailing comma is just a parenthesised int.
train_dataset = tf.data.Dataset.from_generator(
    train_dataset_gen,
    output_signature=(
        tf.TensorSpec(shape=(block_size,), dtype=tf.int32),
        tf.TensorSpec(shape=(block_size,), dtype=tf.int32),
    ),
)

In [54]:
#!g1.1
# Number of sequences per training batch.
BATCH_SIZE = 64

# Size of the shuffle buffer. tf.data is built for possibly unbounded streams,
# so it never shuffles the whole sequence in memory; it keeps a buffer of this
# many elements and draws randomly from it.
BUFFER_SIZE = 10000

# Shuffle first, then batch; the final incomplete batch is dropped so every
# batch has exactly BATCH_SIZE rows.
dataset = (
    train_dataset
    .shuffle(buffer_size=BUFFER_SIZE)
    .batch(batch_size=BATCH_SIZE, drop_remainder=True)
)

In [55]:
#!g1.1
# Character vocabulary: the distinct characters of the corpus in sorted order.
vocab = list(set(corpus))
vocab.sort()
vocab_size = len(vocab)

# Width of the character embedding vectors.
embedding_dim = 128

# Number of units in each recurrent layer.
rnn_units = 1024

In [56]:
#!g1.1
def build_model_lstm(vocab_size, embedding_dim, rnn_units, batch_size, num_layers=4):
    """Build a stacked, stateful LSTM character-level language model.

    The network is an embedding layer, ``num_layers`` identical LSTM layers
    that each return the full sequence, and a dense layer producing one logit
    per vocabulary entry at every timestep.

    Args:
        vocab_size: number of distinct character ids; used as the embedding
            input size and as the width of the output layer.
        embedding_dim: size of each embedding vector.
        rnn_units: number of units in every LSTM layer.
        batch_size: fixed batch size baked into the input shape (the LSTM
            layers are created with ``stateful=True``).
        num_layers: how many LSTM layers to stack. Defaults to 4, the depth
            this function always used before the parameter existed.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        raise ValueError('num_layers must be at least 1')

    layers = [
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
    ]
    # Identical recurrent layers, created in stacking order.
    layers.extend(
        tf.keras.layers.LSTM(rnn_units,
                             return_sequences=True,
                             stateful=True,
                             recurrent_initializer='glorot_uniform')
        for _ in range(num_layers)
    )
    layers.append(tf.keras.layers.Dense(vocab_size))

    return tf.keras.Sequential(layers)

In [57]:
#!g1.1
# Training model: the input shape is fixed to batches of BATCH_SIZE sequences.
model_lstm = build_model_lstm(len(vocab), embedding_dim, rnn_units, BATCH_SIZE)

In [58]:
#!g1.1
def loss(labels, logits):
    """Sparse categorical cross-entropy computed directly on raw logits.

    Args:
        labels: integer class ids (the target characters).
        logits: unnormalised model outputs; ``from_logits=True`` tells Keras
            that no softmax has been applied yet.

    Returns:
        The result of ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(y_true=labels, y_pred=logits, from_logits=True)

In [59]:
#!g1.1
# Train with Adam against the logits-based cross-entropy `loss`.
model_lstm.compile(loss=loss, optimizer='adam')

In [60]:
#!g1.1
# Checkpoints are written under this directory; the file name embeds the
# epoch number through the "{epoch}" placeholder.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save weights only, every 88 * 3 = 264 batches (an integer save_freq counts
# batches, not epochs).
# NOTE(review): where the 88 comes from is not visible here — confirm it still
# matches the intended checkpoint cadence.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix,
    save_weights_only=True,
    save_freq=88 * 3,
)

In [69]:
#!g1.1
# Train for EPOCHS passes over the batched dataset, checkpointing along the
# way, then store the final weights under a fixed name.
EPOCHS = 10
history = model_lstm.fit(
    x=dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)
model_lstm.save_weights(filepath="bot_lstm_weights")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [65]:
#!g1.1
# Lookup tables between characters and their integer ids; an id is the
# character's position in `vocab`.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

In [74]:
#!g1.1
def generate_text2(model, start_string, temperature=0.5, max_chars=1000,
                   stop_chars=('!', '.', '?')):
    """Continue ``start_string`` one sampled character at a time.

    The prompt is fed through ``model`` once to prime its recurrent state;
    afterwards each sampled character is fed back as the next input. Sampling
    stops as soon as a character from ``stop_chars`` is produced, or when
    ``max_chars`` characters have been generated.

    Character/id conversion uses the module-level ``train_dataset_gen``.

    Args:
        model: stateful Keras model built for batch size 1 that returns
            per-timestep logits.
        start_string: non-empty prompt; every character must be in the
            training vocabulary.
        temperature: divisor applied to the logits before sampling. Lower
            values give more predictable text, higher values more surprising
            text. Must be positive.
        max_chars: upper bound on the number of generated characters, so that
            generation ends even if no stop character is ever sampled. Pass
            ``None`` to generate until a stop character appears.
        stop_chars: characters that end the generated sentence.

    Returns:
        ``start_string`` followed by the generated characters, including the
        terminating stop character when one was sampled.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: if ``start_string`` contains a character outside the
            vocabulary.
    """
    if not start_string:
        raise ValueError('start_string must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # Encode the prompt and add the batch dimension: shape (1, len(prompt)).
    input_eval = tf.expand_dims(train_dataset_gen.convert_text(start_string), 0)

    text_generated = []

    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    while max_chars is None or len(text_generated) < max_chars:
        # Drop the batch dimension: (timesteps, vocab_size).
        predictions = tf.squeeze(model(input_eval), 0)
        # Only the last timestep predicts the next character.
        scaled_logits = predictions[-1:] / temperature
        predicted_id = tf.random.categorical(scaled_logits, num_samples=1)[0, 0].numpy()

        # Feed the sampled character back in; the model keeps its own state.
        input_eval = tf.expand_dims([predicted_id], 0)
        next_char = train_dataset_gen.itos[predicted_id]
        text_generated.append(next_char)
        if next_char in stop_chars:
            break

    return start_string + ''.join(text_generated)

In [86]:
#!g1.1
# Inference copy of the network: same architecture, but built for batch size 1
# so a single prompt can be fed through it.
model_lstm_pred = build_model_lstm(len(vocab), embedding_dim, rnn_units, 1)

# Restore the trained weights; expect_partial() silences warnings about an
# incomplete checkpoint restore.
model_lstm_pred.load_weights("bot_lstm_weights").expect_partial()

# Sample one sentence continuing the prompt.
text_ = generate_text2(model_lstm_pred, u"It's not Lupus!")
print(text_)

It's not Lupus! Well, if you want to get the lique in front of the TV?


In [83]:
#!g1.1


In [None]:
#!g1.1
