## Scrape episode links from the wiki's episode list

In [1]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Episode-list page of the wiki; get_links() scans it for links.
url = 'http://rickandmorty.wikia.com/wiki/List_of_episodes'

def get_links(url):
    """Return absolute URLs for every titled link found inside a table on ``url``.

    Parameters
    ----------
    url : str
        Page to download and scan.

    Returns
    -------
    list of str
        One entry per ``<a>`` tag that sits inside a ``<table>`` and carries
        both an ``href`` and a ``title`` attribute. Duplicates are kept.
        An empty list is returned when the response status is not 200, so
        the result can always be iterated or passed to ``set()``.
    """
    resp = requests.get(url)
    if resp.status_code != 200:
        return []
    soup = BeautifulSoup(resp.text, "lxml")
    # urljoin resolves site-relative hrefs ("/wiki/...") against the page URL
    # and leaves already-absolute hrefs intact, instead of blindly prefixing
    # the host onto whatever the href contains.
    return [
        urljoin(url, anchor['href'])
        for table in soup.find_all('table')
        for anchor in table.find_all('a', href=True)
        if anchor.has_attr('title')
    ]
    
# NOTE: despite the singular name, `link` holds the whole collection returned
# by get_links for the page, not a single URL.
link = get_links(url)

## Extract episode transcripts using links

In [2]:
import re

def get_transcript(url, character):
    """Return the transcript lines that start with ``character``.

    Downloads ``url + '/Transcript'`` and inspects only the text of the first
    ``<p>`` element on that page.

    Parameters
    ----------
    url : str
        Page URL without the ``/Transcript`` suffix.
    character : str
        Literal prefix a line must start with (typically a speaker name).
        The empty string selects every line.

    Returns
    -------
    list of str
        Matching lines with non-breaking spaces replaced by ordinary spaces.
        Empty when the response status is not 200 or the page has no ``<p>``.
    """
    resp = requests.get(url + '/Transcript')
    if resp.status_code != 200:
        return []
    first_paragraph = BeautifulSoup(resp.text, "lxml").find('p')
    if first_paragraph is None:
        return []
    # A plain prefix test treats the name literally, so names containing regex
    # metacharacters such as "(" or "+" match as written, and no pattern has
    # to be rebuilt for every line.
    return [
        line.replace(u'\xa0', ' ')
        for line in first_paragraph.text.split('\n')
        if line.startswith(character)
    ]

# Spot-check on a single page: lines starting with 'Rick' from the first collected link.
transcript = get_transcript(link[0], 'Rick')

In [3]:
def compile_transcripts(character,
                        url='http://rickandmorty.wikia.com/wiki/List_of_episodes'):
    """Collect ``character``'s lines from every page linked on ``url``.

    Parameters
    ----------
    character : str
        Passed through to ``get_transcript`` to select lines.
    url : str, optional
        Index page handed to ``get_links``; defaults to the wiki's episode list.

    Returns
    -------
    list of str
        Lines from all linked pages concatenated in the order the links were
        first seen. Empty when no links could be collected.
    """
    # Tolerate a None result from get_links (failed request) as "no links".
    links = get_links(url) or []

    quote_list = []
    visited = set()
    for link in links:
        # Skip repeats but keep first-seen order, so the output is the same
        # from run to run (iterating a set of strings is not).
        if link in visited:
            continue
        visited.add(link)
        quote_list.extend(get_transcript(link, character))
    return quote_list

# Collect lines starting with 'Rick' from every linked page.
quotes = compile_transcripts('Rick')

## Parsing

In [53]:
def parse_quotes(quotes):
    """Strip the speaker prefix and asterisk-delimited actions from each line.

    Parameters
    ----------
    quotes : iterable of str
        Raw transcript lines, expected in the form ``"<character>: <text>"``.

    Returns
    -------
    list of str
        One cleaned string per input line, in the same order. Case is left
        untouched. A line without a ``': '`` separator becomes ``''``.
    """
    def _clean(raw):
        # Keep only what follows the first ': ' (the speaker prefix is dropped).
        spoken = raw.partition(': ')[2]
        # Splitting on '*' alternates outside/inside-action segments; the
        # even-indexed ones are the text outside the asterisks.
        outside_actions = ''.join(spoken.split('*')[::2])
        # Drop ellipsis characters and any embedded newlines.
        return outside_actions.replace('…', '').replace('\n', '')

    return [_clean(raw) for raw in quotes]

# Cleaned text for the lines gathered in `quotes`.
n_quotes = parse_quotes(quotes)

In [79]:
# An empty prefix matches every transcript line, so this rebuilds `quotes` and
# `n_quotes` for all speakers, replacing the values set by earlier cells.
quotes = compile_transcripts('')
n_quotes = parse_quotes(quotes)

In [80]:
# Total word count across the lines currently held in n_quotes
# (each line is split on single spaces).
sum(len(quote.split(' ')) for quote in n_quotes)

12587

## Building a Markov Chain

In [87]:
import markovify

# Markov chain with state_size=1, built from the cleaned lines (one per line).
text_model = markovify.NewlineText('\n'.join(n_quotes), state_size=1)

for _ in range(4):
    # make_short_sentence returns None when it cannot produce an acceptable
    # sentence of at most 140 characters within its retry budget; skip those
    # attempts instead of indexing into None.
    tweet = text_model.make_short_sentence(140)
    if not tweet:
        continue
    # Capitalise only the first character; the rest of the sentence is kept as generated.
    print(tweet[0].upper() + tweet[1:])

Okay, I'll talk about your family, you blame him? Come on, Terry, there is that song!
What, I'm a new experiences.
Whoa, whoa, whoa! What's wrong?
No. D-Do we have you. Those are you, too, very --


In [57]:
# Save lines to text for later usage.
# NOTE(review): the file name is fixed to 'morty.txt' regardless of which
# character's lines n_quotes currently holds — confirm before re-running.

# Explicit UTF-8 so non-ASCII transcript characters are written the same way
# on every platform, instead of depending on the locale's default encoding.
with open('morty.txt', 'w', encoding='utf-8') as f:
    for line in n_quotes:
        f.write(line + '\n')