In [1]:
from flask import Flask, render_template, jsonify, request
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import os

import html
import urllib.request
import re
import spacy

In [2]:
with urllib.request.urlopen('https://gist.githubusercontent.com/erickedji/68802/raw/7264f2d232702b4013490a0b2f9286cfa1b817e3/quotes.txt') as f:
    lines = [s.decode('utf-8')  for s in f.readlines()]

In [36]:
with open("raw.txt", "r") as f:
    text = f.readlines()

['"Do-so" is more important than "say-so". â€” Pete Seeger\n',
 '"Dreamt" is the only English word that ends in the letters "mt".\n',
 '"For example" is not proof. â€” Yiddish proverb\n',
 '"I really don\'t believe that, but I was raised that way." â€” Carolyn Myss\n',
 '"Keep the tourists out..." As fellow tourists we heartily agree. â€” Edward Abbey\n',
 '"Know thyself"? If I knew myself, I\'d run away. â€” Johann von Goethe\n',
 '"My country right or wrong" is like saying, "My mother drunk or sober."\n',
 '"Things" are thieves of time.\n',
 '...context... â€” Manny Farber\n',
 '1. Thought is creative. 2. Fear attracts like energy. 3. Love is all there is.\n',
 'A "jiffy" is an actual unit of time for 1/100th of a second.\n',
 "A baby is God's opinion that the world should go on. â€” Carl Sandburg\n",
 'A beggar can never be bankrupt. â€” John Ray\n',
 'A billion hours ago man had not yet walked on earth.\n',
 'A bird in the hand is worth two in the bush.\n',
 'A boss says, "Go!" A l

In [37]:
quotes = []
for q in text:
    parts = q.split("â€")
    if len(parts) == 2:
        quotes.append(parts[0].strip())
    

In [38]:
quotes_clean = []
for q in quotes:
    if q[0] == '"':
        q = q[1:]
    if q[-1] == '"':
        q =q[:-1]
    quotes_clean.append(q)

In [39]:
len(quotes_clean)

3055

In [40]:
quotes_clean

['Do-so" is more important than "say-so".',
 'For example" is not proof.',
 "I really don't believe that, but I was raised that way.",
 'Keep the tourists out..." As fellow tourists we heartily agree.',
 'Know thyself"? If I knew myself, I\'d run away.',
 '...context...',
 "A baby is God's opinion that the world should go on.",
 'A beggar can never be bankrupt.',
 'A boss says, "Go!" A leader says, "Let\'s go!',
 'A boy gets to be a man when a man is needed.',
 "A business without a path to profit isn't a business, it's a hobby.",
 'A city is a large community where people are lonesome together.',
 'A civilized society makes swords into decoration.',
 'A collection of facts not necessarily science.',
 'A compliment is something like a kiss through a veil.',
 'A confusion of the real with the ideal never goes unpunished.',
 'A continuing flow of paper is sufficient to continue the flow of paper.',
 "A critic is a man who knows the way but can't drive the car.",
 'A critic is to an autho

In [3]:
html_clean = " ".join(html.split("\n"))
paragraphs = re.findall(r'<p>(.*?)</p>', html_clean)
corpus = []
for p in paragraphs:
    if "—" not in p:
        quote = "'".join(p.split("&rsquo;"))
        corpus.append(quote)

In [120]:
with urllib.request.urlopen('https://www.gutenberg.org/files/11/11-h/11-h.htm') as f:
    htm = f.read().decode('utf-8')
htm_clean = " ".join(htm.split("\n"))
paragraphs = re.findall(r'<p>(.*?)</p>', htm_clean)

In [121]:
pars = []
for p in paragraphs:
    pars.append(html.unescape(p.replace("\r","").strip()))
text = " ".join(pars)

In [122]:
text = text.replace("<i>","")
text = text.replace("</i>", "")

In [123]:
nlp = spacy.load("en_core_web_sm")

In [124]:
from spacy.pipeline import Sentencizer

senticer = Sentencizer(punct_chars=["."])
nlp.add_pipe(senticer, before = "parser")

doc = nlp(text)

In [104]:
def set_custom_boundaries(doc):
    for i, token in enumerate(doc):
          
        if token.text in ("!"):
            doc[i].is_sent_end = False
            
        if token.text in ("’s", "'s"):
            doc[i].is_sent_start = False
        elif token.text in ("“", "‘") and i < len(doc) - 1:
            # opening quote
            doc[i+1].is_sent_start = False
        elif token.text in ("”", "’"):
            # closing quote
            doc[i].is_sent_start = False
    return doc

In [125]:
corpus = [s.text.strip() for s in doc.sents]

In [41]:
corpus = quotes_clean

In [42]:
with open("corpus.txt", "w") as f:
    for p in corpus:
        f.writelines(p + "\n")

In [45]:
with open("corpus.txt", "r") as f:
        corpus= [s.strip() for s in f.readlines()]

In [46]:
corpus

['Do-so" is more important than "say-so".',
 'For example" is not proof.',
 "I really don't believe that, but I was raised that way.",
 'Keep the tourists out..." As fellow tourists we heartily agree.',
 'Know thyself"? If I knew myself, I\'d run away.',
 '...context...',
 "A baby is God's opinion that the world should go on.",
 'A beggar can never be bankrupt.',
 'A boss says, "Go!" A leader says, "Let\'s go!',
 'A boy gets to be a man when a man is needed.',
 "A business without a path to profit isn't a business, it's a hobby.",
 'A city is a large community where people are lonesome together.',
 'A civilized society makes swords into decoration.',
 'A collection of facts not necessarily science.',
 'A compliment is something like a kiss through a veil.',
 'A confusion of the real with the ideal never goes unpunished.',
 'A continuing flow of paper is sufficient to continue the flow of paper.',
 "A critic is a man who knows the way but can't drive the car.",
 'A critic is to an autho

In [47]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus).todense()

In [48]:
def similCompute(X, y):
    simil = []
    for i in range(X.shape[0]):
        row = X[i, :]
        if np.linalg.norm(y) == 0:
            return np.random.randint(X.shape[0])
        sim = np.dot(row, y.T)/(np.linalg.norm(row) * np.linalg.norm(y))
        simil.append(sim)
    return np.argmax(simil)
    

In [49]:
text ="I lost my happyness"
y = vectorizer.transform([text]).todense()
print(np.linalg.norm(y))
idx = similCompute(X, y)
quote = corpus[idx]
print(quote)
X.shape

1.0
He who hesitates is lost.


(3055, 4372)

In [6]:
app = Flask(__name__)

In [7]:
@app.route('/')
def index():
    return render_template("index.html")

@app.route('/compute', methods=['POST'])
def parse_text():
    text = request.form.get("text")
    if len(text) > 0:
        y = vectorizer.transform([text]).todense()
        idx = similCompute(X, y)
        quote = corpus[idx]
    else:
        quote = ""
    obj = [{"quote": quote}]
    return jsonify(obj)

In [8]:
port = int(os.environ.get('PORT', 5000))
if __name__ == "__main__":
    #app.run(host='0.0.0.0', port = port)
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on


 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [11]:
import sklearn

In [12]:
sklearn.__version__

'0.23.1'