In [1]:
import os
from os import environ
from os import path, makedirs
from os.path import join, exists
from typing import List, Sequence, Collection, Mapping, AnyStr, Iterable
from functools import namedtuple

import dataset
import records

In [2]:
# Canned aggregate queries over the `comments` table, keyed by a short name.
Uniques = dict(
    videos='SELECT DISTINCT video_id FROM comments;',
    comments='SELECT video_id, COUNT(*) c FROM comments GROUP BY video_id;',
)

# Text fragments used to fill the condition slot of a statement template.
# `rm_condition` is the placeholder exactly as it appears in the template
# (leading space included); the other entries are its possible replacements.
condition = dict(
    rm_condition=' {condition}',
    is_row=' is {row}',
    between_ab=' between {row_a} and {row_b}',
)

# Statement template: select `text` from a table, filtered by an optional
# condition on a column and a LIKE pattern on the text itself.
SimilarTextsInColumn = ' '.join([
    "SELECT text FROM {table}",
    "WHERE {column} {condition}",
    "AND text LIKE '{text}'",
])

In [3]:
def pointer(name: str, params: dict):
    '''Expose a dict through attribute access.

    Creates a namedtuple type called `name` whose fields are the keys
    of `params` (in dict order) and returns one instance of it holding
    the corresponding values.
    '''
    field_names = params.keys()
    record_type = namedtuple(name, field_names)
    return record_type._make(params.values())

def prep_search(text: str, joinby='_'):
    '''Turn free text into a `%...%` search pattern.

    Runs of whitespace in `text` are collapsed, the remaining single
    spaces are replaced with `joinby`, and the result is wrapped in
    `%` on both sides.
    '''
    collapsed = ' '.join(text.split())
    joined = collapsed.replace(' ', joinby)
    return '%' + joined + '%'

def format_condition(template: str,
                     clause: list = None,
                     condition=condition):
    '''Fill (or remove) the condition placeholder of a SQL template.

    Parameters:
    ----------

    `template`: (type=str)
        A SQL statement-like pattern containing the placeholder
        stored under `condition['rm_condition']`.

    `clause` : (types=List[str|int] or NoneType)
        Empty/None, a single value, or a pair of values:

        * empty or None -> the placeholder is removed.
        * `[a]`         -> the placeholder becomes ' is a'.
        * `[a, b]`      -> the placeholder becomes ' between a and b',
          e.g. clause=[1, 10] yields ' between 1 and 10'.

    `condition`: (type=dict)
        Mapping with the keys 'rm_condition', 'is_row' and
        'between_ab'; defaults to the module-level `condition`.

    Returns:
    -------
    The template string with the placeholder replaced.

    Raises:
    ------
    ValueError: if `clause` holds more than two items.
    '''
    Condition = pointer('ConditionClause', condition)
    if not clause:
        return template.replace(Condition.rm_condition, '')

    if len(clause) == 1:
        con = Condition.is_row.format(row=clause[0])
    elif len(clause) == 2:
        con = Condition.between_ab.format(row_a=clause[0],
                                          row_b=clause[1])
    else:
        # Previously this fell through with `con` unassigned and surfaced
        # as an UnboundLocalError; fail with an explicit message instead.
        raise ValueError(
            'clause must contain one or two items, got %d' % len(clause))
    return template.replace(Condition.rm_condition, con)

def get_similartexts(table: str,
                     column: str,
                     text: str,
                     clause: List[str] = None
                     ) -> str:
    '''Build the SQL statement that selects texts matching a pattern.

    Parameters:
    ----------

    `table`, `column`: (type=str)
        Identifiers inserted verbatim into the statement.
        NOTE(review): they are not escaped, so they must come from
        trusted code, never from user input.

    `text`: (type=str)
        Free text turned into a LIKE pattern by `prep_search`.

    `clause`: (types=list, scalar or NoneType)
        Condition values for `column`; a truthy non-list value is
        wrapped in a one-item list. See `format_condition`.

    Returns:
    -------
    The formatted SQL statement as a string.

    NOTE(review): with no clause the statement reads
    `WHERE <column> AND text LIKE ...`, i.e. the bare column is used
    as the condition - confirm that this is intended.
    '''
    if clause and not isinstance(clause, list):
        clause = [clause]
    similar = format_condition(SimilarTextsInColumn, clause=clause)
    # The pattern lands inside a single-quoted SQL string literal, so an
    # apostrophe in the search text (e.g. "don't") would end the literal
    # early. SQL escapes a quote inside a literal by doubling it.
    pattern = prep_search(text).replace("'", "''")
    return similar.format(table=table, column=column, text=pattern)

In [8]:
class SqlCommentsServer(object):
    '''Query helper around a SQL database holding a comments table.

    Opens the same database URL through both `records` (used for the
    similar-text search) and `dataset` (used for table metadata and
    the canned queries in `qunique`).

    NOTE(review): the original description called this a "ZNQ sqlite
    server" (presumably ZMQ). `bind_addr` is stored on the instance
    but nothing in this class uses it.
    '''
    # Canned statements from `Uniques`, reachable as attributes
    # (`qunique.videos`, `qunique.comments`).
    qunique = pointer('UniqueQueryStatements', Uniques)

    def __init__(self, sqlpath=None, table_name='comments', bind_addr='tcp://127.0.0.1:5000'):
        '''Connect to the database and cache basic table metadata.

        Parameters:
        ----------

        `sqlpath`: (type=str or NoneType)
            Database URL. When None, a sqlite URL is built from the
            `DAVID_COMMENTS_DB` environment variable.

        `table_name`: (type=str)
            Table that the instance queries.

        `bind_addr`: (type=str)
            Stored as `_bind_addr`; not used within this class.

        Raises:
        ------
        ValueError: if `sqlpath` is None and `DAVID_COMMENTS_DB` is
            unset or empty.
        '''
        if sqlpath is None:
            db_file = environ.get('DAVID_COMMENTS_DB')
            if not db_file:
                # Without this check the URL silently became
                # 'sqlite:///None' and pointed at a file named "None".
                raise ValueError(
                    'sqlpath was not given and the DAVID_COMMENTS_DB '
                    'environment variable is not set')
            sqlpath = 'sqlite:///{}'.format(db_file)

        self._sqlpath = sqlpath
        self._table_name = table_name
        self._bind_addr = bind_addr

        self._records = records.Database(self._sqlpath)
        self.db = dataset.connect(self._sqlpath)
        self.tables = self.db.tables
        self.columns = self.db[self._table_name].columns
        # Row count is read once at construction; it is not refreshed later.
        self.num_rows = len(self.db[self._table_name])

    def to_textfile(self, fname: str, docs: list, dirpath='output'):
        '''Write each item of `docs` on its own line to `dirpath/fname`.

        The directory is created when missing and the file is
        overwritten when it already exists.
        '''
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        makedirs(dirpath, exist_ok=True)
        # The context manager closes the file; no explicit close() is needed.
        with open(join(dirpath, fname), 'w', encoding='utf-8') as f:
            for doc in docs:
                f.write('%s\n' % doc)

    def print_unique_videoids(self):
        '''Print every distinct `video_id` found in the comments table.'''
        for i in self.db.query(self.qunique.videos):
            print(i['video_id'])

    def print_comments_by_videoid(self):
        '''Print the comment count per video, then the cached total row count.'''
        for num, i in enumerate(self.db.query(self.qunique.comments)):
            print(f"{num}: {i['video_id']} -> {i['c']}")
        print(f'\ntotal comments in database: {self.num_rows}')

    def similar_texts(self,
                      text: Sequence[str],
                      clause: List[str] = None,
                      as_list: bool = False,
                      column: AnyStr = 'id') -> Iterable[Collection]:
        '''
        Get the documents whose text matches a given sequence.

        NOTE: The table searched is the instance's `table_name`
        (default 'comments').

        Parameters:
        ----------

        `text`: (type=str)
            A string sequence or keywords used to build the LIKE
            pattern that matching rows must satisfy.

        `clause`: (types=list or NoneType)
            Optional one- or two-item condition on `column`
            (see `format_condition`).

        `as_list`: (type=bool)
            When True, return a list of whitespace-normalised
            strings instead of the raw query result.

        `column`: (type=str)
            Column that `clause` applies to.

        '''
        similar_docs = self._records.query(get_similartexts(
            table=self._table_name, column=column,
            text=text, clause=clause
        ))
        if as_list:
            return [' '.join(doc.text.split()) for doc in similar_docs]
        return similar_docs

    def __repr__(self):
        return ("SqlCommentsServer(\n"
                f" tables : {self.tables}\n"
                f" total columns : {self.columns}\n"
                f" total comments : {self.num_rows}\n"
                ")")

In [9]:
# Open the local SQLite file through the helper class; leaving `sql` as the
# last expression displays the instance's __repr__ summary.
local_url = 'sqlite:///yt_comments.db'
sql = SqlCommentsServer(sqlpath=local_url)
sql

SqlCommentsServer(
 tables : ['comments']
 total columns : ['id', 'cid', 'text', 'time', 'author', 'video_id']
 total comments : 252848
)

In [10]:
# Search the `id` range 1..num_rows for texts matching the pattern.
docs = sql.similar_texts('make %a% %video', clause=[1, sql.num_rows])

# Materialise the result once so the count and the printed rows come from
# the same pass over the query result.
similar_docs = list(docs)

# A bare f-string in the middle of a cell is evaluated and discarded;
# print it so the count actually appears in the output.
print(f'number of similar documents: {len(similar_docs)}\n')
for doc in similar_docs:
    print(' '.join(doc.text.split()))

number of similar documents: 323

Finally make it work in less than 40 min, thanks for a great video of introduction to Tensorflow
It's not suitable for beginner. I suggest you to make a video to teach step by step.
Its too fast to understand everything. Can you please make a video on Tensorflow in detail.
What a looser , Makes it seem easy in video description , and then Siraj talks ancient language!
Make a video on CNNs, multi-parametric tests and how to generate a lip-synced video like these famous ones where they impersonate Obama
Love you siraj rival ........ Really I was very beginner ..... Your video really make me a master for machine learning you are really great..... and your videos are awesome ...... No one can beat you ....... My best wishes for your channel and upcoming videos#LoveSirajRival
Sir pls make a video on telework
Hey Siraj, nice video! Could you by any chance make a video for installingTensorflow on windows? I've been having trouble with the pip installation, it

In [13]:
# One way of combining the two instance methods is to nest them:
# call similar_texts with as_list=True and pass its output straight
# to to_textfile, which writes one matching text per line.

sql.to_textfile(fname='make_a_video.txt', docs=sql.similar_texts(
    text='make %a% %video', clause=[1, sql.num_rows], as_list=True))

# spaCy

> TODO: write a function that splits each document into sentences, since some documents contain long paragraphs.

### Helpers

* **paths** :

    - `Jupyter-Notebooks/Natural-Language-Processing/Elmo/elmo_contextual_embeddings.ipynb`
    
    - `Jupyter-Notebooks/David/02_pipe/chunking-%26-testing-spacy-language-detection.ipynb`

    - `Jupyter-Notebooks/David/01_pipe/pipe.ipynb`
    
    - `Jupyter-Notebooks/vuepoint_brainstorm/Fast-text-queries-using-SQL.ipynb`

In [None]:
def split_sentences(text, nlp):
    '''Split `text` into sentence strings using a spaCy pipeline.

    Parameters:
    ----------

    `text`: (type=str)
        The document to split.

    `nlp`: (type=callable)
        A loaded spaCy pipeline (or any callable returning an object
        with a `sents` iterable). It is passed in explicitly because
        no pipeline is created anywhere in this notebook.

    Returns:
    -------
    A list of stripped sentence strings. Sentences whose length is
    one or less (as reported by `len`) are skipped.
    '''
    doc = nlp(text)
    # `.text` is used instead of `.string`, which was removed in spaCy 3;
    # after strip() both give the same result.
    return [sent.text.strip() for sent in doc.sents if len(sent) > 1]