In [None]:
from __future__ import print_function, division
import sys
sys.path.append('/home/ego/Github/david/')

import os
from os.path import exists, join, isfile

import dataset
import pandas as pd

In [233]:
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
from dis import dis


class Substitution:
    """Decorator that %-formats a function's docstring with stored values.

    Build an instance from either positional values or keyword values
    (never both at once) and apply it to a function: the function's
    ``__doc__`` is replaced by ``__doc__ % params``. A docstring that is
    missing or empty (for example when the interpreter runs with -OO) is
    left as it is.

    Keyword form:

    >>> sub_author_name = Substitution(author='Jason')
    >>> @sub_author_name
    ... def some_function(x):
    ...     "%(author)s wrote this function"
    >>> some_function.__doc__
    'Jason wrote this function'

    Positional form:

    >>> sub_first_last_names = Substitution('Edgar Allan', 'Poe')
    >>> @sub_first_last_names
    ... def some_function(x):
    ...     "%s %s wrote the Raven"
    >>> some_function.__doc__
    'Edgar Allan Poe wrote the Raven'
    """

    def __init__(self, *args, **kwargs):
        both_given = bool(args) and bool(kwargs)
        if both_given:
            raise AssertionError("Only positional or keyword args are allowed")
        # Keep the tuple when positional values were supplied, otherwise the
        # (possibly empty) keyword dict.
        self.params = args if args else kwargs

    def __call__(self, func: Callable) -> Callable:
        """Substitute the stored params into ``func.__doc__`` and return func."""
        template = func.__doc__
        # A falsy docstring (None or "") is written back unchanged.
        func.__doc__ = template % self.params if template else template
        return func

    def update(self, *args, **kwargs) -> None:
        """Merge the supplied values into the stored params.

        Only meaningful when the instance was built from keyword values,
        i.e. when ``self.params`` is a dict.
        """
        stored = self.params
        stored.update(*args, **kwargs)

In [234]:
sub_author_name = Substitution(author='Carlos', age=27, specs='Information Systems')
sub_author_name.params

{'author': 'Carlos', 'age': 27, 'specs': 'Information Systems'}

In [235]:
dis(Substitution)

Disassembly of __call__:
 37           0 LOAD_FAST                1 (func)
              2 LOAD_ATTR                0 (__doc__)
              4 JUMP_IF_FALSE_OR_POP    16
              6 LOAD_FAST                1 (func)
              8 LOAD_ATTR                0 (__doc__)
             10 LOAD_FAST                0 (self)
             12 LOAD_ATTR                1 (params)
             14 BINARY_MODULO
        >>   16 LOAD_FAST                1 (func)
             18 STORE_ATTR               0 (__doc__)

 38          20 LOAD_FAST                1 (func)
             22 RETURN_VALUE

Disassembly of __init__:
 31           0 LOAD_FAST                1 (args)
              2 POP_JUMP_IF_FALSE       16
              4 LOAD_FAST                2 (kwargs)
              6 POP_JUMP_IF_FALSE       16

 32           8 LOAD_GLOBAL              0 (AssertionError)
             10 LOAD_CONST               1 ('Only positional or keyword args are allowed')
             12 CALL_FUNCTION            1
  

In [None]:
# Apply the keyword-based Substitution instance to the docstring of `func`;
# the "%(author)s" placeholder is filled from sub_author_name.params.
@sub_author_name
def func(x):
    "%(author)s function"
# Display the docstring after the decorator has run.
func.__doc__

In [1]:
import numpy as np

# Build 100 entries keyed "col_00" ... "col_99", each holding 30000 random
# integers drawn from [0, 1000).
# NOTE(review): no random seed is set in this cell, so the values change on
# every run.
data = {
    "col_{0:02d}".format(i): np.random.randint(0, high=1000, size=30000)
    for i in range(100)
}
data

{'col_00': array([521, 667, 262, ..., 348, 604, 130]),
 'col_01': array([339, 565, 369, ..., 752, 952, 131]),
 'col_02': array([925, 530, 519, ..., 896, 420, 634]),
 'col_03': array([838, 225, 547, ..., 747, 456, 347]),
 'col_04': array([386, 778, 747, ..., 971, 726, 930]),
 'col_05': array([ 20, 395, 361, ...,  32, 221, 529]),
 'col_06': array([372, 977, 409, ..., 125, 454, 358]),
 'col_07': array([551, 607, 553, ...,  92, 267, 728]),
 'col_08': array([154, 395, 513, ..., 739, 975, 966]),
 'col_09': array([729, 556, 217, ..., 704, 394,  51]),
 'col_10': array([158, 219, 329, ..., 557,  49, 972]),
 'col_11': array([876, 666, 468, ...,  60, 522, 237]),
 'col_12': array([688, 709, 978, ..., 481, 145, 899]),
 'col_13': array([958, 848, 129, ..., 144, 366, 868]),
 'col_14': array([941,  86, 113, ..., 968, 986,  83]),
 'col_15': array([581, 321, 444, ..., 323, 763, 222]),
 'col_16': array([770, 152, 184, ..., 627, 429,  76]),
 'col_17': array([358, 542, 256, ..., 511, 120,  30]),
 'col_18':

In [None]:
class DFClassGenerator:
    '''Generate the source code of a ``pd.DataFrame`` subclass from a frame.

    The generated class gets one upper-cased class attribute per column
    (holding the column label) and a ``_constructor`` property that returns
    the generated class itself.

    Example usage of a class generated under the name ``CommentsFrame``::

        comment = CommentsFrame(data=dict(
            author=['carlos', 'chucho'],
            cid=[122, 177],
            text=['this is carlos', 'chucho here'],
            time=['10:30', '19:40']))
        type(comment)
    '''
    CLASS_HEADER = 'class {class_name}(pd.DataFrame):'
    # Class-body lines are pre-indented with four spaces inside the template.
    COLUMNS = '    {var} = "{label}"'

    CONSTRUCTOR = ("    @property\n"
                   "    def _constructor(self):\n"
                   "        return {class_name}")

    @classmethod
    def generate_class(cls, df, class_name):
        """Build, print and return the source of the generated class.

        Parameters
        ----------
        df : object with a ``columns`` attribute
            Each column label must be a string (``.upper()`` is called on
            it), i.e. a single-level column index.
        class_name : str
            Name of the class to generate.

        Returns
        -------
        str
            The generated source code. It is also printed, as before.
        """
        header = cls.CLASS_HEADER.format(class_name=class_name)
        columns = [cls.COLUMNS.format(var=c.upper(), label=c)
                   for c in df.columns]
        constructor = cls.CONSTRUCTOR.format(class_name=class_name)
        source_code = '\n'.join([header] + columns) + '\n\n' + constructor
        print(source_code)
        # Returning the text lets callers assign or exec it instead of
        # having to capture stdout.
        return source_code
#source_code = DFClassGenerator.generate_class(metric, 'CommentsFrame')
#print(source_code)

In [10]:
import pandas as pd

class DavidFrame(pd.DataFrame):
    """DataFrame subclass that traces pandas' subclassing hooks.

    ``_constructor`` and ``__finalize__`` print a line every time pandas
    calls them, which makes it visible when results are re-wrapped in the
    subclass and when metadata is propagated.
    """

    # Names of custom attributes that pandas should carry over to results.
    # This has to be a class-level list of attribute names; assigning a dict
    # to it per instance replaces pandas' own bookkeeping list.
    _metadata = ['test']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Default value of the custom metadata attribute.
        self.test = 'TEST'

    def to_textfile(self, fn, text_col='text'):
        """Write the non-empty values of ``text_col`` to ``fn``, one per line.

        Missing values (None/NaN) are skipped instead of raising on
        ``len()``; other values are written using their ``str()`` form.
        The file is written as UTF-8 and closed by the ``with`` block.
        """
        with open(fn, 'w', encoding='utf-8') as f:
            for value in self[text_col].dropna().tolist():
                line = str(value)
                if line:
                    f.write('%s\n' % line)

    @property
    def _constructor(self):
        """Class pandas uses to wrap results of operations on this frame."""
        print("< _constructor > :: called")
        return DavidFrame

    def __finalize__(self, other, method=None, **kwargs):
        '''Propagate the attributes named in ``_metadata`` from other to self.

        Attributes that ``other`` does not have are set to None.
        '''
        print("< __finalize__ > :: called")
        for name in self._metadata:
            object.__setattr__(self, name, getattr(other, name, None))
        return self

    def clone_instance_obj(self):
        '''Return a deep copy of this frame as a DavidFrame.

        ``DataFrame.copy`` goes through ``_constructor`` and
        ``__finalize__``, so the subclass and its metadata are preserved
        without touching pandas' private block manager.
        '''
        return self.copy()

In [11]:
def builder(func, *pargs, **kwards):
    """Announce ``func`` by name, call it with the given arguments and
    return whatever it returns."""
    print('calling: {}'.format(func.__name__))
    outcome = func(*pargs, **kwards)
    return outcome

In [13]:
# NOTE(review): TextMetrics is defined in a later cell of this notebook, so
# this cell depends on that cell having been executed first.
metric = TextMetrics('downloads/4Dk3jOSbz_0.json')
# Rebuild the loaded data as a DavidFrame by going through a plain dict.
david = DavidFrame(metric.to_dict(orient='dict'))

In [14]:
david_copy = david.copy()
david_copy

__finalize__ called
_constructor called
__finalize__ called
_constructor called
__finalize__ called
__finalize__ called
_constructor called
__finalize__ called
_constructor called
__finalize__ called
__finalize__ called


Unnamed: 0,author,cid,text,time
0,Kevin Collins,UgxTICnj2z9wfoG1IYl4AaABAg,2016 Election put business person in govermen...,2 months ago
1,Big AL,Ugz-mRikNSCJVUzjhNd4AaABAg,I was so proud of my country the land of the A...,3 months ago
2,Stephen Patrick,UgxiD8K6JRL3asYUVat4AaABAg,From some news blasts I've seen some of Donald...,3 months ago
3,Terry Clark,Ugw4z1qX__lmGTzkmpJ4AaABAg,Impeachment cannot come soon enough for the Li...,3 months ago (edited)
4,Adeline Yee,UgwT_9FSnmq4neWV6KJ4AaABAg,What has become of America the great! You have...,3 months ago
5,Zappa Woman,Ugy3d1aEab9lzQO5ohR4AaABAg,This reminds me of Animal Farm and how the rul...,3 months ago
6,ShadowFoxSF,UgyqXuY2YMlW3Psl8nV4AaABAg,Lies on top of obstruction on top of lies...\n...,3 months ago
7,Andy C,Ugy3wTkH6AeM22tafWR4AaABAg,Someone needs to make a meme with Sarah's face...,3 months ago
8,Lorenzo,UgyJR0ptpaeIZK8fIaN4AaABAg,If some people want something it does not mean...,3 months ago
9,Don't SSleep,UgxTxzTPRBSFOflSGat4AaABAg,"David, you're a bit left of where I'm at, but ...",3 months ago


In [18]:
import pandas as pd

class DavidDataFrameBase(pd.DataFrame):
    """Base DataFrame subclass carrying two class-level placeholders.

    Parameters
    ----------
    data_structure : optional
        Anything ``pd.DataFrame`` accepts as its ``data`` argument.
    *args, **kwargs
        Forwarded unchanged to ``pd.DataFrame`` (``index``, ``columns``,
        ``dtype`` ...), so the usual constructor options remain available.
    """
    # Placeholders; nothing in this class reads or writes them.
    RECIPES = None
    SESSIONS = None

    def __init__(self, data_structure=None, *args, **kwargs):
        super().__init__(data_structure, *args, **kwargs)
        
class JsonDataFrame(DavidDataFrameBase):
    """Frame populated from a line-delimited JSON file.

    The file at ``corpus_path`` is read with ``pd.read_json`` (UTF-8,
    ``lines=True``) and the path is remembered on the instance.
    """
    CORPUS_PATH = None

    def __init__(self, corpus_path):
        records = pd.read_json(corpus_path, encoding='utf-8', lines=True)
        super().__init__(records)
        # Shadows the class-level default with the path actually loaded.
        self.CORPUS_PATH = corpus_path

    def get_corpus_path(self):
        """Return the path this frame was loaded from."""
        loaded_from = self.CORPUS_PATH
        return loaded_from
    
class TextMetrics(JsonDataFrame):
    """JsonDataFrame that additionally exposes a fixed set of sentiment labels."""

    # A tuple literal, so no runtime type check is needed at class-definition
    # time (the former isinstance guard could never fire).
    SENTI_LABELS = ('positive', 'negative', 'neutral')

    def __init__(self, corpus_path: str):
        """Load the line-delimited JSON file at ``corpus_path``."""
        super().__init__(corpus_path)

    def get_labels(self):
        """Return the tuple of sentiment labels."""
        return self.SENTI_LABELS

In [19]:
tm = TextMetrics('downloads/BmYZH7xt8sU.json')
tm

Unnamed: 0,author,cid,text,time
0,PNW Ryan,UgzaG3oJa98fF6qi32h4AaABAg,I always forget to like the vid :( remind us!!!:P,25 minutes ago
1,Brian McDonald,Ugwgnpcp2e1D3mWMX2p4AaABAg,Looked like there was oil coming down left sid...,2 hours ago
2,Lewis Harvey,UgzpJXjwsyxTe1CLe4R4AaABAg,i like videos if its something really really f...,5 hours ago
3,skip rose,UgwBlsLv64UrNOcONz54AaABAg,Pos cable to small of guage causing currant su...,12 hours ago
4,Price Check On VagiClean,UgzjoHUlBJ8PDM_I3Gx4AaABAg,GT3RS: One of the best track cars ever made. Y...,12 hours ago
5,KgreProductions,UgwuiJw9Ib_5cmuV2sV4AaABAg,I try and like every video I watch but sometim...,12 hours ago
6,Vaughnny McGuire,Ugwr2AqW7W25MSimF5h4AaABAg,I literally like every video right after I cli...,18 hours ago
7,Gavin Banton,UgyENK3lNOcDbfNpMRl4AaABAg,I like for the mustache Adam gotta love it 😂😂,20 hours ago
8,Mike Scott,UgyCYyRJEIcU6NNK7rx4AaABAg,I like videos of content creators I truly supp...,1 day ago
9,rdub202,UgwB0Lo4qgBOw83utpJ4AaABAg,There are certain channels I follow that get a...,1 day ago


In [20]:
# from TextMetrics class
tm.get_labels()

('positive', 'negative', 'neutral')

In [159]:
import jsonlines
from pandas import DataFrame
from pandas import RangeIndex

CommentColumns = ['author', 'cid', 'text']

def load_jsonline(file_path: str, columns=None):
    """Read a JSON-lines file into a list of rows.

    Parameters
    ----------
    file_path : str
        Path of the JSON-lines file to read.
    columns : iterable of str, optional
        Keys to extract from every record, in order. Defaults to the
        module-level ``CommentColumns`` so the rows always line up with the
        column names used to build the DataFrame.

    Returns
    -------
    list of list
        One inner list per record, holding the values of ``columns``.

    Raises
    ------
    KeyError
        If a record lacks one of the requested keys.
    """
    keys = list(CommentColumns if columns is None else columns)
    with jsonlines.open(file_path) as reader:
        return [[obj[key] for key in keys] for obj in reader]

class CommentsDataFrameBase(DataFrame):
    """Thin DataFrame subclass; construction is delegated unchanged to
    ``DataFrame``."""

    def __init__(self, *args, **kwargs):
        super(CommentsDataFrameBase, self).__init__(*args, **kwargs)

In [160]:
# Load the raw [author, cid, text] rows from the JSON-lines file.
jsonfile = load_jsonline('downloads/BmYZH7xt8sU.json')
# NOTE(review): `dataset` is also the name of the module imported in the
# first cell; this assignment shadows that module for the rest of the session.
dataset = CommentsDataFrameBase(jsonfile, columns=CommentColumns)
dataset.sample(5)

Unnamed: 0,author,cid,text
827,YzFDyL,UgxhLKgBTdZQm1Stcuh4AaABAg,Did you sell your house?
1152,KYLOW RBAUTO,Ugx-rrPOraaFrVhYeTJ4AaABAg,Love the content beats most uk car content cre...
3870,Rampant Racing,UgxzfYreYEiJUpbY_nB4AaABAg,I hit like when I go holy shit that’s awesome
951,SN95 Mustang Garage,UgzjuCUU1I45UIIwKmZ4AaABAg,"I’ve noticed the videos are the same, blown mo..."
2686,Peter Palmer,UgzFIWtFAJCnWoyNRsB4AaABAg,"I usually never comment, but I usually like vi..."


In [162]:
jsonfile[2]

['Lewis Harvey',
 'UgzpJXjwsyxTe1CLe4R4AaABAg',
 'i like videos if its something really really fuckin cool']

In [163]:
dataset.shape

(4252, 3)

In [164]:
dataset.dropna(axis=0, how='any', inplace=True)
dataset.shape

(4252, 3)

In [165]:
dataset.index = RangeIndex(len(dataset.index))
dataset.head()

Unnamed: 0,author,cid,text
0,PNW Ryan,UgzaG3oJa98fF6qi32h4AaABAg,I always forget to like the vid :( remind us!!!:P
1,Brian McDonald,Ugwgnpcp2e1D3mWMX2p4AaABAg,Looked like there was oil coming down left sid...
2,Lewis Harvey,UgzpJXjwsyxTe1CLe4R4AaABAg,i like videos if its something really really f...
3,skip rose,UgwBlsLv64UrNOcONz54AaABAg,Pos cable to small of guage causing currant su...
4,Price Check On VagiClean,UgzjoHUlBJ8PDM_I3Gx4AaABAg,GT3RS: One of the best track cars ever made. Y...


In [253]:
class DavidPandasBase(object):
    """Mixin with text-oriented helpers for DataFrame subclasses.

    Every method relies on ``self`` behaving like a ``pandas.DataFrame``
    (column indexing, ``to_dict``, ``isnull``), so the class is meant to be
    combined with ``DataFrame`` through multiple inheritance.
    """
    # Placeholder; not read or written by any method of this mixin.
    Dataset = None

    def to_textfile(self, fn: str, text_col='text'):
        """Write the non-empty values of ``text_col`` to ``fn``, one per line.

        Missing values (None/NaN) are skipped instead of raising on
        ``len()``; other values are written using their ``str()`` form.
        The file is written as UTF-8 and closed by the ``with`` block.
        """
        with open(fn, 'w', encoding='utf-8') as f:
            for value in self[text_col].dropna().tolist():
                line = str(value)
                if line:
                    f.write('%s\n' % line)

    @property
    def obj_to_dict(self):
        """The frame as ``{index_label: {column: value}}``."""
        return self.to_dict(orient='index')

    @property
    def missing_values(self):
        """Number of null values per column."""
        return self.isnull().sum()

    def normalize_whitespaces(self, text_col='text'):
        """Strip leading/trailing whitespace of ``text_col`` in place."""
        self[text_col] = self[text_col].str.strip()

    def slice_dataframe(self, by_minvalue=0, of_col='stringLength'):
        """Return the rows whose ``of_col`` value exceeds ``by_minvalue``.

        Raises
        ------
        ValueError
            If ``by_minvalue`` is not greater than zero. The default of 0
            therefore always raises: a threshold must be passed explicitly.
        """
        if by_minvalue <= 0:
            raise ValueError('You must pass a value greater than zero!')
        return self[self[of_col] > int(by_minvalue)]
    
    
class DavidDataFrame(DavidPandasBase, DataFrame):
    """DataFrame combined with the DavidPandasBase helper methods."""

    def __init__(self, *args, **kwargs):
        super(DavidDataFrame, self).__init__(*args, **kwargs)

    @property
    def all_columns(self):
        """The column labels of the frame.

        Exposed as a property rather than stored in ``__init__``: the stored
        version captured the row index instead of the columns, went stale as
        the frame changed, and assigning a list-like attribute on a
        DataFrame makes pandas emit a UserWarning.
        """
        return self.columns

In [254]:
# NOTE(review): `dataset` is rebound here to the plain list returned by
# load_jsonline; an earlier cell binds the same name to a DataFrame and the
# first cell imports a module called `dataset`.
dataset = load_jsonline('downloads/BmYZH7xt8sU.json')
# Wrap the raw rows in the DataFrame subclass that carries the helper mixin.
david = DavidDataFrame(dataset, columns=CommentColumns)
david.sample(5)

Unnamed: 0,author,cid,text
2336,Tom Filsell,UgyWp1L6Ps3DyVQYL3Z4AaABAg,I don't like videos because I don't like that ...
3298,ziyaad Esau,Ugz93jNo-dIB8j9SDQh4AaABAg,I just like every video especially ones with T...
220,Gartz Films,UgwauT-BRYWrMNj9v1h4AaABAg,Ive never really liked a lot of videos and I t...
498,Robert Grootz,Ugxky1sq6wqANGDKY4N4AaABAg,I like videos that make me wanna be a part of ...
2133,That_person,Ugz0IuXq0rzLNcvTBIB4AaABAg,I use the like button when I see content of yo...


In [255]:
david.missing_values

author    0
cid       0
text      0
dtype: int64

In [262]:
david_obj = david.obj_to_dict
david_obj[5]

{'author': 'KgreProductions',
 'cid': 'UgwuiJw9Ib_5cmuV2sV4AaABAg',
 'text': 'I try and like every video I watch but sometimes I forget... remind us plz'}

In [263]:
david_obj[5]['text']

'I try and like every video I watch but sometimes I forget... remind us plz'

In [307]:
david.index

RangeIndex(start=0, stop=4252, step=1)

In [186]:
DEFAULT_GEO_COLUMN_NAME = 'geometry'

class GeoPandasBase(object):
    """Mixin holding an index slot and a flag telling whether it was built."""
    Index = None
    IndexGenerated = False

    def invalid_dates_index(self):
        """Reset both index fields to their defaults on this instance.

        Per the original note, this marks the spatial index as needing a
        rebuild the next time it is requested.
        """
        self.Index, self.IndexGenerated = None, False

class GeoDataFrame(GeoPandasBase, DataFrame):
    """DataFrame subclass accepting the extra ``crs`` and ``geometry`` keywords.

    Both keywords are removed from ``kwargs`` before the remaining arguments
    are handed to ``DataFrame``.
    """
    # NOTE(review): presumably modelled on pandas' `_internal_names`; under
    # this name it is just a plain class attribute -- confirm the intent.
    InternalNames = [
        '_data',
        '_cacher',
        '_item_cache',
        '_cache',
        'is_copy',
        '_subtyp',
        '_index',
        '_default_kind',
        '_default_fill_value',
        'Metadata',
        '__array_struct__',
        '__array_interface__',
    ]

    Metadata = ['crs', 'GeometryColumnName']
    GeometryColumnName = DEFAULT_GEO_COLUMN_NAME

    def __init__(self, *args, **kwargs):
        # Pull out the keywords DataFrame itself does not understand.
        geometry = kwargs.pop('geometry', None)
        crs = kwargs.pop('crs', None)

        super().__init__(*args, **kwargs)

        self.crs = crs
        # NOTE(review): set_geometry is not defined in the visible code, so a
        # non-None geometry would need it to be provided elsewhere.
        if geometry is not None:
            self.set_geometry(geometry, inplace=True)
        # Reset the index fields inherited from GeoPandasBase to defaults.
        self.invalid_dates_index()

In [187]:
geo = GeoDataFrame(geometry=None)

In [188]:
geo.__getstate__

<bound method GeoDataFrame.__getstate__ of Empty GeoDataFrame
Columns: []
Index: []>

In [189]:
geo.invalid_dates_index()

In [195]:
# NOTE(review): these lines create attributes named `sIndex` and
# `sIndexGenerated`; the class-level defaults declared on GeoPandasBase are
# named `Index` and `IndexGenerated`, so those are left untouched here.
geo.sIndex = False # default None
geo.sIndexGenerated = True # default False

In [196]:
geo.sIndexGenerated

True

In [197]:
geo.sIndex

False

In [274]:
def arguments(*args, **kwargs):
    """Print ``crs or geometry`` when extra arguments remain, else ``None``.

    ``crs`` and ``geometry`` are popped from ``kwargs`` first. If any
    positional or other keyword arguments are left over, the first truthy of
    ``crs`` / ``geometry`` is printed; otherwise ``None`` is printed.
    """
    crs = kwargs.pop('crs', None)
    geometry = kwargs.pop('geometry', None)
    # Initialised up front so the print below cannot hit an unbound local
    # when nothing but crs/geometry was passed.
    final = None
    if args or kwargs:
        final = crs or geometry
    print(final)

In [290]:
kw = dict(geometry=None, geo=True, circle=3.14)
kw

{'geometry': None, 'geo': True, 'circle': 3.14}

In [292]:
kw.pop('geometry', None)

In [293]:
kw

{'geo': True, 'circle': 3.14}

In [294]:
Metadata = ['david', 'pandasDF']
# NOTE(review): getattr(k, None) passes None as the attribute *name* of the
# string k, which is what raises the TypeError shown in the output. The object
# whose attributes were meant to be read is not visible here -- presumably
# the intent was getattr(<obj>, k, None); confirm.
meta = dict((k, getattr(k, None)) for k in Metadata)
meta

TypeError: getattr(): attribute name must be string

In [200]:
# NOTE(review): Frob is defined in the cell that follows this one, so this
# cell depends on out-of-order execution.
f = Frob()
f.bamf = "bamf"
f.bamf

['crs', 'GeometryColumnName']

In [201]:
class Frob:
    """Demo of ``__setattr__``: values are upper-cased on assignment."""

    def __setattr__(self, name, value):
        """Store ``value`` under ``name``, upper-cased when it supports it.

        Values exposing a callable ``upper`` (str, bytes, ...) are stored in
        upper case; anything else (numbers, None, lists ...) is stored as it
        is instead of raising AttributeError.
        """
        to_upper = getattr(value, 'upper', None)
        if callable(to_upper):
            value = to_upper()
        # Write straight into __dict__ so this hook does not call itself.
        self.__dict__[name] = value

In [202]:
f = Frob()
f

<__main__.Frob at 0x7f32b47b6710>

In [203]:
f.elem_a = 'elem_a'
f.elem_a

'ELEM_A'

In [206]:
class Hoo(object):
    """Demo of the pickling hooks ``__getstate__`` and ``__setstate__``.

    Both hooks announce themselves with a print and rescale ``val`` so that
    their effect is visible: the value is doubled when the state is taken
    and tripled when a state is restored.
    """

    def __init__(self, val=2):
        self.val = val

    def __getstate__(self):
        """Double ``val`` and hand out the instance ``__dict__`` itself."""
        print("I'm being pickled")
        self.val = self.val * 2
        return self.__dict__

    def __setstate__(self, d):
        """Adopt ``d`` as the instance ``__dict__`` and triple ``val``."""
        print("I'm being unpickled with these values:", d)
        # d is used directly (not copied), so the caller's dict is updated too.
        self.__dict__ = d
        self.val = self.val * 3

In [207]:
h = Hoo()

In [209]:
h.val * 9

18

In [210]:
hval = h.val * 2
hval

4

In [219]:
class Celsius:
    """Temperature in degrees Celsius with a validated ``temperature`` property.

    Reads and writes of ``temperature`` go through ``get_temperature`` and
    ``set_temperature``, which print a trace line; the value itself is kept
    in ``_temperature``.
    """

    def __init__(self, temperature=0):
        # Goes through the property, so the initial value is validated too.
        self.temperature = temperature

    def get_temperature(self):
        """Return the stored temperature, announcing the access."""
        print("Getting value")
        return self._temperature

    def set_temperature(self, value):
        """Store ``value``; raise ValueError when it is below -273."""
        too_cold = value < -273
        if too_cold:
            raise ValueError("Temperature below -273 is not possible")
        print("Setting value")
        self._temperature = value

    def to_fahrenheit(self):
        """Return the temperature converted to degrees Fahrenheit."""
        celsius = self.temperature
        return (celsius * 1.8) + 32

    temperature = property(get_temperature, set_temperature)

In [220]:
c = Celsius()

Setting value


In [227]:
c.temperature

Getting value


0

In [228]:
c.temperature = 37

Setting value


In [229]:
c.to_fahrenheit()

Getting value


98.60000000000001

In [230]:
c._temperature

37