# Initialisation

In [1]:
import ibis

uri = "postgresql+psycopg2://postgres:postgres@db.postgres.app.com/postgres"

In [2]:
con = ibis.postgres.connect(url=uri)

In [3]:
con.list_tables()

['accounts', 'country', 'person']

In [4]:
ibis.options

{'bigquery': {'partition_col': 'PARTITIONTIME'},
 'clickhouse': {'temp_db': '__ibis_tmp'},
 'default_backend': None,
 'graphviz_repr': True,
 'impala': {'temp_db': '__ibis_tmp', 'temp_hdfs_path': '/tmp/ibis'},
 'interactive': False,
 'sql': {'default_limit': 10000},
 'verbose': False,
 'verbose_log': None}

In [5]:
ibis.options.interactive = True

In [6]:
con.table("accounts").names?

Object `names` not found.


In [None]:
con.table("accounts").names

In [7]:
import pandas as pd
import numpy as np

In [9]:
from pandas.api.extensions import (register_dataframe_accessor,
                                   register_index_accessor,
                                   register_series_accessor,
                                   register_extension_dtype)

@register_dataframe_accessor("geo")
class GeoAccessor(object):
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    @property
    def center(self):
        # return the geographic center point of this DataFrame
        lat = self._obj.latitude
        lon = self._obj.longitude
        return (float(lon.mean()), float(lat.mean()))

    def plot(self):
        # plot this array's data on a map, e.g., using Cartopy
        pass
    
    
@register_index_accessor("blabla")
class BlaBla(object):
    def __init__(self, pandas_obj):
        self._obj = pandas_obj

    @property
    def center(self):
        # return the geographic center point of this DataFrame
         return 10

    def plot(self):
        # plot this array's data on a map, e.g., using Cartopy
        pass

In [10]:
ds = pd.DataFrame({'longitude': np.linspace(0, 10),
                   'latitude': np.linspace(0, 20)})

In [17]:
ds.longitude.

[0;31mType:[0m        property
[0;31mString form:[0m <property object at 0x7f66e53cb098>
[0;31mDocstring:[0m  
Return Series as ndarray or ndarray-like depending on the dtype.


   We recommend using :attr:`Series.array` or
   :meth:`Series.to_numpy`, depending on whether you need
   a reference to the underlying data or a NumPy array.

Returns
-------
arr : numpy.ndarray or ndarray-like

See Also
--------
Series.array : Reference to the underlying data.
Series.to_numpy : A NumPy array representing the underlying data.

Examples
--------
>>> pd.Series([1, 2, 3]).values
array([1, 2, 3])

>>> pd.Series(list('aabc')).values
array(['a', 'a', 'b', 'c'], dtype=object)

>>> pd.Series(list('aabc')).astype('category').values
[a, a, b, c]
Categories (3, object): [a, b, c]

Timezone aware datetime data is converted to UTC:

>>> pd.Series(pd.date_range('20130101', periods=3,
...                         tz='US/Eastern')).values
array(['2013-01-01T05:00:00.000000000',
       '2013-01-02T05:00:

In [11]:
ds.geo.center

(5.0, 10.0)

In [12]:
ds.index.blabla.center

10

In [None]:
from pandas import DataFrame
from pandas.api.extensions import (reg)

In [None]:
import intake
cat = intake.open_csv("../data/accounts/*.csv", 
                      csv_kwargs={
                          "dtype": {
                              'id': np.int8}})

In [41]:
df = cat.to_dask().compute()

In [42]:
df.memory_usage()

Index     24000000
id         3000000
names     24000000
amount    24000000
dtype: int64

In [28]:
gp = df.groupby('id')

In [31]:
gp.agg(lambda x: len(np.unique(x))).names.max()

2

In [25]:
newIndex = np.arange(df.shape[0])

In [26]:
newIndex.shape

(3000000,)

In [33]:
df.reindex(axis=0, method="pad").tail()

Unnamed: 0,id,names,amount
999995,252,Michael,2157
999996,25,Edith,121
999997,229,Ursula,218
999998,372,Patricia,220
999999,443,Charlie,2321


In [None]:
# %timeit df.names.map(lambda x: len(x)).max()

In [78]:
np.dtype('|S8')

dtype('S8')

In [35]:
dfp = pd.DataFrame(df.values, columns=df.columns)

In [34]:
# dfp.names = dfp.names.astype(np.dtype('|S8'))
dfp.id = dfp.id.astype(np.int32)

In [36]:
dfp.dtypes

id        object
names     object
amount    object
dtype: object

In [89]:
dfp.memory_usage()

Index           80
id        12000000
names     24000000
amount    24000000
dtype: int64

In [43]:
from sqlalchemy import create_engine

df.to_sql("accounts",
          con=create_engine(uri).connect())

ValueError: Table 'accounts' already exists.

In [91]:
%timeit dfp.names.map(lambda x: len(x)).max()

1.16 s ± 8.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [21]:
pd.np

numpy.dtype

In [5]:
ibis.schema?

[0;31mSignature:[0m [0mibis[0m[0;34m.[0m[0mschema[0m[0;34m([0m[0mpairs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mnames[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mtypes[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate and return an Ibis Schema object

Ibis uses its own type aliases that map onto database types. See, for
example, the correspondence between Ibis type names and Impala type names:

Ibis type      Impala Type
~~~~~~~~~      ~~~~~~~~~~~
int8           TINYINT
int16          SMALLINT
int32          INT
int64          BIGINT
float          FLOAT
double         DOUBLE
boolean        BOOLEAN
string         STRING
timestamp      TIMESTAMP
decimal(p, s)  DECIMAL(p,s)
interval(u)    INTERVAL(u)

Parameters
----------
pairs : list of (name, type) tuples
  Mutually exclusive with names/types
names : list of string
  Field names
types : list of string
  Field types

Examples
--------
>>> from ibis import schema
>>> 

In [86]:
accounts = con.table('accounts')

In [89]:
accounts.names?

[0;31mCall signature:[0m  [0maccounts[0m[0;34m.[0m[0mnames[0m[0;34m([0m[0mf[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m            StringColumn
[0;31mString form:[0m    
ref_0
PostgreSQLTable[table]
  name: accounts
  schema:
    index : int64
    id : int32
    names : string
    amount : int64

names = Column[string*] 'names' from table
  ref_0
[0;31mFile:[0m            /opt/conda/lib/python3.7/site-packages/ibis/expr/types.py
[0;31mDocstring:[0m       <no docstring>
[0;31mClass docstring:[0m
Base class for a data generating expression having a fixed and known type,
either a single value (scalar)
[0;31mCall docstring:[0m 
Generic composition function to enable expression pipelining.

Parameters
----------
f : function or (function, arg_name) tuple
  If the expression needs to be passed as anything other than the first
  argument to the function, pass a tuple with the argum

In [15]:
print(ibis.postgres.compile(person.person_first))

SELECT t0.person_first 
FROM person AS t0


# Type

Ibis uses its own type aliases that map onto database types. See, for
example, the correspondence between Ibis type names and Impala type names:

```
Ibis type      Impala Type
~~~~~~~~~      ~~~~~~~~~~~
int8           TINYINT
int16          SMALLINT
int32          INT
int64          BIGINT
float          FLOAT
double         DOUBLE
boolean        BOOLEAN
string         STRING
timestamp      TIMESTAMP
decimal(p, s)  DECIMAL(p,s)
interval(u)    INTERVAL(u)
```