In [1]:
import sqltables

In [2]:
# https://stackoverflow.com/questions/64793902/python-merge-several-columns-of-a-dataframe-without-having-duplicates-of-data
# Sample data, one list per column; index i of every list describes the same person.
Name = ['Lolo', 'Mike', 'Tobias', 'Luke', 'Sam']
Age = [19, 34, 13, 45, 52]
# The three Info_* columns overlap and may hold empty strings.
Info_1 = ['Tall', 'Large', 'Small', 'Small', '']
Info_2 = ['New York', 'Paris', 'Lisbon', '', 'Berlin']
Info_3 = ['Tall', 'Paris', 'Hi', 'Small', 'Thanks']
Data = [123, 268, 76, 909, 87]
Sex = ['F', 'M', 'M', 'M', 'M']

# Column name -> column values; insertion order is the column order used when loading.
columns = dict(
    Name=Name,
    Age=Age,
    Info_1=Info_1,
    Info_2=Info_2,
    Info_3=Info_3,
    Data=Data,
    Sex=Sex,
)

In [3]:
# Fresh database for this example.
db = sqltables.Database()
# `columns` is column-oriented; zip(*...) transposes it into one tuple per row.
tab = db.load_values(
    zip(*columns.values()),
    column_names=columns.keys(),
)
tab

|Name|Age|Info\_1|Info\_2|Info\_3|Data|Sex|
|-|-|-|-|-|-|-|
|\'Lolo\'|19|\'Tall\'|\'New York\'|\'Tall\'|123|\'F\'|
|\'Mike\'|34|\'Large\'|\'Paris\'|\'Paris\'|268|\'M\'|
|\'Tobias\'|13|\'Small\'|\'Lisbon\'|\'Hi\'|76|\'M\'|
|\'Luke\'|45|\'Small\'|\'\'|\'Small\'|909|\'M\'|
|\'Sam\'|52|\'\'|\'Berlin\'|\'Thanks\'|87|\'M\'|


In [4]:
# Build one merged "Info" string per Name:
#   1. stack Info_1, Info_2 and Info_3 into a single (Name, Info) relation,
#   2. drop duplicate (Name, Info) pairs,
#   3. join the non-empty values of each Name with a space.
info_tab = (
    tab
    .view("select Name, Info_1 as Info from _ union select Name, Info_2 as Info from _ union select Name, Info_3 as Info from _")
    .view("select distinct * from _")
    .view("select Name, group_concat(Info, ' ') as Info from _ where Info != '' group by Name")
)
# Attach the merged string to the original rows; names with no non-empty Info get ''.
tab.view(
    "select _.*, coalesce(info.Info, '') as Info from _ left join info on _.Name = info.Name",
    bindings={"info": info_tab},
)

|Name|Age|Info\_1|Info\_2|Info\_3|Data|Sex|Info|
|-|-|-|-|-|-|-|-|
|\'Lolo\'|19|\'Tall\'|\'New York\'|\'Tall\'|123|\'F\'|\'New York Tall\'|
|\'Mike\'|34|\'Large\'|\'Paris\'|\'Paris\'|268|\'M\'|\'Large Paris\'|
|\'Tobias\'|13|\'Small\'|\'Lisbon\'|\'Hi\'|76|\'M\'|\'Hi Lisbon Small\'|
|\'Luke\'|45|\'Small\'|\'\'|\'Small\'|909|\'M\'|\'Small\'|
|\'Sam\'|52|\'\'|\'Berlin\'|\'Thanks\'|87|\'M\'|\'Berlin Thanks\'|


In [5]:
# https://stackoverflow.com/questions/64793653/how-to-convert-a-dataframe-to-ndarray-of-0s-and-1s
example = """
col_1 col_2
a     4
a     3
b     2
c     2
d     1
b     4
c     1
"""
# Split every non-empty line on whitespace; the first such line is the header
# and is discarded, the rest become `data` (all values stay strings).
_header, *data = (row.split() for row in example.split("\n") if row)
# New database for the crosstab example; both columns are loaded as the
# strings produced by str.split() above.
db = sqltables.Database()
tab = db.load_values(
    data,
    column_names=["col_1", "col_2"],
)
tab

|col\_1|col\_2|
|-|-|
|\'a\'|\'4\'|
|\'a\'|\'3\'|
|\'b\'|\'2\'|
|\'c\'|\'2\'|
|\'d\'|\'1\'|
|\'b\'|\'4\'|
|\'c\'|\'1\'|


In [6]:
# Every (col_1, col_2) combination of the distinct values seen in the data.
crosstab = tab.view(
    "select * from (select distinct col_1 from _), (select distinct col_2 from _)"
)
# Left-join the data onto the full grid and count matches, so combinations
# that never occur show up with a count of 0.
counts = tab.view(
    "\nselect \n"
    "  crosstab.col_1, crosstab.col_2, count(_.col_1) as count \n"
    "from crosstab left join _ using (col_1, col_2) group by crosstab.col_1, crosstab.col_2\n",
    bindings={"crosstab": crosstab},
)
counts

|col\_1|col\_2|count|
|-|-|-|
|\'a\'|\'1\'|0|
|\'a\'|\'2\'|0|
|\'a\'|\'3\'|1|
|\'a\'|\'4\'|1|
|\'b\'|\'1\'|0|
|\'b\'|\'2\'|1|
|\'b\'|\'3\'|0|
|\'b\'|\'4\'|1|
|\'c\'|\'1\'|1|
|\'c\'|\'2\'|1|
|\'c\'|\'3\'|0|
|\'c\'|\'4\'|0|
|\'d\'|\'1\'|1|
|\'d\'|\'2\'|0|
|\'d\'|\'3\'|0|
|\'d\'|\'4\'|0|


In [7]:
import json

class Json_group_array:
    """Aggregate that gathers the values it is fed and emits them as a JSON array.

    Provides the ``step`` / ``finalize`` pair so an instance can accumulate
    one group of values and then serialize them.
    """

    def __init__(self):
        # Values received so far, in arrival order.
        self.array = list()

    def step(self, x):
        """Record one more value for the current group."""
        self.array += [x]

    def finalize(self):
        """Return the collected values serialized as a JSON array string."""
        encoded = json.dumps(self.array)
        return encoded
    
# Register the class above as a one-argument SQL aggregate named "json_group_array".
# NOTE(review): this reaches into the private `_conn` attribute of the sqltables
# database (presumably the underlying sqlite3 connection) — confirm there is no public hook.
db._conn.create_aggregate("json_group_array", 1, Json_group_array)

In [8]:
# One JSON array of counts per col_1 group, decoded into a nested Python list.
# NOTE(review): `order by` sorts the grouped result rows; the order of values
# inside each array depends on the order rows reach the aggregate — confirm it is stable.
[
    json.loads(encoded)
    for (encoded,) in counts.view("select json_group_array(count) from _ group by col_1 order by col_1, col_2")
]

[[0, 0, 1, 1], [0, 1, 0, 1], [1, 1, 0, 0], [1, 0, 0, 0]]

In [9]:
# https://stackoverflow.com/questions/64793652/count-top-most-frequent-phrases-in-a-text-column-in-pandas
import re
example = """
Andy | max min | tea | pal
no limit | toy 2011 | hess | mix
Andy | Andy | toy 2011| pal
"""
db = sqltables.Database()
# Drop the empty first/last fragments produced by the surrounding newlines and
# load each remaining line as a one-column row.
tab = db.load_values(
    ([line] for line in example.split("\n")[1:-1]),
    column_names=["text"],
)
tab

|text|
|-|
|\'Andy \| max min \| tea \| pal\'|
|\'no limit \| toy 2011 \| hess \| mix\'|
|\'Andy \| Andy \| toy 2011\| pal\'|


In [10]:
# Explode every text row into its "|"-separated phrases (whitespace around the
# separator is consumed by the pattern), one phrase per row.
tab2 = db.load_values(
    ([phrase] for [text] in tab for phrase in re.split(r"\s*\|\s*", text)),
    column_names=["text1"],
)
# Three most frequent phrases.
tab2.view("select text1, count(*) as count from _ group by text1 order by count desc limit 3")

|text1|count|
|-|-|
|\'Andy\'|3|
|\'toy 2011\'|2|
|\'pal\'|2|


In [11]:
# https://stackoverflow.com/questions/64809403/change-the-value-of-a-column-based-on-finding-characters-in-another-column-with
example = """
City - Country
Saddle(Canada) - Other
Dublin - Other
Detroit - USA
Vancouver - Canada
NYC: US - Other
"""
# Strip the empty fragments around the literal, then peel the header line
# off the data lines.
header, *data = example.split("\n")[1:-1]
# Header and records use the same " - " separator.
column_names = header.split(" - ")
rows = [record.split(" - ") for record in data]
import re
import sqlite3

# Print Python tracebacks raised inside user-defined SQL functions.
sqlite3.enable_callback_tracebacks(True)


def sql_regexp(pattern, text):
    """SQL-callable regular-expression test: ``regexp(pattern, text)``.

    Returns True/False depending on whether ``pattern`` matches anywhere in
    ``text`` (``re.search`` semantics). If either argument is SQL NULL
    (``None``) the result is NULL instead of raising ``TypeError``, so the
    function is safe on rows with missing patterns or values.
    """
    if pattern is None or text is None:
        return None
    return bool(re.search(pattern, text))


db = sqltables.Database()
# NOTE(review): uses the private `_conn` attribute to reach the connection's
# `create_function` — confirm there is no public hook.
db._conn.create_function("regexp", 2, sql_regexp)
tab = db.load_values(rows, column_names=column_names)
tab

|City|Country|
|-|-|
|\'Saddle\(Canada\)\'|\'Other\'|
|\'Dublin\'|\'Other\'|
|\'Detroit\'|\'USA\'|
|\'Vancouver\'|\'Canada\'|
|\'NYC\: US\'|\'Other\'|


In [12]:
# Sanity check of the user-defined `regexp` SQL function (called as
# regexp(pattern, text)): flags the cities whose name contains "US".
tab.view("select City, regexp('US', City) from _")

|City|regexp\(\'US\'\, City\)|
|-|-|
|\'Saddle\(Canada\)\'|0|
|\'Dublin\'|0|
|\'Detroit\'|0|
|\'Vancouver\'|0|
|\'NYC\: US\'|1|


In [13]:
# Override rules, one per row: (priority, regex, Country).
# The last rule has no regex and no Country (both NULL).
overrides = db.load_values(
    [[0, 'Canada', 'Canada'], [1, 'US', 'USA'], [2, None, None]],
    column_names=["priority", "regex", "Country"],
)
overrides

|priority|regex|Country|
|-|-|-|
|0|\'Canada\'|\'Canada\'|
|1|\'US\'|\'USA\'|
|2|None|None|


In [None]:
# Pair every city with each override rule that applies (regex matches the city,
# or the rule has no regex), keep the lowest priority per city, and take that
# rule's Country, falling back to the city's own Country when the rule's is NULL.
tab.view(
    "select min(priority), City, coalesce(overrides.Country, _.Country) as Country "
    "from _, overrides "
    "where (overrides.regex is not null and regexp(overrides.regex, _.City)) or overrides.regex is null "
    "group by City order by priority",
    bindings={"overrides": overrides},
)

In [None]:
# Row count cast to float, next to a constant string column `s`.
# NOTE(review): `squared` is not defined in any cell visible here — this cell
# fails on a fresh kernel unless it is created elsewhere; confirm or remove.
squared.view("select cast(count(*) as float), 'foo' as s from _")

In [None]:
# Cross join of the table with itself (second copy aliased `_1`), with one
# column renamed to the quoted identifier "*a*".
# NOTE(review): `squared` is not defined in any cell visible here — confirm.
squared.view('select _.a as "*a*", * from _,_ as _1')

In [None]:
# For every `a` >= 2, print the sum of `a2` over all rows with a smaller or
# equal `a`. The first query uses a positional parameter, the second a named one.
# NOTE(review): `squared` is not defined in any cell visible here — confirm.
tab = squared.table("select a from _ where a >= ?", [2])
for [n] in tab:
    # Bound to `total` rather than `sum` so the builtin `sum` is not shadowed
    # for the rest of the kernel session.
    [[total]] = squared.table("select sum(a2) from _ where a <= :n", {"n": n})
    print(f"{n}: {total}")

In [None]:
# Iterate over `tab` again and print each row as returned by the table iterator.
for r in tab:
    print(r)

In [None]:
import random

# Dedicated, seeded generator so that Restart & Run All reproduces the same
# five values instead of a different table on every run.
rng = random.Random(0)

# Five rows with a single integer column `u` drawn uniformly from 0..8.
rand_tab = db.load_values([[rng.randint(0, 8)] for _ in range(5)], column_names=["u"])
rand_tab

In [None]:
# Join against the random table: keep the pairs where `a` is below the random `u`.
# NOTE(review): `squared` is not defined in any cell visible here — confirm.
squared.view("select * from _, rand where a < u order by u", bindings={"rand": rand_tab})

In [None]:
# Collect the `statement` attribute of every entry in the database's private
# `_active_iterators` collection.
# NOTE(review): looks like a debugging aid relying on sqltables internals — confirm it should stay.
[x.statement for x in db._active_iterators]

In [None]:
# Display the table.
# NOTE(review): `squared` is not defined in any cell visible here — confirm.
squared

In [None]:
# Open the on-disk SQLite file and create `Table` objects named "properties" and "sales" on it.
# NOTE(review): `Table` is not imported in any cell visible here, and `sqlite3`
# is only imported by an earlier example cell — confirm both are in scope on a fresh kernel.
properties_db = sqlite3.connect("properties.sqlite3")
properties = Table(name="properties", db=properties_db)
sales = Table(name="sales", db=properties_db)

In [None]:
# Show the result of `describe()` for both tables side by side as a tuple.
sales.describe(), properties.describe()

In [None]:
# Join every sale to its property on the shared `bbr` column; `properties` is
# made available to the query under the name `p`.
# NOTE(review): other cells pass extra tables via `bindings={...}`; confirm this
# `view` accepts them as keyword arguments.
property_sales = sales.view("select * from _ join p on _.bbr = p.bbr", p=properties)

In [None]:
def _sql_pow(base, exponent):
    """SQL-callable power function: ``pow(base, exponent)``.

    Returns ``base ** exponent``. Yields NULL (``None``) when either argument
    is NULL, and also when the result is not a real number (a negative base
    with a fractional exponent produces a Python ``complex``, which sqlite3
    cannot hand back to SQL).
    """
    if base is None or exponent is None:
        return None
    result = base ** exponent
    return None if isinstance(result, complex) else result


# Register the two-argument `pow` on the connection behind `property_sales`.
# NOTE(review): assumes `property_sales.db` is the sqlite3 connection — confirm.
property_sales.db.create_function("pow", 2, _sql_pow)

In [None]:
def compute_stats(property_sales):
    """Compute per-city sale-price statistics.

    Parameters
    ----------
    property_sales
        Table-like object exposing ``view(sql)`` whose rows provide ``city``
        and ``salePrice`` columns. A two-argument SQL function ``pow`` must be
        available on its connection.

    Returns
    -------
    A view with one row per city and the columns ``city``, ``mean`` (average
    sale price), ``std`` (population standard deviation) and ``rho``
    (``std / mean``).
    """
    # The variance is computed as E[x^2] - E[x]^2. Floating-point rounding can
    # make that difference slightly negative when all prices in a city are
    # (nearly) equal, and a negative base under pow(..., 0.5) is not a real
    # number; the scalar max(..., 0.0) clamps it before the square root.
    stats1 = property_sales.view(
        "select city,avg(salePrice) as mean,"
        "pow(max(avg(pow(salePrice,2.0))-pow(avg(salePrice),2.0), 0.0), 0.5) as std "
        "from _ group by city"
    )
    stats2 = stats1.view("select *, std/mean as rho from _")
    return stats2

In [None]:
# Quick check of compute_stats on just the first 10 joined rows, materialized as a list.
list(compute_stats(property_sales.view("select * from _ limit 10")))

In [None]:
# Distinct item type names as a flat list (each result row has exactly one column).
item_types = [
    type_name
    for (type_name,) in properties.view("select distinct itemTypeName from _")
]

In [None]:
# Display the distinct item type names collected above.
item_types

In [None]:
def _sql_string_literal(value):
    """Render ``value`` as a single-quoted SQL string literal.

    Embedded single quotes are doubled, so a value containing an apostrophe
    cannot terminate the literal early and break (or alter) the query.
    """
    return "'" + str(value).replace("'", "''") + "'"


# Per-city statistics computed separately for every item type.
# The type name comes from the data, so it is quoted before being placed in the SQL text.
item_stats = {
    k: compute_stats(property_sales.view(f"select * from _ where itemTypeName = {_sql_string_literal(k)}"))
    for k in item_types
}

In [None]:
# Column listing of the statistics view for the "Villa" item type.
# Raises KeyError if "Villa" is not among `item_types`.
item_stats["Villa"].columns()

In [None]:
# Values of the statistics view for the "Villa" item type.
# Raises KeyError if "Villa" is not among `item_types`.
item_stats["Villa"].values()