In [15]:
import os
import sys
import toml
import json
import millify
import gspread
import requests
import calendar
import plotly.express as px
import numpy as np
import pandas as pd
import polars as pl
import datetime as dt
import streamlit as st
from pathlib import Path
from millify import prettify
from lxml.html import fromstring
from streamlit_gsheets import GSheetsConnection
from google.oauth2.service_account import Credentials
from oauth2client.service_account import ServiceAccountCredentials
import scipy

sys.path.insert(0, str(Path(os.getcwd()).parent))

from src import utils, schemas, chart_functions as chart



In [17]:
def load_environment_variables(secret_path):
    """Parse a TOML secrets file and return its contents.

    Args:
        secret_path: Location of the TOML file; handed to ``toml.load`` as-is.

    Returns:
        The value produced by ``toml.load`` for that file.
    """
    parsed_secrets = toml.load(secret_path)
    return parsed_secrets

def authenticate(secrets, scope, workbook_name):
    """Authorise against Google Sheets and open a workbook by name.

    Args:
        secrets: Parsed secrets mapping. The service-account key is read from
            ``secrets['connections']['gsheets']``.
        scope: OAuth scopes forwarded to ``ServiceAccountCredentials``.
        workbook_name: Name of the spreadsheet passed to ``client.open``.

    Returns:
        The workbook object returned by ``client.open(workbook_name)``.
    """
    # Read from the `secrets` argument. The previous version silently used the
    # notebook-level `environment` global and ignored what the caller passed.
    gsheets_section = secrets['connections']['gsheets']
    # The section's repr is rewritten into JSON text and parsed back to a dict.
    # NOTE(review): this round-trip breaks if any value contains a quote
    # character; passing the mapping directly may be simpler - confirm.
    keyfile_text = str(gsheets_section).replace("'", '"').replace('\r\n', '\\r\\n')
    keyfile_dict = json.loads(keyfile_text)
    credentials = ServiceAccountCredentials.from_json_keyfile_dict(keyfile_dict, scopes=scope)
    client = gspread.authorize(credentials)
    return client.open(workbook_name)

def pad_data(data, length):
    """Normalise rows: blank strings become ``None`` and short rows are padded.

    Args:
        data: Iterable of rows, each a list of cell values.
        length: Target row width. Rows shorter than this are right-padded
            with ``None``; rows that are already as long or longer are not
            truncated.

    Returns:
        A new list of new row lists; the input rows are not modified.
    """
    padded_rows = []
    for row in data:
        cleaned = [None if cell == "" else cell for cell in row]
        # A non-positive difference multiplies out to an empty list.
        cleaned.extend([None] * (length - len(row)))
        padded_rows.append(cleaned)
    return padded_rows

def load_data(sheet_name, schema, workbook=None):
    """Load one worksheet into a polars DataFrame.

    The first row of the sheet is treated as the header row and is used only
    to determine the row width; the remaining rows are padded to that width
    and blank cells become ``None``.

    Args:
        sheet_name: Name of the worksheet to read.
        schema: Schema forwarded to ``pl.DataFrame``. ``None`` lets polars
            infer column names and dtypes.
        workbook: Workbook to read from. Defaults to the notebook-level
            ``WORKBOOK`` when omitted.

    Returns:
        A ``pl.DataFrame`` built from the sheet's data rows. An empty sheet
        yields an empty frame with the requested schema.
    """
    if workbook is None:
        workbook = WORKBOOK
    sheet = workbook.worksheet(sheet_name)
    data = sheet.get()
    if not data:
        # No header row to measure, so return an empty frame rather than
        # failing on data[0].
        return pl.DataFrame(schema=schema)
    headers = data[0]
    padded_data = pad_data(data[1:], len(headers))
    # Honour the caller's schema; the previous version always used
    # schemas.main_schema regardless of the argument.
    return pl.DataFrame(padded_data, schema=schema, orient='row', strict=False)

# Read the Streamlit secrets file; the path is relative to the working directory.
environment = load_environment_variables('../.streamlit/secrets.toml')
# OAuth scopes are read from the 'scopes' -> 'scope' entry of the secrets.
scope = environment['scopes']['scope']
# Open the 'NLFB' workbook; later cells pass this handle to utils.load_data.
WORKBOOK = authenticate(environment, scope, 'NLFB')

# Note: this calls utils.load_data (from src), not the load_data defined in this notebook.
main_df = utils.load_data('Main', schema=schemas.get_main_schema(), workbook=WORKBOOK)

In [None]:
# Preview the first rows loaded from the 'Main' sheet.
main_df.head()

Number,ISBN,Month,Year,Title,Score,Author,Publisher,Pages,Author gender,Pub year,Goodreads score,Our score conversion,variance,Debut?,Translated?,Topics
i64,str,str,i64,str,f64,str,str,i64,str,i64,f64,f64,f64,str,str,str
34,"""9780571376483""","""June""",2024,"""Demon Copperhead""",8.192,"""Barbara Kingsolver""","""Faber & Faber""",560,"""Female""",2023,4.5,4.096,-0.404,"""No""","""No""","""Contemporary, Coming of age, A…"
14,"""9781784744649""","""November""",2022,"""Tomorrow, and Tomorrow, and To…",7.84,"""Gabrielle Zevin""","""Vintage Publishing""",416,"""Female""",2022,4.22,3.92,-0.3,"""No""","""No""","""Contemporary, Romance, Coming …"
31,"""9781838930509""","""March""",2024,"""Pachinko""",7.7,"""Min Jin Lee""","""Bloomsbury Publishing PLC""",560,"""Female""",2017,4.33,3.85,-0.48,"""No""","""No""","""Asia, Japan, Historical, Famil…"
22,"""9780008532772""","""July""",2023,"""Yellowface""",7.66,"""Rebecca F. Kuang""","""HarperCollins Publishers""",336,"""Female""",2023,4.04,3.83,-0.21,"""No""","""No""","""Contemporary, Identity theft, …"
30,"""9781529111798""","""February""",2024,"""I who have never known men""",7.51,"""Jacqueline Harpman""","""Vintage Publishing""",208,"""Female""",1997,4.22,3.755,-0.465,"""No""","""Yes""","""Sci-fi, Dystopian, Feminism, F…"


In [47]:

# 'Topics' holds a ", "-separated string per book; split it into a list and
# explode so that each (book, topic) pair gets its own row.
new_main_df = (
    main_df
    .with_columns(pl.col("Topics").str.split(", "))
    .explode("Topics")
    
)

# Count titles per topic, most frequent first. The count column keeps the name "Title".
new_new = new_main_df.group_by(pl.col("Topics")).agg(pl.col("Title").count()).sort(pl.col("Title"), descending=True)

# Bar chart of topic frequency (the y axis is the "Title" count column).
topics_bar = px.bar(new_new, x="Topics", y="Title")

In [37]:
# Display the per-topic title counts, most frequent first.
new_main_df.group_by(pl.col("Topics")).agg(pl.col("Title").count()).sort(pl.col("Title"), descending=True)

Topics,Title
str,u32
"""Romance""",16
"""Contemporary""",13
"""Mystery""",9
"""Historical""",8
"""Thriller""",6
…,…
"""Greek mythology""",1
"""Gothic""",1
"""African american""",1
"""Nature""",1


In [52]:
def get_text_from_html_element(url: str, element_id: str, timeout: float = 10.0) -> str:
    """Fetch a page and return the text content of the element with a given id.

    Args:
        url: Address of the page to download.
        element_id: Value of the ``id`` attribute to look up.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        The element's text content, or ``""`` when the response body is empty
        or no element with that id exists.

    Raises:
        requests.RequestException: If the request itself fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    html_text = response.text
    if not html_text.strip():
        # lxml refuses to parse an empty document; treat it as "not found".
        return ""
    document = fromstring(html_text)
    try:
        element = document.get_element_by_id(element_id)
    except KeyError:
        # get_element_by_id raises KeyError when the id is absent.
        return ""
    return str(element.text_content())

In [55]:
# Meetup group page used for the element-lookup experiments in the cells below.
meetup = "https://www.meetup.com/20-and-30-somethings-book-club-london/"
# elem = "member-count-link"
# NOTE(review): presumably a deliberately non-existent id to exercise the not-found path - confirm.
elem = "made up id"

In [57]:
# Download the meetup page and parse it with lxml; `soup` is an lxml element, reused by the next cell.
response = requests.get(meetup)
soup = fromstring(response.text)

In [62]:
# Look up an element by id on the parsed page; the except branch covers the
# KeyError raised when no element carries that id.
try:
    element = soup.get_element_by_id("member-count-link")
    print(element)
except KeyError:
    print("caught")
finally:
    # Runs whether or not the lookup succeeded.
    print("done")

<Element a at 0x21f76baab70>
done


In [100]:
# Same split-and-explode as the earlier topics cell: one row per (book, topic) pair.
new_main_df = (
    main_df
    .with_columns(pl.col("Topics").str.split(", "))
    .explode("Topics")
    
)

# Number of titles for each (Publisher, Topic) combination, stored as "Count".
# Note: this rebinds `new_new`, which an earlier cell used for per-topic counts.
new_new = new_main_df.group_by([pl.col("Publisher"), pl.col("Topics")]).agg(pl.col("Title").count().alias("Count"))



In [157]:
# Reshape to wide form: one row per topic, one column per publisher; missing combinations become 0.
hm = new_new.pivot(index='Topics', on='Publisher').fill_null(0)

In [167]:
import plotly.express as px

# `hm` carries a 'Topics' label column plus one count column per publisher.
# Only the publisher columns belong in the heatmap matrix and on the x axis;
# previously the string 'Topics' column was passed in with them.
publisher_columns = [name for name in hm.columns if name != 'Topics']

fig = px.imshow(
    hm.select(publisher_columns).to_numpy(),
    # Columns (x) are publishers and rows (y) are topics; the labels were
    # previously swapped, which mislabelled the hover text.
    labels=dict(x="Publisher", y="Topic", color="Count"),
    x=publisher_columns,
    y=hm['Topics'].to_list(),
    color_continuous_scale='RdPu',
    width=1000,
    height=1000,
)
# Publisher names go along the top; axis titles are blanked out.
fig.update_xaxes(side="top", title="")
fig.update_yaxes(title="")
# dtick=1 forces a tick label for every row and column.
fig.update_layout(margin={"t":200,"b":50}, yaxis={"dtick":1},  xaxis={"dtick":1})
fig.layout.coloraxis.showscale = False
fig.update_layout(dragmode='pan')

fig.show()

In [168]:
def funny(arg1):
    """Print ``arg1`` and return ``None``.

    Args:
        arg1: Value to print.

    Returns:
        None.
    """
    # Fixed typo: the body referenced the undefined name `ar1`, so every call
    # raised NameError.
    print(arg1)
    return None

print(type(funny))

<class 'function'>


In [3]:
# Load the 'Data' sheet with no schema; the output below shows generic column_0..column_6 names.
countries_df = utils.load_data('Data', workbook=WORKBOOK, schema=None)

In [4]:
# Preview the lookup sheet; column_0 / column_2 hold the values later used as country name and Alpha3Code.
countries_df.head()

column_0,column_1,column_2,column_3,column_4,column_5,column_6
str,str,str,null,str,null,str
"""Afghanistan""","""AF""","""AFG""",,"""Male""",,"""Yes"""
"""Åland Islands""","""AX""","""ALA""",,"""Female""",,"""No"""
"""Albania""","""AL""","""ALB""",,"""Non binary""",,"""Unknown"""
"""Algeria""","""DZ""","""DZA""",,"""Other""",,
"""American Samoa""","""AS""","""ASM""",,"""Unknown""",,


In [5]:
# Load the 'Authors' sheet with its declared schema.
author_df = utils.load_data('Authors', schema=schemas.author_schema, workbook=WORKBOOK)
# Left join keeps every author; authors whose 'Country of Birth' has no match in column_0 get nulls.
map_df = author_df.join(countries_df, left_on='Country of Birth', right_on='column_0', how='left')


In [18]:
# Count authors per country; column_2 is exposed under the name 'Alpha3Code' in the result.
map_group = map_df.group_by([pl.col('Country of Birth'), pl.col('column_2').alias('Alpha3Code')]).agg(pl.col('Author Name').count().alias('Count'))
# Cast the count column to polars' integer type for Python `int` (shown as i64 in the output).
map_group = map_group.with_columns(
    pl.col('Count').cast(int)
)

In [19]:
# Display the per-country author counts.
map_group

Country of Birth,Alpha3Code,Count
str,str,i64
"""China""","""CHN""",1
"""Colombia""","""COL""",1
"""Bangladesh""","""BGD""",1
"""Belgium""","""BEL""",1
"""Sweden""","""SWE""",1
…,…,…
"""Ireland""","""IRL""",3
"""United Kingdom of Great Britai…","""GBR""",11
"""United States of America""","""USA""",14
"""Korea, Republic of""","""KOR""",1


In [22]:
# World map coloured by number of authors born in each country.
# NOTE(review): assumes the Alpha3Code values are ISO-3 codes that plotly can resolve - confirm.
map_fig = px.choropleth(map_group, locations="Alpha3Code",
                color="Count",
                hover_name="Country of Birth",
                color_continuous_scale=px.colors.sequential.Plasma)

map_fig.show()

In [19]:
import plotly.express as px

# Distribution of page counts across the books in main_df.
# The unused `df = px.data.tips()` sample-data load was removed: it was never
# read and overwrote the global name `df`.
fig = px.histogram(main_df, x="Pages", nbins=10)
fig.show()

In [35]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

#fig = go.Figure(data=[go.Histogram(x=main_df['Pages']), go.Histogram(x=main_df['Score'])])
# Two stacked panels (2 rows x 1 column), one histogram each, so the axes are not shared.
fig = make_subplots(rows=2, cols=1, subplot_titles=("Number of Pages Distribution", "Score Distribution"))
# Top panel: page counts.
fig.add_trace(go.Histogram(x=main_df['Pages'], name="Pages"), row=1, col=1)
# Bottom panel: scores.
fig.add_trace(go.Histogram(x=main_df['Score'], name="Score"), row=2, col=1)
fig.show()

In [4]:
cities = ["Vancouver", "Oslo", "Berlin", "Krakow", "Graz", "Belgrade"]

# The walrus binds each city to `counterexample` as it is tested. all() stops at
# the first falsy result, so in the else branch `counterexample` is the first
# city without an "o".
if all("o" in (counterexample := city) for city in cities):
    print("All city names contain 'o'")
else:
    print(counterexample)

Berlin


In [5]:
# Generator form: each city is tested for "o" individually; all() short-circuits on the first failure.
all("o" in (counterexample := city) for city in cities)

False

In [14]:
# Here `in` applies to the whole list comprehension, so `"o" in [...]` is a single
# bool and all() is handed a non-iterable - hence the TypeError shown below.
all("o" in [counterexample := city for city in cities])

TypeError: 'bool' object is not iterable