In [1]:
import modin.pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet("tweets-tok.parquet", columns=["tokens"]).join(
    pd.read_parquet("tweets-geo.parquet", columns=["prov_istat_code"])
)


    import ray
    ray.init()

2024-01-22 12:50:00,189	INFO worker.py:1621 -- Started a local Ray instance.


In [3]:
import geopandas as gpd
provinces = gpd.read_file("italy-provinces.geojson")
provinces[provinces["reg_name"].eq("Trentino-Alto Adige/Südtirol")]

Unnamed: 0,prov_name,prov_istat_code_num,prov_acr,reg_name,reg_istat_code,reg_istat_code_num,prov_istat_code,geometry
21,Bolzano/Bozen,21,BZ,Trentino-Alto Adige/Südtirol,4,4,21,"POLYGON ((11.47690 46.36393, 11.47769 46.36325..."
22,Trento,22,TN,Trentino-Alto Adige/Südtirol,4,4,22,"POLYGON ((11.14590 45.71695, 11.14580 45.71538..."


In [4]:
df["prov_istat_code"].eq("021").value_counts()

prov_istat_code
False    5301171
True       18738
Name: count, dtype: int64

In [5]:
df = df.explode("tokens")

In [6]:
# See https://ucrel.lancs.ac.uk/llwizard.html

it_freq = df                                  ["tokens"].value_counts()
st_freq = df[ df["prov_istat_code"].eq("021")]["tokens"].value_counts().reindex(it_freq.index, fill_value=0)
nt_freq = df[~df["prov_istat_code"].eq("021")]["tokens"].value_counts().reindex(it_freq.index, fill_value=0)

# Ok, this is log likelihood and checks out!
log_likelihood = 2 * (
      st_freq * np.log(st_freq / st_freq.sum())
    + nt_freq * np.log(nt_freq / nt_freq.sum())
    - it_freq * np.log(it_freq / it_freq.sum())
)
log_likelihood = log_likelihood.dropna().sort_values(ascending=False)
log_likelihood.name = "log_likelihood"

# Ok, this is percentual difference and checks out!
perc_diff = 100 * (st_freq / st_freq.sum() - nt_freq / nt_freq.sum()) / (nt_freq / nt_freq.sum())
perc_diff.name = "perc_diff"



In [7]:
stats = pd.concat([log_likelihood, perc_diff], axis=1)
stats.sort_values(by=["log_likelihood", "perc_diff"], ascending=False, inplace=True)

In [14]:
with pd.option_context("display.max_rows", 2000):
    display(stats.head(1000))

Unnamed: 0_level_0,log_likelihood,perc_diff
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1
bolzano,3212.865012,37499.62
adige,2410.044718,14091.36
merano,2130.328171,208189.3
trentino,1745.612175,7701.097
#ciaomaschio,1156.3936,33557.21
alto,1106.303602,1102.054
meran,1102.448007,518032.6
selleria,934.336689,1290050.0
#altoadige,809.988409,49075.49
dolomiti,769.24578,7258.096


In [9]:
forms = set(pd.read_parquet("wforms-ann.parquet", columns=[], filters=[
    [("status", ">=", 0)] # NOTE: this implies ( fst_batch | snd_batch ) & !tt_att
]).index)

with pd.option_context("display.max_rows", 1000):
    display(stats[stats.index.isin(forms)])

Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.
Reason: Parquet options that are not currently supported


Unnamed: 0_level_0,log_likelihood,perc_diff
tokens,Unnamed: 1_level_1,Unnamed: 2_level_1
#travel,87.868839,552.725601
#picoftheday,15.207069,332.323004
#amazing,15.164652,1651.434041
#specialguest,12.221229,6117.590845
#see,11.003745,4341.136318
#bellavita,10.292821,3557.40638
#icecream,9.022774,2490.662852
#holiday,8.115908,323.926649
pirlotto,7.963176,1842.997139
#sivax,7.740592,1728.70319
