In [1]:
from ptracking.database import cursor

# Extract the pre-processed text from the database
with cursor() as cur:
    cur.execute("SELECT petition_id, processed_content, signatures FROM petition WHERE state = 'closed'")
    content = cur.fetchall()

processed_text = [text for _, text, _ in content]

In [2]:
import gensim.corpora as corpora
from gensim.models import TfidfModel

dictionary = corpora.Dictionary(processed_text)

# Create a separate corpus for TFIDF so we can filter out low value words
tfidf_corpus = [dictionary.doc2bow(words) for words in processed_text]
tfidf = TfidfModel(tfidf_corpus, dictionary)

#filter low value words TODO find a good parameter value
low_value = 0.1
petition_dict = {}

# Here we create a new corpus for LDA with the low value words removed from each documents BoW.
lda_corpus = []
for petition_id, text, signatures in content:
    bow = dictionary.doc2bow(text)
    low_value_words = [id for id, value in tfidf[bow] if value < low_value]
    new_bow = [b for b in bow if b[0] not in low_value_words]
    lda_corpus.append(new_bow)
    petition_dict[petition_id] = (new_bow, signatures)



In [3]:
from gensim.models import LdaMulticore
from pprint import pprint

# number of topics
num_topics = 10
# Build LDA model
lda_model = LdaMulticore(corpus=lda_corpus,
                        id2word=dictionary,
                        num_topics=num_topics,
                        alpha=0.1,
                        eta=0.1)
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())

[(0,
  '0.009*"dog" + 0.009*"pay" + 0.008*"business" + 0.007*"child" + '
  '0.006*"worker" + 0.006*"support" + 0.005*"year" + 0.005*"service" + '
  '0.004*"family" + 0.004*"people"'),
 (1,
  '0.008*"family" + 0.006*"student" + 0.006*"pay" + 0.005*"woman" + '
  '0.005*"public" + 0.005*"covid" + 0.004*"day" + 0.004*"travel" + '
  '0.004*"time" + 0.004*"work"'),
 (2,
  '0.015*"child" + 0.007*"school" + 0.007*"parent" + 0.005*"work" + '
  '0.005*"health" + 0.005*"home" + 0.005*"woman" + 0.004*"animal" + '
  '0.004*"staff" + 0.004*"test"'),
 (3,
  '0.007*"year" + 0.005*"use" + 0.005*"care" + 0.005*"child" + 0.005*"social" '
  '+ 0.004*"work" + 0.004*"ban" + 0.004*"increase" + 0.004*"school" + '
  '0.003*"sentence"'),
 (4,
  '0.015*"child" + 0.009*"school" + 0.007*"animal" + 0.006*"student" + '
  '0.005*"deal" + 0.005*"vote" + 0.005*"pay" + 0.005*"leave" + 0.004*"mental" '
  '+ 0.004*"parent"'),
 (5,
  '0.014*"child" + 0.011*"school" + 0.008*"test" + 0.007*"vote" + '
  '0.007*"people" + 0.00

In [4]:
import pandas as pd

rows = []
for petition_id, (bow, signatures) in petition_dict.items():
    topics = lda_model.get_document_topics(bow, minimum_probability=0)
    rows.append((petition_id, signatures, *[prob for _, prob in topics]))

columns = ["petition_id", "signatures"] + ["topic_" + str(i) for i in range(num_topics)]
df = pd.DataFrame(rows, columns=columns)
df.set_index('petition_id', inplace=True)
df.head()

Unnamed: 0_level_0,signatures,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
petition_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
274448,10,0.943725,0.006253,0.006252,0.006253,0.006253,0.006252,0.006253,0.006253,0.006253,0.006252
202237,11,0.004557,0.004557,0.004557,0.424001,0.004557,0.004557,0.539545,0.004557,0.004557,0.004557
253250,10,0.002785,0.002785,0.002785,0.002785,0.637568,0.002785,0.002785,0.002785,0.002785,0.340154
271005,10,0.004552,0.004553,0.004553,0.004552,0.004552,0.004552,0.959029,0.004552,0.004552,0.004552
276198,10,0.003849,0.003849,0.003849,0.003849,0.549444,0.003849,0.003849,0.003849,0.419766,0.003849


In [5]:
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

k = 5
clustering = KNeighborsRegressor(n_neighbors=k)

topics = df.loc[:, "topic_0":"topic_" + str(num_topics - 1)].to_numpy()
petition_ids = df.index.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(topics, petition_ids, test_size=0.2)
clustering.fit(X_train, y_train)

y_pred = []
y_true = []
for p_topics, p_id in zip(X_test, y_test):
    distance, indices = clustering.kneighbors(p_topics.reshape(1, -1))
    petition_ids_of_neighbours = [y_train[index] for index in indices][0]

    # compute some average score for the number of signatures
    sum = 0
    for neighbour_id in petition_ids_of_neighbours:
        sum += df.loc[neighbour_id].signatures
    prediction = sum / k

    # compare the average to the actual
    actual = df.loc[p_id].signatures
    y_pred.append(prediction)
    y_true.append(actual)
    print(f"Petition ID: {p_id}, Nearest Neighbours: {petition_ids_of_neighbours}")
    print(f"kNN Prediction: {prediction}, actual signatures: {actual}")


Petition ID: 556370, Nearest Neighbours: [584137 552306 211614 230286 328754]
kNN Prediction: 26616.6, actual signatures: 38.0
Petition ID: 313328, Nearest Neighbours: [307972 302965 549000 575204 278149]
kNN Prediction: 1971.4, actual signatures: 98.0
Petition ID: 216443, Nearest Neighbours: [324447 219748 203572 574231 243812]
kNN Prediction: 3402.2, actual signatures: 17.0
Petition ID: 586143, Nearest Neighbours: [326361 269717 220604 325644 236706]
kNN Prediction: 159.6, actual signatures: 11454.0
Petition ID: 231147, Nearest Neighbours: [305167 580595 576893 228842 229061]
kNN Prediction: 1112.8, actual signatures: 307897.0
Petition ID: 565358, Nearest Neighbours: [232667 328617 202537 268618 272188]
kNN Prediction: 795.0, actual signatures: 159.0
Petition ID: 268666, Nearest Neighbours: [200011 235748 467157 212795 262123]
kNN Prediction: 147.2, actual signatures: 10.0
Petition ID: 302597, Nearest Neighbours: [270588 574623 559089 561506 313416]
kNN Prediction: 1513.0, actual sig

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

print(mean_squared_error(y_true, y_pred))
print(mean_absolute_error(y_true, y_pred))

14645719231.246471
12421.633699115046
