In [1]:
import os
import sys
# Put the notebook's parent directory on the import path so that modules living
# one level up can be imported from this notebook. The path is resolved against
# the current working directory.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [2]:
import pandas as pd
from configparser import ConfigParser
import sqlite3
# Load the notebook settings from the INI file one directory above the
# notebook's working directory.
config = ConfigParser()
loaded_config_files = config.read("../config.ini")

# ConfigParser.read() skips files it cannot open and returns only the names it
# actually parsed, so a wrong working directory would otherwise surface later
# as a confusing missing-section error. Fail here with a clear message instead.
if not loaded_config_files:
    raise FileNotFoundError(
        "Could not read '../config.ini' (path is relative to the current working directory)"
    )

# Keep the list of parsed files as the cell's displayed output.
loaded_config_files

['../config.ini']

In [13]:
# Open the SQLite database whose location is stored under
# [NOTEBOOKS] sqlite3_conn_string in the notebook configuration.
CONNECTION_STRING = config.get('NOTEBOOKS', 'sqlite3_conn_string')
conn = sqlite3.connect(CONNECTION_STRING)

# Read the whole ArticleAnalysis table into a DataFrame and preview ten rows.
article_analysis_query = "SELECT * FROM ArticleAnalysis"
article_analysis_df = pd.read_sql(article_analysis_query, conn)
article_analysis_df.head(10)

Unnamed: 0,p_articleID,p_articleNo,f_articleID,f_articleNo,p_legislation_id,days_diff,diff_count,similarity_ratio,comments_allowed,total_comments,positivity_score
0,1,1,47429.0,1,1,15.0,85,0.935234,1,134,
1,2,2,47430.0,2,1,15.0,10,0.979424,1,12,
2,4,1,23146.0,1,3,12.0,43,0.780541,1,1,
3,5,2,23147.0,2,3,12.0,143,0.794287,0,0,
4,6,3,23148.0,3,3,12.0,271,0.882559,1,3,
5,7,4,23149.0,4,3,12.0,488,0.070305,0,0,
6,8,5,23150.0,5,3,12.0,292,0.719588,1,4,
7,9,6,23151.0,6,3,12.0,138,0.918103,1,7,
8,10,7,23152.0,7,3,12.0,15,0.981818,1,5,
9,11,8,23153.0,8,3,12.0,606,0.038987,0,0,


## Error Analysis
Find all matchings whose similarity starts high but then suddenly drops to near zero.
Such a drop indicates that an article was added or changed order, which misaligns every comparison after it.

In [36]:
# For every legislation, measure how widely the article similarity ratios are
# spread: the gap between the 90th and the 10th percentile of `similarity_ratio`.
# The result is broadcast back to each row of that legislation and stored as a
# new `similarity_diff` column on `article_analysis_df` itself.
ratios_by_legislation = article_analysis_df.groupby("p_legislation_id")["similarity_ratio"]
article_analysis_df["similarity_diff"] = ratios_by_legislation.transform(
    lambda ratios: ratios.quantile(q=0.9) - ratios.quantile(q=0.1)
)

In [37]:
# Keep the distinct legislation ids whose similarity spread exceeds 0.8, then
# display how many there are.
has_large_spread = article_analysis_df["similarity_diff"] > 0.8
legislations_to_check = article_analysis_df.loc[has_large_spread, "p_legislation_id"].unique()
legislations_to_check.size

83

In [38]:
# Preview the first 100 article rows that belong to a flagged legislation.
belongs_to_flagged = article_analysis_df["p_legislation_id"].isin(legislations_to_check)
article_analysis_df.loc[belongs_to_flagged].head(100)

Unnamed: 0,p_articleID,p_articleNo,f_articleID,f_articleNo,p_legislation_id,days_diff,diff_count,similarity_ratio,comments_allowed,total_comments,positivity_score,similarity_diff
2,4,1,23146.0,1,3,12.0,43,0.780541,1,1,,0.837635
3,5,2,23147.0,2,3,12.0,143,0.794287,0,0,,0.837635
4,6,3,23148.0,3,3,12.0,271,0.882559,1,3,,0.837635
5,7,4,23149.0,4,3,12.0,488,0.070305,0,0,,0.837635
6,8,5,23150.0,5,3,12.0,292,0.719588,1,4,,0.837635
...,...,...,...,...,...,...,...,...,...,...,...,...
301,312,69,29861.0,69,10,26.0,276,0.030039,0,0,,0.833678
302,313,70,29862.0,70,10,26.0,123,0.014553,0,0,,0.833678
303,314,71,29863.0,71,10,26.0,40,0.265823,0,0,,0.833678
332,343,1,23639.0,1,12,28.0,69,0.932612,1,4,,0.896259


In [43]:
# Display every flagged legislation id (the array computed from the
# similarity_diff > 0.8 filter).
legislations_to_check

array([  3,  10,  12,  13,  15,  19,  20,  27,  31,  33,  37,  38,  41,
        42,  45,  49,  50,  59,  60,  62,  71,  79,  91,  97, 102, 106,
       110, 113, 115, 118, 119, 125, 126, 128, 129, 131, 135, 140, 146,
       149, 154, 173, 193, 245, 246, 264, 303, 329, 366, 410, 436, 438,
       448, 451, 485, 488, 495, 504, 513, 532, 547, 549, 551, 556, 584,
       598, 601, 624, 626, 639, 642, 663, 690, 691, 707, 753, 799, 811,
       812, 818, 829, 839, 880], dtype=int64)

Manually check `legislations_to_check` in order to add article mappings

In [3]:
# Sentinel used as f_articleNo when a consultation article was dropped and has
# no counterpart in the final legislation.
DROPPED_ARTICLE_NO = 999

# Manually curated article mappings.
# Each entry is (p_legislation_id, p_articleNo, f_articleNo).
# All values are kept exactly as originally recorded; entries that look
# suspicious are only flagged with NOTE(review) comments.
article_mappings: list[tuple[int, int, int]] = [
    (3, 8, DROPPED_ARTICLE_NO),  # Dropped, mapping to 999
    (3, 10, 12),
    (10, 19, 64),
    (12, 31, 37),
    (13, 20, 21),
    (15, 20, 21),
    (19, 16, DROPPED_ARTICLE_NO),  # Article 16 in Consultation was Dropped
    (19, 17, 16),
    (20, 18, 20),
    (27, 24, 25),
    (33, 9, DROPPED_ARTICLE_NO),  # Article 9 was dropped
    (33, 10, 9),
    (37, 66, 68),
    (37, 67, 71),
    (38, 13, 14),
    (42, 17, DROPPED_ARTICLE_NO),  # Article 17 was dropped
    (42, 18, 17),
    (45, 28, 29),
    (49, 15, DROPPED_ARTICLE_NO),  # Article 15 was dropped
    (49, 16, 15),
    # NOTE(review): 99 rather than 999 - possibly a typo for the dropped
    # sentinel. Value kept as recorded; confirm against the legislation.
    (59, 13, 99),
    (59, 24, 22),
    (60, 15, 16),
    (71, 16, 17),
    (79, 54, 55),
    (91, 23, 31),
    (97, 43, DROPPED_ARTICLE_NO),
    (97, 44, 43),
    (106, 15, 31),
    (110, 14, 15),
    (113, 9, 10),
    (115, 60, 68),
    # NOTE(review): (118, 76) is listed twice with conflicting targets. Other
    # legislations follow the shape (n dropped), (n+1 -> n), so the first entry
    # may have been meant for article 75 - confirm before relying on it.
    (118, 76, DROPPED_ARTICLE_NO),
    (118, 76, 75),
    (119, 10, 13),
    (125, 15, 16),
    (126, 43, 44),
    (128, 10, 12),
    (131, 12, DROPPED_ARTICLE_NO),
    (131, 13, 12),
    (135, 51, 52),
    (140, 31, DROPPED_ARTICLE_NO),
    (140, 32, 31),
    (140, 33, DROPPED_ARTICLE_NO),
    (140, 34, 32),
    (154, 67, 72),
    (303, 30, DROPPED_ARTICLE_NO),
    (303, 32, 30),
    (366, 45, DROPPED_ARTICLE_NO),
    (366, 52, 49),
    (410, 25, DROPPED_ARTICLE_NO),
    (436, 23, 24),
    (448, 20, 21),
    (451, 10, 13),
    (485, 26, 28),
    (495, 19, DROPPED_ARTICLE_NO),
    (495, 22, 19),
    (513, 7, 6),
    (532, 17, DROPPED_ARTICLE_NO),
    (532, 18, 17),
    (547, 42, DROPPED_ARTICLE_NO),
    # NOTE(review): identity mapping (73 -> 73); presumably intentional - confirm.
    (556, 73, 73),
    (584, 10, DROPPED_ARTICLE_NO),
    (584, 11, 7),
    (598, 20, DROPPED_ARTICLE_NO),
    (598, 21, 20),
    # NOTE(review): identity mapping (29 -> 29); presumably intentional - confirm.
    (598, 29, 29),
    (598, 32, 38),
    (639, 10, DROPPED_ARTICLE_NO),
    (639, 11, 12),
    (642, 22, DROPPED_ARTICLE_NO),
    (642, 24, 23),
    (663, 62, DROPPED_ARTICLE_NO),
    (663, 64, 62),
    (707, 23, 29),
    (753, 37, 39),
    (799, 48, 49),
    (829, 19, 21),
]

Save Article Mappings to **DB**

In [7]:
from data_objects.legislation import ArticleMapping
from sqlalchemy.orm import Session
from sqlalchemy import create_engine

In [8]:
# Create the SQLAlchemy engine from the [NOTEBOOKS] db_file setting.
# NOTE(review): the option is called `db_file` but its value is handed to
# create_engine() unchanged, so it presumably holds a full database URL - confirm.
sqlalchemy_conn_string = config.get('NOTEBOOKS', 'db_file')
engine = create_engine(sqlalchemy_conn_string)

In [10]:
# Turn every (legislation id, consultation article no, final article no) tuple
# into an ArticleMapping object, then store them all in a single commit.
# NOTE(review): nothing here checks for rows that are already stored, so
# re-running this cell presumably inserts the same mappings again unless the
# table enforces uniqueness - confirm.
mapping_rows = [
    ArticleMapping(
        legislation_id=legislation_id,
        public_consultation_article_no=consultation_article_no,
        final_legislation_article_no=final_article_no,
    )
    for legislation_id, consultation_article_no, final_article_no in article_mappings
]

with Session(engine) as sess:
    sess.add_all(mapping_rows)
    sess.commit()