In [1]:
%pip install scikit-learn sklearn_crfsuite

Defaulting to user installation because normal site-packages is not writeable
Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.10-cp312-cp312-win_amd64.whl.metadata (4.3 kB)
Collecting tabulate (from sklearn_crfsuite)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Downloading python_crfsuite-0.9.10-cp312-cp312-win_amd64.whl (154 kB)
   ---------------------------------------- 0.0/154.7 kB ? eta -:--:--
   ------------- -------------------------- 51.2/154.7 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 154.7/154.7 kB 2.3 MB/s eta 0:00:00
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: python-crfsuite, tabulate, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn_crfsuite-0.3.6 tabulate-0.9

In [2]:
def transform_dataset(page_dataset, for_inference):
    """Convert raw annotation pages into per-word labelled sequences.

    Args:
        page_dataset: Iterable of page dicts. Every page must provide
            ``page["taskId"]`` and
            ``page["representativeData"]["page_data_words"]`` (the words of
            the page). When ``for_inference`` is false,
            ``page["answers"][0]["answer"]`` is read as well: a list of
            ``{"id": label, "data": [{"start": int, "end": int}, ...]}``
            entries, where ``start``/``end`` are word indices and ``end`` is
            exclusive.
        for_inference: When true, annotations are not read and every word
            gets the outside label ``"O"``.

    Returns:
        A list of ``(task_id, [(word, label), ...])`` tuples, one per page.
        Words not covered by any annotated span are labelled ``"O"``.

    Raises:
        ValueError: If ``for_inference`` is false and a page has no
            ``"answers"`` entry (missing or empty).
    """
    # Single outside label for both modes; the inference placeholder used to be
    # the digit "0", which did not match the letter "O" used everywhere else.
    outside_label = "O"

    labeled_text_dataset = []
    for page in page_dataset:
        task_id = page["taskId"]
        page_words = page["representativeData"]["page_data_words"]

        # Maps word index -> entity label for every annotated word.
        geo_dictionary = {}
        if not for_inference:
            page_answers = page.get("answers")
            if not page_answers:
                raise ValueError(
                    f"Page {task_id!r} has no answers; cannot build training labels"
                )
            # Only the first answer of the page is used.
            for page_answer in page_answers[0]["answer"]:
                geo_label = page_answer["id"]
                for geo_part in page_answer["data"]:
                    for index in range(geo_part["start"], geo_part["end"]):
                        geo_dictionary[index] = geo_label

        # In inference mode geo_dictionary stays empty, so every word is "O".
        labeled_text = [
            (word, geo_dictionary.get(word_index, outside_label))
            for word_index, word in enumerate(page_words)
        ]
        labeled_text_dataset.append((task_id, labeled_text))

    return labeled_text_dataset

In [8]:
import json

def get_labeled_dataset(dataset_path, for_inference=False):
    """Read a JSON dataset file and turn its pages into labelled sequences.

    Args:
        dataset_path: Path to a UTF-8 JSON file whose pages are stored under
            ``["data"]["results"]``.
        for_inference: Forwarded to ``transform_dataset``.

    Returns:
        Whatever ``transform_dataset`` returns for the pages of the file.
    """
    with open(dataset_path, encoding="utf-8") as dataset_file:
        payload = json.load(dataset_file)

    pages = payload["data"]["results"]
    return transform_dataset(pages, for_inference)

In [9]:
def word2features(text, i):
    """Build the feature dict for the word at position ``i`` of ``text``.

    Args:
        text: Sequence of items whose first element is the word string.
        i: Index of the word to describe.

    Returns:
        A dict with features of the word itself (lower-cased form, 2- and
        3-character prefixes/suffixes, case/digit/trailing-dot flags) plus,
        for each offset in -3..3 (excluding 0), either the neighbour's
        features or an ``offset_<n>_limit`` flag when that neighbour falls
        outside the sequence.
    """
    token = text[i][0]
    text_length = len(text)

    features = {}
    features["word.lower()"] = token.lower()
    features["word[-2:]"] = token[-2:]
    features["word[-3:]"] = token[-3:]
    features["word[:2]"] = token[:2]
    features["word[:3]"] = token[:3]
    features["word.isupper()"] = token.isupper()
    features["word.istitle()"] = token.istitle()
    features["word.isdigit()"] = token.isdigit()
    features["word.endswithdot"] = token.endswith(".")

    for offset in (-3, -2, -1, 1, 2, 3):
        position = i + offset
        if position < 0 or position >= text_length:
            # No neighbour at this offset: mark the sequence boundary instead.
            features[f"offset_{offset}_limit"] = True
            continue

        neighbor = text[position][0]
        features[f"{offset}:word.lower()"] = neighbor.lower()
        features[f"{offset}:word.istitle()"] = neighbor.istitle()
        features[f"{offset}:word.isupper()"] = neighbor.isupper()
        features[f"{offset}:word.endswithdot"] = neighbor.endswith(".")

    return features

In [10]:
def text2features(text):
    """Return the list of ``word2features`` dicts, one per position of ``text``."""
    feature_sequence = []
    for position in range(len(text)):
        feature_sequence.append(word2features(text, position))
    return feature_sequence

def text2labels(text):
    """Return the labels (second element) of the ``(word, label)`` pairs in ``text``."""
    labels = []
    for _word, tag in text:
        labels.append(tag)
    return labels

In [11]:
def get_validation_result(X_validation, y_pred):
    """Collapse per-word predictions into labelled spans for every task.

    Args:
        X_validation: Sequence of ``(task_id, features)`` pairs; only the
            task id is used here.
        y_pred: Sequence of predicted label sequences, aligned with
            ``X_validation``. The label ``"O"`` marks words outside any
            entity.

    Returns:
        A list of ``{"taskId": task_id, "answer": [...]}`` dicts. Each answer
        entry is ``{"id": label, "data": [{"start": s, "end": e}, ...]}``,
        where every span is a maximal run of consecutive words carrying the
        same label; ``start`` is inclusive and ``end`` is exclusive. Labels
        appear in the order in which their first span ends.
    """
    # The outside label is the letter "O"; one comparison used to be written
    # against the digit "0", so "O" runs were opened and then thrown away.
    outside_label = "O"
    validation_result = []

    for (task_id, _), predictions in zip(X_validation, y_pred):
        answers = {}
        current_label = None  # label of the open entity run; None while outside
        start_index = None

        for current_index, label in enumerate(predictions):
            if label == current_label:
                continue

            # The label changed: close the entity run that was open, if any.
            if current_label is not None:
                answers.setdefault(current_label, []).append(
                    {"start": start_index, "end": current_index}
                )

            if label == outside_label:
                current_label = None
            else:
                current_label = label
                start_index = current_index

        # A run that reaches the end of the sequence is closed here.
        if current_label is not None:
            answers.setdefault(current_label, []).append(
                {"start": start_index, "end": len(predictions)}
            )

        validation_result.append({
            "taskId": task_id,
            "answer": [
                {"id": label, "data": segments}
                for label, segments in answers.items()
            ],
        })

    return validation_result

In [12]:
# Load the labelled training pages; every item is a
# (task_id, [(word, label), ...]) pair.
train_dataset = get_labeled_dataset("../jsons/train_geo_extractor.json")

# One feature-dict sequence and one label sequence per page, in the same page
# order; the task ids are not needed for training.
X_train = [text2features(text) for _, text in train_dataset]
y_train = [text2labels(text) for _, text in train_dataset]

In [13]:
import warnings

import sklearn_crfsuite

# CRF trained with the 'ap' algorithm for at most 100 iterations.
crf_model = sklearn_crfsuite.CRF(
    algorithm='ap',
    max_iterations=100,
    all_possible_transitions=False
)

try:
    crf_model.fit(X_train, y_train)
except AttributeError as fit_error:
    # NOTE(review): presumably a workaround for an incompatibility between the
    # installed sklearn_crfsuite and scikit-learn releases — TODO confirm.
    # The error is still tolerated so the notebook keeps running, but it is
    # reported instead of being dropped silently, so that an unrelated
    # AttributeError (and a possibly untrained model) does not go unnoticed.
    warnings.warn(
        f"crf_model.fit raised AttributeError and it was ignored: {fit_error}",
        RuntimeWarning,
    )

In [15]:
# Load the labelled test pages; every item is a
# (task_id, [(word, label), ...]) pair.
test_dataset = get_labeled_dataset("../jsons/test_geo_extractor.json")

# Features go to the model, the reference labels are kept for scoring; both
# lists follow the page order of test_dataset.
X_test = [text2features(text) for _, text in test_dataset]
y_test = [text2labels(text) for _, text in test_dataset]

In [16]:
from sklearn.metrics import classification_report, matthews_corrcoef

# Predict one label sequence per test page.
y_pred = crf_model.predict(X_test)
# Flatten the per-page sequences into word-level lists so that the metrics
# compare reference and predicted labels word by word.
y_test_flat = [label for text in y_test for label in text]
y_pred_flat = [label for text in y_pred for label in text]

report = classification_report(y_test_flat, y_pred_flat)
matthews_correlation_coefficient = matthews_corrcoef(y_test_flat, y_pred_flat)

# NOTE(review): in the recorded output the "O" class covers 62822 of 69790
# words, so accuracy and the weighted average mostly reflect "O"; the per-class
# rows and the macro average say more about the entity labels.
print(report)
print(f"Matthews Correlation Coefficient: {matthews_correlation_coefficient}")

                   precision    recall  f1-score   support

                O       0.99      0.99      0.99     62822
     central_city       0.44      0.36      0.40       184
      geo_address       0.89      0.75      0.82      1040
     geo_building       0.83      0.74      0.78       453
         geo_city       0.84      0.84      0.84      1433
     geo_district       0.87      0.78      0.82       387
geo_microdistrict       0.60      0.53      0.56       382
       geo_region       0.99      0.99      0.99      1733
geo_region_oblast       0.86      0.87      0.86       297
       geo_street       0.81      0.79      0.80      1059

         accuracy                           0.98     69790
        macro avg       0.81      0.76      0.79     69790
     weighted avg       0.98      0.98      0.98     69790

Matthews Correlation Coefficient: 0.8746781148689454


In [18]:
# Load the validation pages without reading annotations (for_inference=True).
validation_dataset = get_labeled_dataset("../jsons/val_no_answer_geo_extractor.json", for_inference=True)

# Keep each task id next to its features so predictions can be mapped back to
# the task they belong to.
X_validation = [(task_id, text2features(text)) for task_id, text in validation_dataset]

# The model takes only the feature sequences, in the same order as X_validation.
X_validation_features = [text_features for _, text_features in X_validation]

# NOTE: this rebinds y_pred, which the evaluation cell used for the test-set
# predictions; from here on it holds the validation predictions.
y_pred = crf_model.predict(X_validation_features)

In [19]:
# Turn the predicted label sequences into per-task span answers.
validation_result = get_validation_result(X_validation, y_pred)

# Write the answers as indented UTF-8 JSON; ensure_ascii=False keeps non-ASCII
# characters as they are instead of \u escapes.
with open("myres.json", "w", encoding="utf-8") as file:
    json.dump(validation_result, file, ensure_ascii=False, indent=4)

print("Validation result has been saved!")

Validation result has been saved!
