In [None]:
import locale
# Replace locale.getpreferredencoding with a stub that always reports "UTF-8".
# NOTE(review): presumably a workaround for `!` shell commands failing in the hosted
# runtime when the detected locale is not UTF-8 — confirm it is still required.
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# Install spaCy (requesting its `cuda101` extra) and thinc without dependency resolution,
# then download the Russian `ru_core_news_md` pipeline that a later cell loads.
# NOTE(review): `--no-deps` tells pip to skip dependencies, so the `cuda101` extra
# presumably pulls in nothing additional — confirm whether GPU support is really wanted.
!pip install --no-deps spacy[cuda101]
!pip install --no-deps thinc
!python -m spacy download ru_core_news_md

In [None]:
# Install the `thinc-gpu-ops` package.
# NOTE(review): no GPU activation call is visible in this notebook — confirm this
# package is still needed.
!pip install thinc-gpu-ops

In [None]:
# Unpack a previously saved model archive into the current working directory.
# NOTE(review): presumably this is what produces the /content/content/ner_model
# directory loaded by the inference cell — verify against the archive layout.
!unzip /content/model_10_epochs.zip

In [None]:
# @title Путь к датасету в формате json
# The form title above is Russian for "Path to the dataset in JSON format"; it is
# kept verbatim because it is the text rendered in the notebook's form UI.
# `path` is opened and parsed with json.load by the training cell.
path = "/content/ner_data_normalized.json" # @param {type:"string"}


In [None]:
import spacy
from spacy.training.example import Example
import json
import random

# Load the Russian pipeline without its `parser` and `ner` components, then add a
# fresh `ner` pipe; `ner` keeps a handle to that pipe for the training cell.
nlp = spacy.load('ru_core_news_md', exclude=['parser', 'ner'])
ner = nlp.add_pipe('ner')

In [None]:
# Training hyper-parameters (previously inlined as literals in the loop below).
N_ITERATIONS = 30
BATCH_SIZE = 4

# Read the annotated dataset. Only the JSON parsing needs the file handle, so the
# `with` block ends as soon as the data is in memory.
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert every record under data["data"] into a spaCy training Example.
entity_examples = []
for record in data["data"]:
    text = record["video_info"]

    # Annotations are stored as offset + length; spaCy expects (start, end, label)
    # character spans, so the end is computed here.
    entity_spans = [
        (entity["offset"], entity["offset"] + entity["length"], entity["label"])
        for entity in record["entities"]
    ]

    entity_examples.append(
        Example.from_dict(nlp.make_doc(text), {"entities": entity_spans})
    )

# Disable other pipeline components except NER during training
pipe_exceptions = ["ner"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

with nlp.select_pipes(disable=other_pipes):
    # Initialise the enabled components from the real examples in a single step and
    # obtain the optimizer. This replaces the earlier two-step sequence
    # (`ner.initialize(examples)` followed by an example-less `nlp.initialize()`),
    # in which the second call re-initialised the NER component without the data.
    optimizer = nlp.initialize(lambda: entity_examples)

    for iteration in range(N_ITERATIONS):
        losses = {}
        # Shuffle the training data before each iteration
        random.shuffle(entity_examples)
        for batch in spacy.util.minibatch(entity_examples, size=BATCH_SIZE):
            nlp.update(batch, losses=losses, sgd=optimizer)

        print(f"Iteration {iteration+1}: Losses - {losses}")

# Save the trained model (relative to the current working directory)
nlp.to_disk("ner_model")


In [None]:
# Pack the saved model directory into a zip archive.
# NOTE(review): the training cell saves to the relative path "ner_model", so this
# assumes the working directory is /content — confirm.
!zip -r /content/model_30_epochs.zip /content/ner_model

In [None]:
# Rebind `nlp` to the trained pipeline stored on disk.
# NOTE(review): the doubled /content/content/ prefix presumably comes from unzipping
# an archive that stored the absolute /content/ner_model path — verify.
nlp = spacy.load('/content/content/ner_model')
# Form parameter: address of the video page to analyse.
video_url = "https://rutube.ru/video/237559720b0b079f5fb621ef08092a59/" # @param {type:"string"}
import requests
from bs4 import BeautifulSoup

# Replace VIDEO_URL with the URL of the rutube video
VIDEO_URL = video_url

# Module-level placeholders. parse_description() assigns to local variables with the
# same names (it has no `global` statement), so these two stay empty strings.
title = ""
description = ""

def parse_description(url=None, timeout=10):
    """Download a video page and return its title and description as one string.

    Args:
        url: Page address to fetch. When omitted, the module-level ``VIDEO_URL``
            is used, which matches the original zero-argument behaviour.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so the cell cannot hang indefinitely.

    Returns:
        The stripped title text immediately followed by the stripped description
        text, or ``None`` (after printing a message) when the page could not be
        retrieved or contains no description element.
    """
    target_url = VIDEO_URL if url is None else url

    # Send GET request to the video page
    response = requests.get(target_url, timeout=timeout)
    if response.status_code != 200:
        print("Failed to retrieve video page")
        return None

    # Parse HTML content and locate the description element
    soup = BeautifulSoup(response.content, "html.parser")
    description_element = soup.select_one(".pen-videopage-description")
    if description_element is None:
        print("Description not found")
        return None

    # The title element can be missing even when the description exists; fall back
    # to an empty title instead of failing on `None.text`.
    title_element = soup.select_one(".video-pageinfo-container-module__videoTitle")
    title = title_element.text.strip() if title_element is not None else ""
    description = description_element.text.strip()

    # NOTE(review): title and description are joined without a separator, as before;
    # confirm this matches the text format the model was trained on.
    return title + description


from spacy import displacy

# parse_description() returns None when the page or its description is unavailable;
# passing that straight to `nlp` would raise, so only run the model on real text.
video_text = parse_description()
if video_text:
    doc = nlp(video_text)
    # Highlight the recognised entities inline in the notebook output.
    displacy.render(doc, style="ent", jupyter=True)
else:
    print("No text was extracted from the video page; skipping entity rendering")


In [None]:
def form_ents(ents, text):
  """Convert start/end entity records into offset/length records.

  Args:
    ents: Iterable of mappings, each with 'label', 'start' and 'end' keys.
    text: The string that the 'start'/'end' positions index into.

  Returns:
    A list with one dict per entity holding 'label', 'offset', 'length' and
    'segment' (the slice of ``text`` covered by the entity), in that key order.
  """
  converted = []
  for ent in ents:
    begin = ent['start']
    finish = ent['end']
    converted.append({
      'label': ent['label'],
      'offset': begin,
      'length': finish - begin,
      'segment': text[begin:finish],
    })
  return converted

def text_to_notm(text):
  """Run the module-level `nlp` pipeline on `text` and package the entities.

  Returns:
    A two-item list: the input text, followed by the entity dicts produced by
    form_ents() from the 'ents' entry of the document's JSON representation.
  """
  ents_json = nlp(text).to_json()['ents']
  return [text, form_ents(ents_json, text)]



In [None]:
import pandas as pd
import csv
import json
# Allow up to 300 characters per column when pandas displays a frame.
pd.options.display.max_colwidth = 300
# Input CSV for batch prediction; csv_to_csv reads its `video_info` column.
# NOTE: this rebinds `path`, which an earlier cell uses for the JSON training set.
path = '/content/ner_data_test.csv' # @param {type:"string"}

def _escape_entities(entities):
  """Serialise entity dicts into the escaped string stored in the output CSV."""
  serialized = json.dumps(entities, ensure_ascii=False)
  # Backslash-escape every comma, then restore the plain ", " separator between
  # consecutive entity objects. The backslashes are written as '\\' because '\,'
  # is an invalid escape sequence that newer Python versions warn about.
  serialized = serialized.replace(',', '\\,').replace('}\\, {', '}, {')
  # Drop the enclosing list brackets.
  return serialized[1:-1]


def csv_to_csv(path, out_path='new_ner_data_test.csv'):
  """Predict entities for every row of a CSV and write them to a new CSV.

  Args:
    path: Input CSV; must contain a `video_info` column with the texts.
    out_path: Destination CSV. Defaults to the file name that was previously
      hard-coded, so existing single-argument calls behave as before.

  The output has a header row ("video_info", "entities_prediction") followed by
  one row per input row: the text returned by text_to_notm() and its entities
  serialised by _escape_entities().
  """
  # Read the input first so that a failing read does not leave behind an output
  # file containing only the header.
  data = pd.read_csv(path)

  # The `with` block closes the file; no explicit close() is needed.
  with open(out_path, 'w', newline='', encoding='utf-8') as file_csv:
    writer = csv.writer(file_csv)
    writer.writerow(["video_info", "entities_prediction"])
    for text in data['video_info']:
      res = text_to_notm(text)
      writer.writerow([res[0], _escape_entities(res[1])])



# Run batch prediction over the CSV at `path`; csv_to_csv writes new_ner_data_test.csv.
csv_to_csv(path)