### Idea
1. word2vec => vector for each word
2. phrase detection (https://radimrehurek.com/gensim/models/phrases.html)
3. (retrain?)
4. tag words with NER tags
5. simple classification model

### Other ideas
1. create NER tagged wikipedia text by links to pages with wikidata pages (!)
2. TFIDF(? or such) to find interesting/important terms
3. https://towardsdatascience.com/wikipedia-data-science-working-with-the-worlds-largest-encyclopedia-c08efbac5f5c

In [5]:
import mwparserfromhell as mwp
import re, json, time
from requests import Session
from hebtokenizer import tokenize
from pprint import pprint
# from cachetools import LFUCache, cached
import pandas as pd
from yaml import load, Loader
from itertools import chain
from more_itertools import split_after, split_when
from collections import Counter
from yap_tools import *
from apis import * 

In [3]:
s = Session()
s.headers={"User-Agent": "WikiTagger/0.0 (alexzabbey@gmail.com) Python/Requests/3.8"}

In [6]:
def parse_wiki_page(page_title):
    wiki = mwp.parse(get_wp_fulltext(page_title))
    # remove files
    for l in wiki.filter_wikilinks():
        if l.startswith("[[קטגוריה") or l.startswith("[[קובץ"):
            wiki.remove(l)
    # remove extrenal links section
    wiki.remove(wiki.get_sections(matches=r"קישורים חיצוניים"))
    # remove headings
    for h in wiki.filter_headings():
        wiki.remove(h)
    # replace tags with text
    for x in wiki.filter_tags():
        index = wiki.index(x)
        wiki.remove(x)
        wiki.insert(index, x.contents)
    return wiki

wiki = parse_wiki_page("תקומה")
    
# further reading
# photo gallery - https://he.wikipedia.org/wiki/%D7%AA%D7%A7%D7%95%D7%9E%D7%94/
# tables - https://he.wikipedia.org/wiki/%D7%92%D7%9C%D7%A2%D7%93_%D7%A7%D7%9E%D7%97%D7%99

got full wikipedia text from page תקומה


In [10]:
wiki.text()

AttributeError: 'Wikicode' object has no attribute 'text'

In [5]:
with open("tags.yaml", "r", encoding="utf-8") as f:
    tags = load(f, Loader=Loader)
    
for k, v in tags.items():
    if type(v) == list:
        tags[k] = {"include": v, "exclude": None, "not": None}
    else:
        tags[k].setdefault("exclude", None)
        tags[k].setdefault("not", None)

{'PERSON': {'include': ['Q5', 'Q95074'], 'exclude': None, 'not': None},
 'NORP': {'include': ['Q49773', 'Q1530022', 'Q41710'],
  'exclude': None,
  'not': None},
 'FAC': {'include': ['Q811979', 'Q121359'],
  'not': ['Q1068715'],
  'exclude': None},
 'ORG': {'include': ['Q783794'],
  'exclude': ['Q2097994', 'Q7210356'],
  'not': None},
 'GPE': {'include': ['Q15642541', 'Q486972', 'Q28108'],
  'exclude': None,
  'not': None},
 'LOC': {'include': ['Q27096220'], 'exclude': None, 'not': None},
 'EVENT': {'include': ['Q13418847', 'Q8065', 'Q1656682', 'Q350604'],
  'exclude': None,
  'not': None},
 'WORK_OF_ART': {'include': ['Q17537576', 'Q732577'],
  'exclude': ['Q14897293'],
  'not': ['Q2135465']},
 'LAW': {'include': ['Q3150005'], 'exclude': None, 'not': None},
 'LANGUAGE': {'include': ['Q315', 'Q17376908'], 'exclude': None, 'not': None},
 'DATE': {'include': ['Q577', 'Q3186692', 'Q14795564', 'Q1790144'],
  'exclude': None,
  'not': None}}

In [36]:
titles = [str(l.title) for l in wiki.filter_wikilinks()]
query = f"""SELECT ?item ?tag (COUNT(?superClass) AS ?count) WHERE {{
  VALUES ?title {{
    {" ".join([f'"{t}"@he' for t in titles])}
  }}
  VALUES ?superClass {{
 { " ".join(["wd:"+x for x in  sum([v["include"] for v in tags.values()], [])])}
  }}
  ?sitelink schema:about ?item;
    schema:isPartOf <https://he.wikipedia.org/>;
    schema:name ?title.
    ?item wdt:P31/wdt:P279* ?superClass.
   BIND(COALESCE(
    {", ".join(["IF(" + " || ".join([f"?superClass = wd:{q}" for q in v["include"]]) + f', "{k}", 1/0)' for k, v in tags.items()]) }
    ) AS ?tag).
}}
GROUP BY ?item ?tag"""

In [38]:
print(query)

SELECT ?item ?tag (COUNT(?superClass) AS ?count) WHERE {
  VALUES ?title {
    "שבועון"@he "קונדה נאסט (קבוצת מדיה)"@he "דייוויד רמניק"@he "אנגלית אמריקנית"@he "ניו יורק"@he "2017"@he "ארצות הברית"@he "1925"@he "אנגלית"@he "כתב עת"@he "ארצות הברית"@he "ניו יורק"@he "ביקורת אומנות"@he "מסה (חיבור עיוני)"@he "שירה"@he "סיפורת"@he "ניו יורק"@he "קריקטורה"@he "17 בפברואר"@he "1925"@he "21 בפברואר"@he "ניו יורק טיימס"@he "הומור"@he "לייף"@he "1951"@he "מלחמת העולם השנייה"@he "הירושימה"@he "המאה העשרים"@he "המאה העשרים ואחת"@he "אליס מונרו"@he "הארוקי מורקמי"@he "ולדימיר נבוקוב"@he "פיליפ רות"@he "ג'רום דייוויד סלינג'ר"@he "ג'ון אפדייק"@he "ריצ'רד ייטס"@he "שירלי ג'קסון"@he "סוריאליזם"@he "אוונגליזם"@he "תסמונת מינכהאוזן באמצעות שליח"@he "אברהם בורג"@he "ארנסט המינגוויי"@he "מרלון ברנדו"@he "הוליווד"@he "מתמטיקה"@he "ויליאם שון"@he "1951"@he "1987"@he "רוברט גוטליב"@he "1987"@he "1992"@he "טינה בראון"@he "1992"@he "1998"@he "צבע"@he "ניו יורק טיימס"@he "צילום"@he "1985"@he "1998"@he "שנות הת

In [24]:
sparql_query(query)

{'head': {'vars': ['item', 'tag', 'count']},
 'results': {'bindings': [{'item': {'type': 'uri',
     'value': 'http://www.wikidata.org/entity/Q11934'},
    'tag': {'type': 'literal', 'value': 'PERSON'},
    'count': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
     'type': 'literal',
     'value': '3'}},
   {'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q11934'},
    'tag': {'type': 'literal', 'value': 'WORK_OF_ART'},
    'count': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer',
     'type': 'literal',
     'value': '3'}}]}}

In [251]:
l = [get_wdid_from_wp(str(t)) for t in [l.title for l in wiki.filter_wikilinks()]]

[False,
 False,
 'Q1176307',
 'Q7976',
 'Q60',
 'Q25290',
 'Q30',
 'Q18107',
 'Q1860',
 'Q41298',
 'Q30',
 'Q60',
 False,
 'Q35760',
 'Q482',
 'Q8253',
 'Q60',
 'Q482919',
 'Q2341',
 'Q18107',
 'Q2350',
 'Q9684',
 'Q35874',
 'Q463198',
 'Q18591',
 'Q362',
 'Q34664',
 False,
 False,
 'Q234819',
 False,
 'Q36591',
 'Q187019',
 'Q79904',
 'Q105756',
 'Q544611',
 'Q239910',
 'Q39427',
 'Q194253',
 'Q1414765',
 'Q736237',
 'Q23434',
 'Q34012',
 'Q34006',
 'Q395',
 False,
 'Q18591',
 'Q2429',
 False,
 'Q2429',
 'Q2060',
 False,
 'Q2060',
 'Q2089',
 'Q1075',
 'Q9684',
 'Q11633',
 'Q2431',
 'Q2089',
 False,
 'Q75',
 'Q2796',
 'Q318429',
 False,
 'Q2004',
 'Q1986',
 'Q99',
 'Q2019',
 'Q41298',
 False,
 'Q30',
 'Q7163',
 'Q41298',
 'Q7278',
 'Q30',
 'Q2014',
 'Q22316',
 'Q207',
 'Q2997',
 'Q871232',
 'Q2004',
 'Q29552',
 'Q76',
 'Q29468',
 'Q10390',
 'Q10806',
 'Q1988',
 'Q3658608',
 'Q325945',
 'Q23445',
 'Q23901739',
 'Q9128',
 'Q435792',
 'Q545449',
 'Q334740',
 'Q11209',
 'Q794',
 'Q482919',

In [199]:
pd.DataFrame.from_dict(wp_title_to_wdid, orient="index").to_dict()

{0: {('שבועון',): False,
  ('קונדה נאסט (קבוצת מדיה)',): False,
  ('דייוויד רמניק',): 'Q1176307',
  ('אנגלית אמריקנית',): 'Q7976',
  ('ניו יורק',): 'Q60',
  ('2017',): 'Q25290',
  ('ארצות הברית',): 'Q30',
  ('1925',): 'Q18107',
  ('אנגלית',): 'Q1860',
  ('כתב עת',): 'Q41298',
  ('ביקורת אומנות',): False,
  ('מסה (חיבור עיוני)',): 'Q35760',
  ('שירה',): 'Q482',
  ('סיפורת',): 'Q8253',
  ('קריקטורה',): 'Q482919',
  ('17 בפברואר',): 'Q2341',
  ('21 בפברואר',): 'Q2350',
  ('ניו יורק טיימס',): 'Q9684',
  ('הומור',): 'Q35874',
  ('לייף',): 'Q463198',
  ('1951',): 'Q18591',
  ('מלחמת העולם השנייה',): 'Q362',
  ('הירושימה',): 'Q34664',
  ('המאה העשרים',): False,
  ('המאה העשרים ואחת',): False,
  ('אליס מונרו',): 'Q234819',
  ('הארוקי מורקמי',): False,
  ('ולדימיר נבוקוב',): 'Q36591',
  ('פיליפ רות',): 'Q187019',
  ("ג'רום דייוויד סלינג'ר",): 'Q79904',
  ("ג'ון אפדייק",): 'Q105756',
  ("ריצ'רד ייטס",): 'Q544611',
  ("שירלי ג'קסון",): 'Q239910',
  ('סוריאליזם',): 'Q39427',
  ('אוונגליזם',): 'Q19

In [5]:
def graphql_query(query):
    # posting to GraphQL server
    res = s.post(
        "https://tools.wmflabs.org/tptools/wdql.php", json={"query": "{" + query + "}"},
    )
    res.raise_for_status()
    return res.json()

In [6]:
def instance_of(Q_list):
    query = ""
    for Q in Q_list:
        query += (
            f"""{Q}: item(id: "{Q}")"""
            + """ {statements(propertyIds: "P31") { mainsnak { ... on PropertyValueSnak { value { ... on Entity { id }}}}}}
            """
        )

    j = graphql_query(query)
    return {
        k: [i["mainsnak"]["value"]["id"] for i in j["data"][k]["statements"]]
        for k in j["data"].keys()
    }

In [7]:
def sparql_query(query, labels=False, without_roots=False):
    url = "https://query.wikidata.org/sparql"
    if without_roots:
        query = query.replace("*", "+")
    res = s.get(url, params={"query": query, "format":"json"})
    res.raise_for_status()
    j = res.json()
    var = [var for var in j["head"]["vars"] if "Label" not in var][0]
    if labels:
        result = [(d[var]["value"].split("/")[-1], d[var + "Label"]["value"]) for d in j["results"]["bindings"]]
    else:
        result = list(set([x[var]["value"].split("/")[-1] for x in j["results"]["bindings"]]))
#     if without_roots:
#         result = list(result - set(roots))
    return result


In [8]:
def get_subclasses(roots, exclude=None):
    # SPARQL
    if roots == None:
        return []
    def build_query(roots):
        return "SELECT ?subclass WHERE { VALUES ?roots { " + " ".join(["wd:"+r for r in roots]) + " } ?subclass wdt:P279* ?roots.}"
    result = sparql_query(build_query(roots))
    if exclude:
        result = set(result) - set(sparql_query(build_query(exclude)))
    return result

In [9]:
def quick_eval(q):
    query = "SELECT ?superclassLabel ?superclass WHERE { wd:" + q + """ wdt:P279 ?superclass.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }"""
    label_query = 'item(id:"'+ q +'") {label(language:"en") { text } }'
    print(graphql_query(label_query)["data"]["item"]["label"]["text"])
    superclasses = sparql_query(query, labels=True)
    d = {}
    for Q, label in superclasses:
        d[(Q, label)] = [k for k, v in tags.items() if Q in v["include"]]
    return d

In [10]:
def most_popular(l):
    l = [x[1] for x in l]  
    return Counter(l).most_common(1)[0][0]

In [99]:
def split_df_by_punct(df):
    l = list(split_after(df.to_records(index=False).tolist(), lambda x: x[0][-1] == "."))
    fixed = []
    for sublist in l:
        if sublist[-1][0] == ".": fixed.append(sublist[:-1])
        else:
            sublist[-1] = (list(sublist[-1])[0][:-1], *sublist[-1][1:])
            fixed.append(sublist)
    return fixed

Get Wikipedia page

tag all links and save

In [None]:
def title_to_tag(wiki, tags):
    # wp_title -> wdid -> instance -> tag
    # pair wdid with wp title
    for t in [l.title for l in wiki.filter_wikilinks()]:
        get_wdid_from_wp(str(t))

    Q_list = list(set(wp_title_to_wdid.values()) - {False})
    wdid_to_instance = instance_of(Q_list)
    title_to_instance = {}
    for k, v in wp_title_to_wdid.items():
        if v != False:
            title_to_instance[k[0]] = wdid_to_instance[v]

    tagged_titles = {}
    for title, Q_list in title_to_instance.items():
        options = []
        for Q in Q_list:
            for tag in tags:
                if Q in tags[tag]["not"]:
                    break
                if Q in tags[tag]["include"]:
                    options.append((Q, tag))
        tagged_titles[title] = options
    tagged_titles = {k: most_popular(v) for k, v in tagged_titles.items() if len(v) > 0}


In [17]:
# pair wdid with wp title
for t in [l.title for l in wiki.filter_wikilinks()]:
    get_wdid_from_wp(str(t))

In [18]:
Q_list = list(set(wp_title_to_wdid.values()) - {False})
wdid_to_instance = instance_of(Q_list)

In [19]:
title_to_instance = {}
for k, v in wp_title_to_wdid.items():
    if v != False:
        title_to_instance[k[0]] = wdid_to_instance[v]

pprint(title_to_instance)

{'1 בנובמבר': ['Q14795564'],
 '13 באוקטובר': ['Q14795564'],
 '17 בפברואר': ['Q14795564'],
 '1882': ['Q577'],
 '1925': ['Q577'],
 '1944': ['Q577', 'Q19828'],
 '1951': ['Q577'],
 '1956': ['Q577', 'Q19828'],
 '1958': ['Q577'],
 '1968': ['Q577', 'Q19828'],
 '1973': ['Q577'],
 '1976': ['Q577', 'Q217019'],
 '1984': ['Q577', 'Q19828'],
 '1985': ['Q577'],
 '1987': ['Q577'],
 '1989': ['Q577'],
 '1992': ['Q577', 'Q19828'],
 '1993': ['Q577'],
 '1997': ['Q577'],
 '1998': ['Q577'],
 '2001': ['Q235729'],
 '2002': ['Q235729'],
 '2003': ['Q235729'],
 '2004': ['Q19828'],
 '2005': ['Q235729'],
 '2006': ['Q235729'],
 '2007': ['Q235729'],
 '2008': ['Q19828'],
 '2017': ['Q235729', 'Q235670'],
 '21 בפברואר': ['Q14795564'],
 '29 במרץ': ['Q14795564'],
 '8 באוגוסט': ['Q14795564'],
 'iPad': ['Q811701'],
 'אברהם בורג': ['Q5'],
 'אדפטיישן': ['Q11424'],
 'אוונגליזם': ['Q995347'],
 'אור': [],
 'איור': [],
 'אינטרנט': ['Q11224256', 'Q1301371'],
 'איראן': ['Q6256', 'Q3624078', 'Q672729'],
 'אליס מונרו': ['Q5'],
 'אנג

In [21]:
with open("tags.yaml", "r", encoding="utf-8") as f:
    tags = load(f, Loader=Loader)

In [184]:
wp_title_to_wdid

{('שבועון'): False,
 ('קונדה נאסט (קבוצת מדיה)'): False,
 ('דייוויד רמניק'): 'Q1176307',
 ('אנגלית אמריקנית'): 'Q7976',
 ('ניו יורק'): 'Q60',
 ('2017'): 'Q25290',
 ('ארצות הברית'): 'Q30',
 ('1925'): 'Q18107',
 ('אנגלית'): 'Q1860',
 ('כתב עת'): 'Q41298',
 ('ביקורת אומנות'): False,
 ('מסה (חיבור עיוני)'): 'Q35760',
 ('שירה'): 'Q482',
 ('סיפורת'): 'Q8253',
 ('קריקטורה'): 'Q482919',
 ('17 בפברואר'): 'Q2341',
 ('21 בפברואר'): 'Q2350',
 ('ניו יורק טיימס'): 'Q9684',
 ('הומור'): 'Q35874',
 ('לייף'): 'Q463198',
 ('1951'): 'Q18591',
 ('מלחמת העולם השנייה'): 'Q362',
 ('הירושימה'): 'Q34664',
 ('המאה העשרים'): False,
 ('המאה העשרים ואחת'): False,
 ('אליס מונרו'): 'Q234819',
 ('הארוקי מורקמי'): False,
 ('ולדימיר נבוקוב'): 'Q36591',
 ('פיליפ רות'): 'Q187019',
 ("ג'רום דייוויד סלינג'ר"): 'Q79904',
 ("ג'ון אפדייק"): 'Q105756',
 ("ריצ'רד ייטס"): 'Q544611',
 ("שירלי ג'קסון"): 'Q239910',
 ('סוריאליזם'): 'Q39427',
 ('אוונגליזם'): 'Q194253',
 ('תסמונת מינכהאוזן באמצעות שליח'): 'Q1414765',
 ('אברהם בורג'): '

In [22]:
for k, v in tags.items():
    if type(v) == list:
        tags[k] = {"include": v, "exclude": None, "not": None}
    else:
        tags[k].setdefault("exclude", None)
        tags[k].setdefault("not", None)

for k, v in tags.items():
    tags[k]["include"] = get_subclasses(v["include"], exclude=v["exclude"])
    tags[k]["not"] = get_subclasses(v["not"])

In [24]:
tagged_titles = {}
for title, Q_list in title_to_instance.items():
    options = []
    for Q in Q_list:
        for tag in tags:
            if Q in tags[tag]["not"]:
                break
            if Q in tags[tag]["include"]:
                options.append((Q, tag))
    tagged_titles[title] = options
pprint(tagged_titles)

tagged_titles = {k: most_popular(v) for k, v in tagged_titles.items() if len(v) > 0}

In [168]:
final_df

Unnamed: 0,untokenized_word,tag
0,הניו,O
1,יורקר,O
2,(ב,O
3,אנגלית,B-LANGUAGE
4,:,O
...,...,...
2260,של,O
2261,כתב,O
2262,הניו,O
2263,יורקר,O


In [177]:
def tokenize_df(df):
    ht = HebTokenizer()
    extra_columns = set(df.columns.tolist()
    print(extra_columns)
    df["untokenized_word"] = df["untokenized_word"].apply(ht.tokenize)
    df = df.explode("untokenized_word")
    df[["word", "token"]] = pd.DataFrame(
        df["untokenized_word"].tolist(), index=df.index
    )
    return df[["word", *extra_columns, "token"]].reset_index(drop=True)

In [181]:
pd.DataFrame.from_dict(tags)

Unnamed: 0,PERSON,NORP,FAC,ORG,GPE,LOC,EVENT,WORK_OF_ART,LAW,LANGUAGE,DATE
include,"[Q42301900, Q8031410, Q4306757, Q2662735, Q557...","[Q57413580, Q1184788, Q12347356, Q359882, Q238...","[Q6390435, Q12745000, Q73998508, Q3758835, Q25...","{Q12540664, Q3488704, Q57655560, Q18411470, Q1...","[Q182875, Q15092269, Q206824, Q282587, Q548317...","[Q9200005, Q3031290, Q1805497, Q755017, Q13516...","[Q21697714, Q60221005, Q3621221, Q1139274, Q18...","{Q65464693, Q6390435, Q62586083, Q65179965, Q6...","[Q20755435, Q712597, Q255232, Q2256985, Q20727...","[Q11002922, Q833796, Q3192221, Q7007228, Q7377...","[Q235684, Q7452865, Q15629794, Q630830, Q13004..."
exclude,,,,"[Q2097994, Q7210356]",,,,[Q14897293],,,
not,[],[],"[Q28754053, Q1360517, Q36509592, Q11347580, Q8...",[],[],[],[],"[Q20755435, Q712597, Q9638611, Q255232, Q22569...",[],[],[]


In [179]:
import re
scanner = re.Scanner([
    ('whitespace', r'\s+'),
    ('plus', r'\+'),
    ('minus', r'\-'),
    ('mult', r'\*'),
    ('div', r'/'),
    ('num', r'\d+'),
    ('paren_open', r'\('),
    ('paren_close', r'\)'),
])

for token, match in scanner.scan('(1 + 2) * 3'):
    print (token, match.group())

ValueError: not enough values to unpack (expected 2, got 0)

In [178]:
tokenize_df(final_df)

untokenized_word


ValueError: Columns must be same length as key

In [None]:
with open("tagged_titles.json", "w", encoding="utf-8") as fp:
    json.dump(tagged_titles, fp, ensure_ascii=False)

In [167]:
from datetime import datetime
now = "now"
print(f"now is {datetime.now()}")
print(now)

now is 2020-01-21 14:17:30.608723
now


In [12]:
text = " ".join([s.strip() for s in str.splitlines(wiki.strip_code()) if len(s.strip()) > 0])

In [43]:
df_list = []
for i, sent in enumerate(text.split(". ")):
    yapped = yap_it(sent + ".")
    time.sleep(3)
    df_list.append(pd_lattice(yapped["md_lattice"]))
    if i > 10:
        break 
word_and_pos = pd.concat(df_list)

In [28]:
def tag_wikilink(wl):
    tagged_list = []
    tag = tagged_titles.get(str(wl.title), "O")
    text = str(wl.text).split() if wl.text else str(wl.title).split()
    tags = [f"B-{tag}"] + [f"I-{tag}"] * len(text) if tag != "O" else [tag] * len(text)
    return zip(text, tags)

In [29]:
final = []
for i, n in enumerate(wiki.nodes):
    if not isinstance(n, mwp.nodes.template.Template):
        if isinstance(n, mwp.nodes.text.Text):
            for x in n.split():
                final.append((x, "O"))
        elif isinstance(n, mwp.nodes.wikilink.Wikilink):
            final += tag_wikilink(n)

In [162]:
final_df = pd.DataFrame(final, columns=["untokenized_word", "tag"])
final_df

Unnamed: 0,untokenized_word,tag
0,הניו,O
1,יורקר,O
2,(ב,O
3,אנגלית,B-LANGUAGE
4,:,O
...,...,...
2260,של,O
2261,כתב,O
2262,הניו,O
2263,יורקר,O


In [106]:
ht = HebTokenizer()
final_df["untokenized_word"] = final_df["untokenized_word"].apply(ht.tokenize)
final_df = final_df.explode("untokenized_word")
final_df[["token", "word"]] = pd.DataFrame(final_df["untokenized_word"].tolist(), index=final_df.index)
final_df = final_df[["word", "tag", "token"]].reset_index(drop=True)
# only_tagged_records = [[y for y in x if y[1] != "O"] for x in split_df_by_punct(final_df)]


In [163]:
("a", "b")[::-1]

('b', 'a')

In [44]:
# final_df.to_csv("wikitagged.csv", index=False, encoding="utf-8-sig")
word_and_pos.to_csv("yapped.csv", index=False, encoding="utf-8-sig")

In [63]:
ht = HebTokenizer()
tokenized_df = pd.DataFrame(ht.tokenize(text), columns=["tag", "word"])[["word", "tag"]]
split_tokenized_df = split_df_by_punct(tokenized_df)

In [151]:
def split_BIO(x, y):
    return True if (x[1][0] == "B" and y[1][0] == "B") or (x[1][0] == "I" and y[1][0] == "B") else False

In [159]:
final_final = []
for sent, tagged in zip(split_df_by_punct(tokenized_df), only_tagged_records):
    grouped = list(split_when(tagged, split_BIO))
    found = {}
    last = -1
    for w in grouped:
        for i, word in enumerate(sent):
            if w[0][0] in word[0] and i > last:
                if len(w) > 1:
                    for ii, z in enumerate(w):
                        found[i + ii] = z[1]
                else:
                    found[i] = w[0][1]
                last = i
                break
    indexed = pd.DataFrame.from_dict(found, orient="index", columns=["tag"])
    final_final.append(pd.DataFrame(sent + [(".", "PUNCT")], columns=["word", "tokenized_as"])
                         .join(indexed)
                         .fillna("O")
                      )

pd.concat(final_final).to_csv("final_final.csv", encoding="utf-8-sig")