# Top documents for the Wikipedia 50-topic models

In [109]:
import sys
import os
import pathlib
import numpy as np
import json
import pandas as pd

# Make the parent of the current working directory importable so that the
# `src` package can be resolved (presumably the repository root -- confirm).
repo_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if repo_root not in sys.path:
    # Re-running this cell must not pile up duplicate entries in sys.path.
    sys.path.append(repo_root)

# Drop any cached copy of the selector module so that the import below
# executes its source again instead of reusing the cached module object.
if 'src.top_docs_selection.doc_selector' in sys.modules:
    print("doc_selector already in modules")
    del sys.modules['src.top_docs_selection.doc_selector']

from src.top_docs_selection.doc_selector import DocSelector

doc_selector already in modules


In [110]:
# Notebook-wide pandas display settings: cap the rendered width of cell
# text, show every column, and allow wide rows before wrapping.
for option_name, option_value in (
    ('display.max_colwidth', 100),
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [111]:
def load_model_info(model_path):
    """Load the saved artefacts of a topic model from its folder.

    The model type is read from the stem of ``model_path`` and decides which
    file provides ``thetas``: ``doctopics.npz.npy`` for ``"mallet"`` and
    ``train.theta.npy`` for ``"ctm"``. Every other file name is shared.

    Parameters
    ----------
    model_path : pathlib.Path or str
        Folder holding the model files; its stem must be ``"mallet"`` or
        ``"ctm"``.

    Returns
    -------
    tuple
        ``(thetas, betas, vocab_w2id, bow, keys)`` where ``thetas``,
        ``betas`` and ``bow`` are the arrays returned by ``np.load``,
        ``vocab_w2id`` is the parsed content of ``vocab.json`` and ``keys``
        is the list of stripped lines of ``topics.txt``.

    Raises
    ------
    ValueError
        If the stem of ``model_path`` is not a known model type. Raised
        before any file is read, so callers unpacking the result get a
        meaningful error instead of failing on a ``None`` return value.
    """
    model_path = pathlib.Path(model_path)

    # File that stores the thetas matrix for each supported model type.
    theta_files = {
        "mallet": "doctopics.npz.npy",
        "ctm": "train.theta.npy",
    }
    model_type = model_path.stem
    if model_type not in theta_files:
        raise ValueError(
            f"Model type not recognized: {model_type!r} "
            f"(expected one of {sorted(theta_files)})"
        )

    thetas = np.load(model_path / theta_files[model_type])
    betas = np.load(model_path / "beta.npy")

    # Explicit encoding, consistent with topics.txt below, so the result does
    # not depend on the locale of the machine running the notebook.
    with open(model_path / "vocab.json", encoding="utf8") as infile:
        vocab_w2id = json.load(infile)

    bow = np.load(model_path / "bow_mat.npy")

    # One line per topic; surrounding whitespace and newlines are removed.
    with (model_path / "topics.txt").open('r', encoding='utf8') as fin:
        keys = [line.strip() for line in fin]

    return thetas, betas, vocab_w2id, bow, keys

def create_top_docs_dataframe(method, top_docs, df, keys, thetas, n_top=5, text_col="text"):
    """Build a one-row-per-topic table with the text of each topic's top documents.

    Parameters
    ----------
    method : hashable
        Key of ``top_docs`` selecting which result to tabulate.
    top_docs : mapping
        ``top_docs[method][k][i]`` must be the positional index (as used by
        ``df[text_col].iloc``) of the ``i``-th top document of topic ``k``.
    df : pandas.DataFrame
        Corpus table providing the ``text_col`` column.
    keys : sequence
        One entry per topic, stored verbatim in the ``keys`` column.
    thetas : numpy.ndarray
        Only its transposed length is used: it sets the number of topics
        (rows of the result).
    n_top : int, default 5
        Number of ``top doc`` columns to emit. Every topic must provide at
        least this many documents.
    text_col : str, default "text"
        Column of ``df`` whose content fills the ``top doc`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_tpc``, ``keys`` and ``top doc 1`` .. ``top doc <n_top>``.
    """
    n_topics = len(thetas.T)
    selected = top_docs[method]
    texts = df[text_col]

    columns = {
        "id_tpc": range(n_topics),
        "keys": keys,
    }
    # One column per rank, each listing the document text for every topic.
    for rank in range(n_top):
        columns[f"top doc {rank+1}"] = [
            texts.iloc[selected[topic][rank]] for topic in range(n_topics)
        ]
    return pd.DataFrame(columns)

In [112]:
# Root folder holding the models, the corpus and the exported tables. It
# defaults to the original absolute location; set the THETA_EVAL_DATA_DIR
# environment variable to run the notebook against another copy of the data.
data_dir = pathlib.Path(os.environ.get(
    "THETA_EVAL_DATA_DIR",
    "/export/usuarios_ml4ds/lbartolome/Repos/umd/theta-evaluation/data",
))

# Trained model folders; the folder name doubles as the model type.
mallet_path = data_dir / "models" / "mallet"
ctm_path = data_dir / "models" / "ctm"
# Output root for the exported top-document spreadsheets.
path_save_tops = data_dir / "top_docs"
# Corpus in JSON-lines format and the column used for tokenisation.
corpus_path = data_dir / "train.metadata.jsonl"
text_column = "tokenized_text"

In [113]:
# Read documents (one JSON record per line)
df = pd.read_json(corpus_path, lines=True)

# Positional id of every document: 0 .. len(df) - 1.
df["id_top"] = range(len(df))

# Whitespace-tokenise each document once and reuse the result for both
# derived columns; the assignment order keeps "len" ahead of "text_split".
tokens = df[text_column].apply(lambda x: x.split())
df["len"] = tokens.apply(len)
df["text_split"] = tokens

# Plain list of token lists, later handed to the selector as `corpus`.
corpus = df["text_split"].values.tolist()
df.head()

Unnamed: 0,id,text,supercategory,category,subcategory,page_name,tokenized_text,id_top,len,text_split
0,2_CicelyMaryBarker,Cicely Mary Barker = Cicely Mary Barker ( 28 June 1895 – 16 February 1973 ) was an English illus...,Language and literature,Language and literature,"Writers, publishers, and critics",Cicely Mary Barker,june february english illustrator best known series fantasy illustrations depicting flowers art ...,0,888,"[june, february, english, illustrator, best, known, series, fantasy, illustrations, depicting, f..."
1,210_GertrudeBarrowsBennett,Gertrude Barrows Bennett = Gertrude Barrows Bennett ( 1883 – 1948 ) was the first major female w...,Language and literature,Language and literature,"Writers, publishers, and critics",Gertrude Barrows Bennett,major female writer fantasy science fiction united_states publishing stories pseudonym bennett w...,1,502,"[major, female, writer, fantasy, science, fiction, united_states, publishing, stories, pseudonym..."
2,470_MeraldaWarren,Meralda Warren = Meralda Elva Junior Warren ( born 28 June 1959 ) is an artist and poet of Pitca...,Language and literature,Language and literature,"Writers, publishers, and critics",Meralda Warren,junior warren born june artist poet remote british overseas territory works english island disti...,2,412,"[junior, warren, born, june, artist, poet, remote, british, overseas, territory, works, english,..."
3,588_WilhelmBusch,Wilhelm Busch = Heinrich Christian Wilhelm Busch ( 15 April 1832 – 9 January 1908 ) was a German...,Language and literature,Language and literature,"Writers, publishers, and critics",Wilhelm Busch,christian april january german poet illustrator painter published comic illustrated tales achiev...,3,2521,"[christian, april, january, german, poet, illustrator, painter, published, comic, illustrated, t..."
4,743_GabrielGarcíaMárquez,Gabriel García Márquez = Gabriel José de la Concordia García Márquez ( / ɡɑːrˈsiːə ˈmɑːrkɛs / ; ...,Language and literature,Language and literature,"Writers, publishers, and critics",Gabriel García Márquez,american spanish march april novelist short story writer screenwriter journalist known latin_ame...,4,2241,"[american, spanish, march, april, novelist, short, story, writer, screenwriter, journalist, know..."


In [114]:
# Selection methods evaluated below. The names in the trailing comment look
# like further methods that are currently disabled -- confirm against
# DocSelector before enabling them.
methods = [
    'thetas',
    'thetas_sample',
    'thetas_thr',
]  # 'sall', 'spart', 's3'

# Passed to the selector as `ntop`.
n_top_docs = 5
# Passed to the selector as `top_words` in the CTM run.
n_top_words_s3 = 100

doc_selector = DocSelector()

## Mallet

In [115]:
# Load the Mallet model artefacts. These module-level names are reassigned
# when the CTM model is loaded further down, so run the cells in order.
thetas, betas, vocab_w2id, bow, keys = load_model_info(mallet_path)

In [116]:
# Print the overall range of the loaded thetas as a quick sanity check.
min_thetas, max_thetas = thetas.min(), thetas.max()
print(f"Max/min thetas: {max_thetas} / {min_thetas}")

# Pair handed to the selector as `thr`; presumably lower/upper bounds on the
# theta values -- confirm against DocSelector.
thr = (0.5, 0.9)

Max/min thetas: 0.999596118927002 / 9.460147794015938e-07


In [117]:
# Select the top documents of every topic with each configured method.
top_docs = {
    method: doc_selector.get_top_docs(
        method=method,
        thetas=thetas,
        bow=bow,
        betas=betas,
        corpus=corpus,
        vocab_w2id=vocab_w2id,
        thr=thr,
        top_words=None,
        ntop=n_top_docs,
        model_path=mallet_path,
    )
    for method in methods
}

# Create the output folder once, including any missing parent folders, so
# the export does not fail when the output root does not exist yet.
mallet_save_dir = path_save_tops / mallet_path.name
mallet_save_dir.mkdir(parents=True, exist_ok=True)

for method in top_docs:
    print("##" * 50)
    print(f"-- -- METHOD: {method}")
    df_method = create_top_docs_dataframe(method, top_docs, df, keys, thetas)
    # Preview only the first four topics; the full table goes to the spreadsheet.
    display(df_method.iloc[0:4])
    this_path_save = mallet_save_dir / f"{method}_top_docs.xlsx"
    df_method.to_excel(this_path_save, index=False)
    print("\n")

####################################################################################################
-- -- METHOD: thetas


Unnamed: 0,id_tpc,keys,top doc 1,top doc 2,top doc 3,top doc 4,top doc 5
0,0,ship ships war german fleet guns tons british torpedo long class cruiser service italian mph aug...,Bremen-class cruiser = The Bremen class was a group of seven light cruisers built for the Imperi...,Italian battleship Emanuele Filiberto = The Emanuele Filiberto was a pre-dreadnought battleship ...,SMS Graudenz = SMS Graudenz was the lead ship of her class of light cruisers . She had one siste...,Graudenz-class cruiser = The Graudenz class of light cruisers was a class of two ships built for...,"SMS Lübeck = SMS Lübeck ( "" His Majesty 's Ship Lübeck "" ) was the fourth of seven Bremen-class ..."
1,1,british ships french ship fleet spanish island captain admiral naval port coast sea battle crew ...,Action of 15 July 1798 = The Action of 15 July 1798 was a minor naval battle of the French Revol...,Action of 7 May 1794 = The Action of 7 May 1794 was a minor naval action fought between a Britis...,Action of 10 April 1795 = The Action of 10 April 1795 was a minor naval engagement during the Fr...,Action of 21 October 1794 = The Action of 21 October 1794 was a minor naval engagement between G...,"HMS Concorde ( 1783 ) = Concorde was a 32-gun frigate of the French Navy , lead ship of her clas..."
2,2,horse horses white breed dog black breeds dogs breeding known animals century riding type bred s...,Welara = The Welara is a part-Arabian pony breed developed from the Arabian horse and the Welsh ...,Kentucky Mountain Saddle Horse = The Kentucky Mountain Saddle Horse is a horse breed from the US...,Blue Picardy Spaniel = The Blue Picardy Spaniel ( or Épagneul Bleu de Picardie ) is a breed of S...,"Breton horse = The Breton is a breed of draft horse . It was developed in Brittany , a province ...",Missouri Fox Trotter = The Missouri Fox Trotter is a horse breed from the state of Missouri in t...
3,3,match australia runs test innings england cricket team wickets scored second matches took austra...,Ron Saggers with the Australian cricket team in England in 1948 = Ron Saggers was a member of Do...,Bill Brown with the Australian cricket team in England in 1948 = Bill Brown was a member of Dona...,Colin McCool with the Australian cricket team in England in 1948 = Colin McCool was a member of ...,Don Tallon with the Australian cricket team in England in 1948 = Don Tallon was a key member of ...,1948 Ashes series = The 1948 Ashes series was that year 's edition of the long-standing cricket ...




####################################################################################################
-- -- METHOD: thetas_sample


Unnamed: 0,id_tpc,keys,top doc 1,top doc 2,top doc 3,top doc 4,top doc 5
0,0,ship ships war german fleet guns tons british torpedo long class cruiser service italian mph aug...,Jade-class aircraft carrier = The Jade class comprised a pair of passenger ships intended to be ...,SMS Kaiser ( 1874 ) = SMS Kaiser was the lead ship of the Kaiser-class ironclads ; SMS Deutschla...,Ammiraglio di Saint Bon-class battleship = The Ammiraglio di Saint Bon class was a pair of pre-d...,"SMS Preussen ( 1873 ) = For the battleship of the same name , see SMS Preussen SMS Preussen was ...",SMS Regensburg = SMS Regensburg was a light cruiser of the Graudenz class built by the German Ka...
1,1,british ships french ship fleet spanish island captain admiral naval port coast sea battle crew ...,"Bill King ( Royal Navy officer ) = Commander William Donald Aelian "" Bill "" King , DSO & Bar , D...",Action of 20 October 1793 = The Action of 20 October 1793 was a minor naval engagement of the Fr...,Yorktown campaign = The Yorktown or Virginia campaign was a series of military maneuvers and bat...,HMS Alceste ( 1806 ) = HMS Alceste was a 38-gun frigate of the Royal Navy . She was built at Roc...,"French battleship Bretagne = Bretagne was a battleship of the French Navy built in the 1910s , a..."
2,2,horse horses white breed dog black breeds dogs breeding known animals century riding type bred s...,Haley Farm State Park = Haley Farm State Park is a Connecticut state park preserving Colonial-er...,Kerry Bog Pony = The Kerry Bog Pony is a mountain and moorland breed of pony that originated in ...,"Russkiy Toy = The Russian Toy ( also known as the Russian Toy Terrier , and in Russia as the Rus...","Shetland sheep = The Shetland sheep is a small , wool-producing breed of sheep originating in th...",Bedlington Terrier = The Bedlington Terrier is a breed of small dog named after the mining town ...
3,3,match australia runs test innings england cricket team wickets scored second matches took austra...,Claire Taylor = Samantha Claire Taylor MBE ( born 25 September 1975 ) is a former cricketer who ...,Cameron White = Cameron Leon White ( born 18 August 1983 ) is an Australian cricketer for the Me...,"Sydney punchbowls = The Sydney punchbowls , made in China during Emperor Chia Ch 'ing 's reign i...",Edward Sainsbury = Edward Sainsbury ( 5 July 1851 – 28 October 1930 ) was an English cricketer w...,Doug Ring = Douglas Thomas Ring ( 14 October 1918 – 23 June 2003 ) was an Australian cricketer w...




####################################################################################################
-- -- METHOD: thetas_thr


Unnamed: 0,id_tpc,keys,top doc 1,top doc 2,top doc 3,top doc 4,top doc 5
0,0,ship ships war german fleet guns tons british torpedo long class cruiser service italian mph aug...,Italian ironclad Sardegna = Sardegna was the third of three Re Umberto-class ironclad battleship...,Italian battleship Napoli = Napoli was a Regina Elena-class pre-dreadnought battleship built for...,SM UB-3 = SM UB-3 was a German Type UB I submarine or U-boat in the German Imperial Navy ( Germa...,Italian cruiser Coatit = Coatit was a torpedo cruiser of the Italian Regia Marina built in the l...,German submarine U-42 ( 1939 ) = German submarine U-42 was a Type IXA U-boat of Nazi Germany 's ...
1,1,british ships french ship fleet spanish island captain admiral naval port coast sea battle crew ...,Action of 27 June 1798 = The Action of 27 June 1798 was a minor naval engagement between British...,Sunda Strait campaign of January 1794 = The Sunda Strait campaign of January 1794 was a series o...,Battle of Tellicherry = The Battle of Tellicherry was a naval action fought off the Indian port ...,Action of 18 August 1798 = The Action of 18 August 1798 was a minor naval engagement of the Fren...,Action of 16 October 1799 = The Action of 16 October 1799 was a minor naval engagement during th...
2,2,horse horses white breed dog black breeds dogs breeding known animals century riding type bred s...,Parson Russell Terrier = The Parson Russell Terrier is a breed of small white terrier that was t...,Shire horse = The Shire horse is a breed of draught horse ( BrE ) or draft horse ( AmE ) . The b...,"Dølehest = The Dole Gudbrandsdal , Dølahest or Dole is a draft- and harness-type horse from Norw...",American Cocker Spaniel = The American Cocker Spaniel is a breed of sporting dog . It is a spani...,Colorado Ranger = The Colorado Ranger is a horse breed from the Colorado High Plains in the Unit...
3,3,match australia runs test innings england cricket team wickets scored second matches took austra...,Edward Sainsbury = Edward Sainsbury ( 5 July 1851 – 28 October 1930 ) was an English cricketer w...,"Mick Harvey ( umpire ) = Clarence Edgar ( Mick ) Harvey , ( born 17 March 1921 at Newcastle , Ne...",Bill Johnston ( cricketer ) = William Arras Johnston ( 26 February 1922 – 25 May 2007 ) was an A...,Claire Taylor = Samantha Claire Taylor MBE ( born 25 September 1975 ) is a former cricketer who ...,"Bob Willis = Robert George Dylan Willis MBE ( born Robert George Willis on 30 May 1949 ) , known..."






## CTM

In [118]:
# Load the CTM model artefacts. This overwrites the module-level names that
# previously held the Mallet model, so the cells below operate on CTM.
thetas, betas, vocab_w2id, bow, keys = load_model_info(ctm_path)

In [119]:
# Print the overall range of the loaded thetas as a quick sanity check.
min_thetas, max_thetas = thetas.min(), thetas.max()
print(f"Max/min thetas: {max_thetas} / {min_thetas}")

# Pair handed to the selector as `thr`; presumably lower/upper bounds on the
# theta values -- confirm against DocSelector.
thr = (0.5, 0.9)

Max/min thetas: 0.9954009354114532 / 1.8919544385198605e-05


In [120]:
# Select the top documents of every topic with each configured method.
top_docs = {
    method: doc_selector.get_top_docs(
        method=method,
        thetas=thetas,
        bow=bow,
        betas=betas,
        corpus=corpus,
        vocab_w2id=vocab_w2id,
        thr=thr,
        top_words=n_top_words_s3,
        ntop=n_top_docs,
        model_path=ctm_path,
    )
    for method in methods
}

# Create the output folder once, including any missing parent folders, so
# the export does not fail when the output root does not exist yet.
ctm_save_dir = path_save_tops / ctm_path.name
ctm_save_dir.mkdir(parents=True, exist_ok=True)

for method in top_docs:
    print("##" * 50)
    print(f"-- -- METHOD: {method}")
    df_method = create_top_docs_dataframe(method, top_docs, df, keys, thetas)
    # Preview only the first four topics; the full table goes to the spreadsheet.
    display(df_method.iloc[0:4])
    this_path_save = ctm_save_dir / f"{method}_top_docs.xlsx"
    df_method.to_excel(this_path_save, index=False)
    print("\n")

####################################################################################################
-- -- METHOD: thetas


Unnamed: 0,id_tpc,keys,top doc 1,top doc 2,top doc 3,top doc 4,top doc 5
0,0,credits composition allmusic listing charts piano drums beats liner label recording download bal...,"Oh Blue Christmas = Oh Blue Christmas is an EP by the American band A Fine Frenzy , released in ...",The Stripped Mixes = The Stripped Mixes ( released on CD for a limited time as The Motown 50 Mix...,"A Different Kind of Love Song = "" A Different Kind of Love Song "" is a song by American recordin...","Electric ( Robyn song ) = "" Electric "" is a song by Swedish recording artist Robyn from her seco...","No Quiero Saber = "" No Quiero Saber "" is a song recorded by American recording artist Selena . I..."
1,1,film release released million story scene scenes script effects production opening plot filming ...,"Batman in film = The fictional character Batman , a comic book superhero featured in DC Comics p...","Toy Story 3 = Toy Story 3 is a 2010 American 3D computer-animated comedy-drama film , and the th...",Batman ( 1989 film ) = Batman is a 1989 American superhero film directed by Tim Burton and produ...,Toy Story 2 = Toy Story 2 is a 1999 American computer-animated comedy adventure film produced by...,Toy Story = Toy Story is a 1995 American computer-animated adventure buddy comedy film produced ...
2,2,career hit games season league baseball major_league_baseball signed home played manager profess...,"Jon Lieber = Jonathan Ray Lieber ( born April 2 , 1970 ) is a former Major League Baseball ( MLB...","John Bowker ( baseball ) = John Brite Bowker ( born July 8 , 1983 ) is an American professional ...","Will Venable = William Dion Venable ( born October 29 , 1982 ) is an American professional baseb...","Eric Hacker = Eric Lynn Hacker ( born March 26 , 1983 ) is an American professional baseball pit...","Denny Bautista = Denny M. Bautista Germán ( born August 23 , 1980 ) is a Dominican professional ..."
3,3,work life wrote writing published book written women works history world writer woman early book...,"Feminism = Feminism is a range of political movements , ideologies , and social movements that s...",New Age = The New Age is a term applied to a range of spiritual or religious beliefs and practic...,"Objections to evolution = Scholars , theologians and lay-people have raised objections to evolut...",Alfred North Whitehead = Alfred North Whitehead OM FRS ( 15 February 1861 – 30 December 1947 ) w...,"Benjamin Lee Whorf = Benjamin Lee Whorf ( / wɔːrf / ; April 24 , 1897 – July 26 , 1941 ) was an ..."




####################################################################################################
-- -- METHOD: thetas_sample


Unnamed: 0,id_tpc,keys,top doc 1,top doc 2,top doc 3,top doc 4,top doc 5
0,0,credits composition allmusic listing charts piano drums beats liner label recording download bal...,"F. Emasculata = "" F. Emasculata "" is the twenty-second episode of the second season of the Ameri...","The One I Love ( manga ) = The One I Love ( Japanese : わたしのすきなひと , Hepburn : Watashi no Sukinahi...","American Dream ( Casting Crowns song ) = "" American Dream "" is a song recorded by Christian rock...","Te Quiero ( Ricardo Arjona song ) = "" Te Quiero "" is a latin pop song by Guatemalan recording ar...","Somebody to Love ( 30 Rock ) = "" Somebody to Love "" is the sixth episode of the second season of..."
1,1,film release released million story scene scenes script effects production opening plot filming ...,"Halifax Central Library = The Halifax Central Library is a public library in Halifax , Nova Scot...",Arkham Asylum : A Serious House on Serious Earth = Arkham Asylum : A Serious House on Serious Ea...,"Your and My Secret = Your and My Secret , known in Japan as lit . "" My and Her Three X 's "" ( Ja...","New Cutie Honey = New Cutie Honey ( Japanese : 新 ・ キューティーハニー , Hepburn : Shin Kyūtī Hanī , also ...","Equine nutrition = Equine nutrition is the feeding of horses , ponies , mules , donkeys , and ot..."
2,2,career hit games season league baseball major_league_baseball signed home played manager profess...,Washington State Route 516 = State Route 516 ( SR 516 ) is a 16.49-mile-long ( 26.54 km ) state ...,"Ice Box Chamberlain = Elton P. "" Ice Box "" Chamberlain ( November 5 , 1867 – September 22 , 1929...",Minnesota Golden Gophers men 's basketball = The Minnesota Golden Gophers men 's basketball team...,French battleship Vergniaud = Vergniaud was one of the six Danton class semi-dreadnought battles...,Moondance = Moondance is the third studio album by Northern Irish singer-songwriter Van Morrison...
3,3,work life wrote writing published book written women works history world writer woman early book...,Parliament Act 1911 = The Parliament Act 1911 is an Act of the Parliament of the United Kingdom ...,"Dominica at the 2008 Summer Olympics = Dominica sent a delegation of eight people , including tw...",12 Days = 12 Days is a debut graphic novel / global manga written and illustrated by June Kim . ...,"Third Epistle of John = The Third Epistle of John , often referred to as Third John and written ...","Delaware Route 58 = Delaware Route 58 ( DE 58 ) , also known as Churchmans Road , is a state hig..."




####################################################################################################
-- -- METHOD: thetas_thr


Unnamed: 0,id_tpc,keys,top doc 1,top doc 2,top doc 3,top doc 4,top doc 5
0,0,credits composition allmusic listing charts piano drums beats liner label recording download bal...,Cicely Mary Barker = Cicely Mary Barker ( 28 June 1895 – 16 February 1973 ) was an English illus...,Field hockey pitch = A hockey pitch is the playing surface for the game of field hockey . Histor...,Transport in the Soviet Union = Transport in the Union of Soviet Socialist Republics ( USSR ) wa...,Amy Wyatt = Amy Marie Violet Wyatt is a fictional character from the British soap opera Emmerdal...,Ivan Jones ( Emmerdale ) = Ivan Jones is a fictional character in the British soap opera Emmerda...
1,1,film release released million story scene scenes script effects production opening plot filming ...,"Batman in film = The fictional character Batman , a comic book superhero featured in DC Comics p...","Jack Fingleton = John "" Jack "" Henry Webb Fingleton OBE ( 28 April 1908 – 22 November 1981 ) was...",Delhi Daredevils in 2012 = The Delhi Daredevils ( DD ) is a franchise cricket team based in Delh...,"Curtly Ambrose = Sir Curtly Elconn Lynwall Ambrose , KCN ( born 21 September 1963 ) is a former ...",Irfan Pathan = Irfan Khan Pathan ( pronunciation ; born 27 October 1984 ) is an Indian cricketer...
2,2,career hit games season league baseball major_league_baseball signed home played manager profess...,"Jon Lieber = Jonathan Ray Lieber ( born April 2 , 1970 ) is a former Major League Baseball ( MLB...","John Bowker ( baseball ) = John Brite Bowker ( born July 8 , 1983 ) is an American professional ...","Will Venable = William Dion Venable ( born October 29 , 1982 ) is an American professional baseb...","Eric Hacker = Eric Lynn Hacker ( born March 26 , 1983 ) is an American professional baseball pit...","Denny Bautista = Denny M. Bautista Germán ( born August 23 , 1980 ) is a Dominican professional ..."
3,3,work life wrote writing published book written women works history world writer woman early book...,Cicely Mary Barker = Cicely Mary Barker ( 28 June 1895 – 16 February 1973 ) was an English illus...,Field hockey pitch = A hockey pitch is the playing surface for the game of field hockey . Histor...,Transport in the Soviet Union = Transport in the Union of Soviet Socialist Republics ( USSR ) wa...,Amy Wyatt = Amy Marie Violet Wyatt is a fictional character from the British soap opera Emmerdal...,Ivan Jones ( Emmerdale ) = Ivan Jones is a fictional character in the British soap opera Emmerda...




