In [2]:
import pandas as pd
import numpy as np

# scikit-learn utilities
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# scikit-learn models
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression

In [151]:
lgb_table = pd.read_csv("../data/lgb_table_ids.csv")
lgb_pages = pd.read_csv("../data/lgb_pages_v2.csv")

In [158]:
lgb_pages_nodups = lgb_pages.drop_duplicates()

In [17]:
# lgb_table_nodups = lgb_table.loc[:, ['Title', 'Notes']].drop_duplicates()

In [159]:
lgb_merged = pd.merge(lgb_table, lgb_pages_nodups, how='inner', on='pageid')

In [97]:
# lgb_merged = pd.merge(lgb_table, lgb_pages, how='inner', left_on='Title', right_on='title')

In [162]:
lgb_merged.loc[lgb_merged.Notes == 'G.', 'Notes'] = 'G'

In [260]:
# relabel both Kray twins as B, not B & G
# Ronnie Kray describes himself as both homosexual and bisexual 
# according to his wiki entry
lgb_merged.loc[lgb_merged.title == 'Kray twins', 'Notes'] = 'B'

In [261]:
lgb_merged_nodups = lgb_merged.loc[:, ['Notes', 'title', 'pageid', 'url', 'summary', 'content']].drop_duplicates()

In [281]:
lgb_merged_nodups.reset_index(drop=True, inplace=True)

In [101]:
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()), 
                    ('svm', LinearSVC(loss='hinge'))])

In [282]:
X = lgb_merged_nodups['content']
y = lgb_merged_nodups['Notes']

In [283]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, 
                                                        train_size=0.75, test_size=0.25)
svm_clf.fit(X_train, y_train)
y_svm = svm_clf.predict(X_test)

In [284]:
print(classification_report(y_test, y_svm))

             precision    recall  f1-score   support

          B       1.00      0.03      0.07        86
          G       0.90      0.98      0.94       474
          L       0.76      0.94      0.84       161

avg / total       0.88      0.86      0.81       721



In [265]:
accuracy_score(y_test, y_svm)

0.86130374479889038

In [167]:
lgb_merged.Notes.describe()

count     2896
unique       3
top          G
freq      1892
Name: Notes, dtype: object

In [168]:
lgb_merged.groupby('Notes')['Name'].count()

Notes
B     342
G    1892
L     662
Name: Name, dtype: int64

In [239]:
lgb_merged.loc[:, ['Notes', 'pageid']].drop_duplicates().groupby('Notes')['pageid'].count()

Notes
B     340
G    1884
L     659
Name: pageid, dtype: int64

In [169]:
lgb_merged.shape

(2896, 11)

In [170]:
from sklearn.metrics import confusion_matrix

In [266]:
print(confusion_matrix(y_test, y_svm))

[[  3  42  41]
 [  0 466   8]
 [  0   9 152]]


In [267]:
lr_clf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()), 
                   ('lr', LogisticRegression())])

In [268]:
lr_clf.fit(X_train, y_train)
y_lr = lr_clf.predict(X_test)

In [269]:
print(classification_report(y_test, y_lr))

             precision    recall  f1-score   support

          B       1.00      0.02      0.05        86
          G       0.89      0.99      0.93       474
          L       0.75      0.89      0.81       161

avg / total       0.87      0.85      0.80       721



In [270]:
accuracy_score(y_test, y_lr)

0.85020804438280162

In [271]:
print(confusion_matrix(y_test, y_lr))

[[  2  42  42]
 [  0 467   7]
 [  0  17 144]]


In [272]:
lgb_merged.iloc[y_test.index].loc[lgb_merged.Notes == 'B']

Unnamed: 0,Name,Title,Lifetime,Nationality,Notable as,Notes,pageid,title,url,summary,content
1137,Devin K. Grayson,Devin K. Grayson,b. ?,American,Comic book writer and novelist,B,2.01106e+06,Devin K. Grayson,https://en.wikipedia.org/wiki/Devin_K._Grayson,Devin Kalile Grayson is an American writer of ...,Devin Kalile Grayson is an American writer of ...
953,Gustave Flaubert,Gustave Flaubert,1821–1880,French,Author,B,12307,Gustave Flaubert,https://en.wikipedia.org/wiki/Gustave_Flaubert,Gustave Flaubert (French: [ɡystav flobɛʁ]; 12 ...,Gustave Flaubert (French: [ɡystav flobɛʁ]; 12 ...
822,Andy Dick,Andy Dick,b. 1965,American,"Actor, comedian",B,641959,Andy Dick,https://en.wikipedia.org/wiki/Andy_Dick,"Andrew Roane Dick (born December 21, 1965) is ...","Andrew Roane Dick (born December 21, 1965) is ..."
1280,Aaron Henry,Aaron Henry,1922–1977,American,"Civil rights leader, politician",B,5.50846e+06,Aaron Henry,https://en.wikipedia.org/wiki/Aaron_Henry,"Aaron Henry (July 2, 1922 – May 19, 1997) was ...","Aaron Henry (July 2, 1922 – May 19, 1997) was ..."
2596,Rebecca Sugar,Rebecca Sugar,b. 1987,American,"Animator, composer, director, show creator",B,3.91777e+07,Rebecca Sugar,https://en.wikipedia.org/wiki/Rebecca_Sugar,"Rebecca Sugar (born July 9, 1987) is an Americ...","Rebecca Sugar (born July 9, 1987) is an Americ..."
2729,Yona Wallach,Yona Wallach,1944–1985,Israeli,Poet,B,2.94592e+06,Yona Wallach,https://en.wikipedia.org/wiki/Yona_Wallach,"Yona Wallach (Hebrew: יונה וולך‎; June 10, 194...","Yona Wallach (Hebrew: יונה וולך‎; June 10, 194..."
745,Tom Daley,Tom Daley,b. 1994,British,Olympic diver,B,9.4735e+06,Tom Daley,https://en.wikipedia.org/wiki/Tom_Daley,Thomas Robert Daley (born 21 May 1994) is a Br...,Thomas Robert Daley (born 21 May 1994) is a Br...
198,Tallulah Bankhead,Tallulah Bankhead,1902–1968,American,Actress,B,229818,Tallulah Bankhead,https://en.wikipedia.org/wiki/Tallulah_Bankhead,"Tallulah Brockman Bankhead (January 31, 1902 –...","Tallulah Brockman Bankhead (January 31, 1902 –..."
437,Romaine Brooks,Romaine Brooks,1874–1970,American,Painter,B,1.60105e+06,Romaine Brooks,https://en.wikipedia.org/wiki/Romaine_Brooks,"Romaine Brooks, born Beatrice Romaine Goddard ...","Romaine Brooks, born Beatrice Romaine Goddard ..."
2892,Aileen Wuornos,Aileen Wuornos,1956–2002,American,Serial killer,B,214928,Aileen Wuornos,https://en.wikipedia.org/wiki/Aileen_Wuornos,Aileen Carol Wuornos Pralle (born Aileen Carol...,Aileen Carol Wuornos Pralle (born Aileen Carol...


In [273]:
test_df = pd.DataFrame({'predicted': y_svm, 'actual': y_test}, index=y_test.index)

In [274]:
test_df_merged = test_df.merge(lgb_merged, left_index=True, right_index=True)

In [275]:
test_df_merged.loc[test_df_merged.actual == 'B']

Unnamed: 0,actual,predicted,Name,Title,Lifetime,Nationality,Notable as,Notes,pageid,title,url,summary,content
1137,B,L,Devin K. Grayson,Devin K. Grayson,b. ?,American,Comic book writer and novelist,B,2.01106e+06,Devin K. Grayson,https://en.wikipedia.org/wiki/Devin_K._Grayson,Devin Kalile Grayson is an American writer of ...,Devin Kalile Grayson is an American writer of ...
953,B,G,Gustave Flaubert,Gustave Flaubert,1821–1880,French,Author,B,12307,Gustave Flaubert,https://en.wikipedia.org/wiki/Gustave_Flaubert,Gustave Flaubert (French: [ɡystav flobɛʁ]; 12 ...,Gustave Flaubert (French: [ɡystav flobɛʁ]; 12 ...
822,B,G,Andy Dick,Andy Dick,b. 1965,American,"Actor, comedian",B,641959,Andy Dick,https://en.wikipedia.org/wiki/Andy_Dick,"Andrew Roane Dick (born December 21, 1965) is ...","Andrew Roane Dick (born December 21, 1965) is ..."
1280,B,G,Aaron Henry,Aaron Henry,1922–1977,American,"Civil rights leader, politician",B,5.50846e+06,Aaron Henry,https://en.wikipedia.org/wiki/Aaron_Henry,"Aaron Henry (July 2, 1922 – May 19, 1997) was ...","Aaron Henry (July 2, 1922 – May 19, 1997) was ..."
2596,B,L,Rebecca Sugar,Rebecca Sugar,b. 1987,American,"Animator, composer, director, show creator",B,3.91777e+07,Rebecca Sugar,https://en.wikipedia.org/wiki/Rebecca_Sugar,"Rebecca Sugar (born July 9, 1987) is an Americ...","Rebecca Sugar (born July 9, 1987) is an Americ..."
2729,B,L,Yona Wallach,Yona Wallach,1944–1985,Israeli,Poet,B,2.94592e+06,Yona Wallach,https://en.wikipedia.org/wiki/Yona_Wallach,"Yona Wallach (Hebrew: יונה וולך‎; June 10, 194...","Yona Wallach (Hebrew: יונה וולך‎; June 10, 194..."
745,B,G,Tom Daley,Tom Daley,b. 1994,British,Olympic diver,B,9.4735e+06,Tom Daley,https://en.wikipedia.org/wiki/Tom_Daley,Thomas Robert Daley (born 21 May 1994) is a Br...,Thomas Robert Daley (born 21 May 1994) is a Br...
198,B,L,Tallulah Bankhead,Tallulah Bankhead,1902–1968,American,Actress,B,229818,Tallulah Bankhead,https://en.wikipedia.org/wiki/Tallulah_Bankhead,"Tallulah Brockman Bankhead (January 31, 1902 –...","Tallulah Brockman Bankhead (January 31, 1902 –..."
437,B,L,Romaine Brooks,Romaine Brooks,1874–1970,American,Painter,B,1.60105e+06,Romaine Brooks,https://en.wikipedia.org/wiki/Romaine_Brooks,"Romaine Brooks, born Beatrice Romaine Goddard ...","Romaine Brooks, born Beatrice Romaine Goddard ..."
2892,B,L,Aileen Wuornos,Aileen Wuornos,1956–2002,American,Serial killer,B,214928,Aileen Wuornos,https://en.wikipedia.org/wiki/Aileen_Wuornos,Aileen Carol Wuornos Pralle (born Aileen Carol...,Aileen Carol Wuornos Pralle (born Aileen Carol...


In [276]:
test_df_merged.query("actual == 'B' & predicted == 'B'")

Unnamed: 0,actual,predicted,Name,Title,Lifetime,Nationality,Notable as,Notes,pageid,title,url,summary,content
2455,B,B,Judee Sill,Judee Sill,1944–1979,American,Singer and songwriter,B,1847120.0,Judee Sill,https://en.wikipedia.org/wiki/Judee_Sill,"Judee Sill (born Judith Lynne Sill, October 7,...","Judee Sill (born Judith Lynne Sill, October 7,..."
2384,B,B,Avy Scott,Avy Scott,b. 1981,American,Porn star,B,1871540.0,Avy Scott,https://en.wikipedia.org/wiki/Avy_Scott,"Avy Scott (born in November 2, 1981) is the st...","Avy Scott (born in November 2, 1981) is the st..."
443,B,B,Erin Brown,Erin Brown,b. 1979,English,"Actor, model, filmmaker",B,6207590.0,Erin Brown,https://en.wikipedia.org/wiki/Erin_Brown,"Erin Brown (born Erin DeWright; October 16, 19...","Erin Brown (born Erin DeWright; October 16, 19..."


In [277]:
test_df_merged.query("actual == 'B' & predicted == 'G'")

Unnamed: 0,actual,predicted,Name,Title,Lifetime,Nationality,Notable as,Notes,pageid,title,url,summary,content
953,B,G,Gustave Flaubert,Gustave Flaubert,1821–1880,French,Author,B,12307.0,Gustave Flaubert,https://en.wikipedia.org/wiki/Gustave_Flaubert,Gustave Flaubert (French: [ɡystav flobɛʁ]; 12 ...,Gustave Flaubert (French: [ɡystav flobɛʁ]; 12 ...
822,B,G,Andy Dick,Andy Dick,b. 1965,American,"Actor, comedian",B,641959.0,Andy Dick,https://en.wikipedia.org/wiki/Andy_Dick,"Andrew Roane Dick (born December 21, 1965) is ...","Andrew Roane Dick (born December 21, 1965) is ..."
1280,B,G,Aaron Henry,Aaron Henry,1922–1977,American,"Civil rights leader, politician",B,5508460.0,Aaron Henry,https://en.wikipedia.org/wiki/Aaron_Henry,"Aaron Henry (July 2, 1922 – May 19, 1997) was ...","Aaron Henry (July 2, 1922 – May 19, 1997) was ..."
745,B,G,Tom Daley,Tom Daley,b. 1994,British,Olympic diver,B,9473500.0,Tom Daley,https://en.wikipedia.org/wiki/Tom_Daley,Thomas Robert Daley (born 21 May 1994) is a Br...,Thomas Robert Daley (born 21 May 1994) is a Br...
769,B,G,Ron Davies,Ron Davies (British politician),b. 1946,Welsh,Politician,B,359999.0,Ron Davies (Welsh politician),https://en.wikipedia.org/wiki/Ron_Davies_(Wels...,Ronald Davies (born 6 August 1946) is a Welsh ...,Ronald Davies (born 6 August 1946) is a Welsh ...
1757,B,G,Logan McCree,Logan McCree,b. 1977,German,Pornographic Actor,B,26133600.0,Logan McCree,https://en.wikipedia.org/wiki/Logan_McCree,Logan McCree (born Philipp Tanzer on December ...,Logan McCree (born Philipp Tanzer on December ...
2660,B,G,Tiberius,Tiberius,42 BC – 37 AD,Roman,Emperor,B,30536.0,Tiberius,https://en.wikipedia.org/wiki/Tiberius,Tiberius (Latin: Tiberius Caesar Dīvī Augustī ...,Tiberius (Latin: Tiberius Caesar Dīvī Augustī ...
467,B,G,Horst Buchholz,Horst Buchholz,1933–2003,German,Actor,B,192115.0,Horst Buchholz,https://en.wikipedia.org/wiki/Horst_Buchholz,"Horst Werner Buchholz (December 4, 1933 – Marc...","Horst Werner Buchholz (December 4, 1933 – Marc..."
434,B,G,Rupert Brooke,Rupert Brooke,1887–1915,English,Poet,B,62355.0,Rupert Brooke,https://en.wikipedia.org/wiki/Rupert_Brooke,Rupert Chawner Brooke (middle name sometimes g...,Rupert Chawner Brooke (middle name sometimes g...
1072,B,G,Thea Gill,Thea Gill,b. 1970,Canadian,Actor,B,1122020.0,Thea Gill,https://en.wikipedia.org/wiki/Thea_Gill,"Thea Louise Gill (; born April 5, 1970) is a C...","Thea Louise Gill (; born April 5, 1970) is a C..."


In [278]:
test_df_merged.query("actual == 'B' & predicted == 'L'")

Unnamed: 0,actual,predicted,Name,Title,Lifetime,Nationality,Notable as,Notes,pageid,title,url,summary,content
1137,B,L,Devin K. Grayson,Devin K. Grayson,b. ?,American,Comic book writer and novelist,B,2011060.0,Devin K. Grayson,https://en.wikipedia.org/wiki/Devin_K._Grayson,Devin Kalile Grayson is an American writer of ...,Devin Kalile Grayson is an American writer of ...
2596,B,L,Rebecca Sugar,Rebecca Sugar,b. 1987,American,"Animator, composer, director, show creator",B,39177700.0,Rebecca Sugar,https://en.wikipedia.org/wiki/Rebecca_Sugar,"Rebecca Sugar (born July 9, 1987) is an Americ...","Rebecca Sugar (born July 9, 1987) is an Americ..."
2729,B,L,Yona Wallach,Yona Wallach,1944–1985,Israeli,Poet,B,2945920.0,Yona Wallach,https://en.wikipedia.org/wiki/Yona_Wallach,"Yona Wallach (Hebrew: יונה וולך‎; June 10, 194...","Yona Wallach (Hebrew: יונה וולך‎; June 10, 194..."
198,B,L,Tallulah Bankhead,Tallulah Bankhead,1902–1968,American,Actress,B,229818.0,Tallulah Bankhead,https://en.wikipedia.org/wiki/Tallulah_Bankhead,"Tallulah Brockman Bankhead (January 31, 1902 –...","Tallulah Brockman Bankhead (January 31, 1902 –..."
437,B,L,Romaine Brooks,Romaine Brooks,1874–1970,American,Painter,B,1601050.0,Romaine Brooks,https://en.wikipedia.org/wiki/Romaine_Brooks,"Romaine Brooks, born Beatrice Romaine Goddard ...","Romaine Brooks, born Beatrice Romaine Goddard ..."
2892,B,L,Aileen Wuornos,Aileen Wuornos,1956–2002,American,Serial killer,B,214928.0,Aileen Wuornos,https://en.wikipedia.org/wiki/Aileen_Wuornos,Aileen Carol Wuornos Pralle (born Aileen Carol...,Aileen Carol Wuornos Pralle (born Aileen Carol...
2752,B,L,Ethel Waters,Ethel Waters,1896–1977,American,"Jazz/pop musician, actor",B,171252.0,Ethel Waters,https://en.wikipedia.org/wiki/Ethel_Waters,"Ethel Waters (October 31, 1896 – September 1, ...","Ethel Waters (October 31, 1896 – September 1, ..."
302,B,L,Kajsa Bergqvist,Kajsa Bergqvist,b. 1976,Swedish,High Jumper,B,2260140.0,Kajsa Bergqvist,https://en.wikipedia.org/wiki/Kajsa_Bergqvist,Kajsa Margareta Bergqvist (Swedish pronunciati...,Kajsa Margareta Bergqvist (Swedish pronunciati...
683,B,L,Patricia Cornwell,Patricia Cornwell,b. 1956,American,Author,B,170947.0,Patricia Cornwell,https://en.wikipedia.org/wiki/Patricia_Cornwell,Patricia Cornwell (born Patricia Carroll Danie...,Patricia Cornwell (born Patricia Carroll Danie...
213,B,L,Djuna Barnes,Djuna Barnes,1892–1982,American,Writer,B,377702.0,Djuna Barnes,https://en.wikipedia.org/wiki/Djuna_Barnes,"Djuna Barnes (June 12, 1892 – June 18, 1982) w...","Djuna Barnes (June 12, 1892 – June 18, 1982) w..."


In [279]:
y_test

475     G
1465    G
2393    G
1614    G
1102    G
1265    G
1137    B
2268    G
953     B
1813    L
2790    G
2777    L
372     G
1773    L
822     B
785     G
502     G
242     G
335     G
843     G
960     L
1231    G
1280    B
2391    G
1791    L
1530    L
2015    G
1018    G
1765    G
1927    G
       ..
1238    B
1697    L
1885    L
283     L
845     G
1079    L
1492    L
1522    L
1953    G
787     G
600     G
2283    G
2866    L
2625    G
444     B
512     L
76      G
1756    G
653     G
526     G
2354    G
864     L
2609    G
861     G
1425    L
1098    L
1525    L
634     L
2523    L
1397    G
Name: Notes, Length: 721, dtype: object

## Notable as...

what if I tokenize this field and use it to create groups

ideal: filter stopwords, stem?, turn in to tf-idf matrix [nouns only?]

would be a good way to label t-SNE plot, mainly

people are often notable for more than one thing... could arbitrarily pick the first one

[that's hard too]

In [186]:
import gensim

In [188]:
notable = [gensim.utils.simple_preprocess(n) for n in lgb_merged['Notable as']]

In [206]:
len([True for n in notable if 'musician' in n or 'singer' in n])

352

In [204]:
len([True for n in notable if 'actor' in n or 'actress' in n])

421

In [201]:
len([True for n in notable if 'politician' in n])

343

In [202]:
len([True for n in notable if 'activist' in n])

171

In [224]:
len([True for n in notable if 'writer' in n or 'poet' in n or 'author' in n])

626

In [211]:
len([True for n in notable if 'model' in n])

29

In [213]:
len([True for n in notable if 'comedian' in n])

71

In [222]:
len([True for n in notable if 'academic' in n or 'scientist' in n])

18

In [221]:
len([True for n in notable if 'athlete' in n or 'skater' in n])

31

In [223]:
len([True for n in notable if 'artist' in n])

91

In [228]:
import nltk
stops = nltk.corpus.stopwords.words('english')

In [230]:
notable2 = [nn for n in notable for nn in n if nn not in stops]

In [232]:
from collections import Counter 
counter = Counter(notable2)

In [234]:
counter.most_common(n=20)

[('writer', 432),
 ('actor', 381),
 ('politician', 343),
 ('musician', 247),
 ('activist', 171),
 ('director', 137),
 ('singer', 111),
 ('author', 110),
 ('poet', 103),
 ('artist', 91),
 ('film', 91),
 ('journalist', 82),
 ('television', 80),
 ('rights', 79),
 ('rock', 79),
 ('composer', 78),
 ('lgbt', 77),
 ('playwright', 75),
 ('pop', 71),
 ('comedian', 71)]

In [236]:
counter.most_common()

[('writer', 432),
 ('actor', 381),
 ('politician', 343),
 ('musician', 247),
 ('activist', 171),
 ('director', 137),
 ('singer', 111),
 ('author', 110),
 ('poet', 103),
 ('artist', 91),
 ('film', 91),
 ('journalist', 82),
 ('television', 80),
 ('rights', 79),
 ('rock', 79),
 ('composer', 78),
 ('lgbt', 77),
 ('playwright', 75),
 ('pop', 71),
 ('comedian', 71),
 ('player', 60),
 ('producer', 59),
 ('screenwriter', 51),
 ('filmmaker', 51),
 ('designer', 49),
 ('classical', 49),
 ('personality', 45),
 ('songwriter', 44),
 ('actress', 41),
 ('tv', 34),
 ('gay', 33),
 ('fashion', 33),
 ('photographer', 32),
 ('presenter', 30),
 ('novelist', 29),
 ('model', 29),
 ('dancer', 28),
 ('porn', 26),
 ('radio', 25),
 ('painter', 23),
 ('first', 22),
 ('star', 22),
 ('historian', 21),
 ('show', 20),
 ('host', 20),
 ('choreographer', 19),
 ('skater', 19),
 ('critic', 18),
 ('performer', 18),
 ('reality', 18),
 ('olympic', 17),
 ('lawyer', 17),
 ('figure', 17),
 ('music', 16),
 ('killer', 16),
 ('jazz

## Nationality

In [251]:
lgb_merged.groupby('Nationality')['Name'].count().sort_values(ascending=False).head(20)

Nationality
American         1445
English           293
Canadian          241
German            105
British            92
Australian         86
French             70
Swedish            40
Scottish           29
Irish              26
Dutch              25
Italian            22
Norwegian          19
New Zealander      18
Russian            17
Mexican            15
Belgian            14
Finnish            12
South African      12
Spanish            12
Name: Name, dtype: int64