In [117]:
small_network = {
    "nodes": [
        {"id": "Albert Einstein"},
        {"id": "Paul Dirac"},
        {"id": "Niels Bohr"}
    ],
    "links": [
        {"source": "Albert Einstein", "target": "Paul Dirac"},
        {"source": "Albert Einstein", "target": "Niels Bohr"},
        {"source": "Paul Dirac", "target": "Niels Bohr"}
    ]
}

## We dump this network into a .json file
import json
with open("./data/small_network.json","w") as f:
    json.dump(small_network, f, indent=4)

In [118]:
import os
os.system("open -a /Applications/Safari.app ./small_network.html")

0

In [72]:
## We get the nobel data set
import numpy as np
import pandas as pd
from httplib2 import Http
from bs4 import BeautifulSoup, SoupStrainer

class Parser:
    
    def __init__(self, url):  
        http = Http()
        status, response = http.request(url)
        tables = BeautifulSoup(response, "lxml", 
                              parse_only=SoupStrainer("table", {"class":"wikitable sortable"}))
        self.table = tables.contents[1]
    
    def parse_table(self):      
        rows = self.table.find_all("tr")
        header = self.parse_header(rows[0])
        table_array = [self.parse_row(row) for row in rows[1:]]
        table_df = pd.DataFrame(table_array, columns=header).apply(self.clean_table, 1)
        return table_df.replace({"Year":{'':np.nan}})
        
    def parse_row(self, row):     
        columns = row.find_all("td")
        return [BeautifulSoup.get_text(col).strip() for col in columns if BeautifulSoup.get_text(col) != '']
    
    def parse_header(self, row):     
        columns = row.find_all("th")
        return [BeautifulSoup.get_text(col).strip() for col in columns if BeautifulSoup.get_text(col) != ""]
    
    def clean_table(self, row):
        if not row.iloc[0].isdigit() and row.iloc[0] != '':
            return row.shift(1)
        else:
            return row
        
url = "https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Physics"        
parser = Parser(url)   
nobel_df = parser.parse_table()
nobel_df.columns = ["Year", "Laureate", "Country", "Rationale"]
nobel_df.dropna(subset=["Country"], inplace=True)
nobel_df.fillna(method="ffill", inplace=True)
nobel_df.drop(["Year", "Country", "Rationale"], 1, inplace=True)

http = Http()
status, response = http.request(url)

table = BeautifulSoup(response, "lxml", parse_only=SoupStrainer('table'))
link_df = pd.DataFrame([[x.string, x["href"]] for x in table.contents[1].find_all("a")],
                       columns=["Laureate", "link"]).drop_duplicates()

nobel_df = nobel_df.merge(link_df, on="Laureate", how="left")
nobel_df.set_index("Laureate", inplace=True)
nobel_df.drop_duplicates(inplace=True)
nobel_df

Unnamed: 0_level_0,link
Laureate,Unnamed: 1_level_1
Wilhelm Conrad Röntgen,/wiki/Wilhelm_R%C3%B6ntgen
Hendrik Lorentz,/wiki/Hendrik_Lorentz
Pieter Zeeman,/wiki/Pieter_Zeeman
Antoine Henri Becquerel,/wiki/Henri_Becquerel
Pierre Curie,/wiki/Pierre_Curie
Maria Skłodowska-Curie,/wiki/Maria_Sk%C5%82odowska-Curie
Lord Rayleigh,"/wiki/John_Strutt,_3rd_Baron_Rayleigh"
Philipp Eduard Anton von Lenard,/wiki/Philipp_Lenard
Joseph John Thomson,/wiki/J._J._Thomson
Albert Abraham Michelson,/wiki/Albert_Abraham_Michelson


In [73]:
## We get the physics links
url = "https://en.wikipedia.org/wiki/Physics"

http = Http()
status, response = http.request(url)

table = BeautifulSoup(response, "lxml", parse_only=SoupStrainer('table'))
physics_df = pd.DataFrame([[x.string.lower(), x["href"].lower()] for x in table.contents[2].find_all("a")],
                       columns=["Physics_domain", "link"]).drop_duplicates()

physics_df = physics_df.groupby("Physics_domain").first()
physics_df

Unnamed: 0_level_0,link
Physics_domain,Unnamed: 1_level_1
accelerator physics,/wiki/accelerator_physics
acoustics,/wiki/acoustics
agrophysics,/wiki/agrophysics
antimatter,/wiki/antimatter
applied physics,/wiki/applied_physics
astrometry,/wiki/astrometry
astronomy,/wiki/astronomy
astrophysics,/wiki/astrophysics
atom,/wiki/atom
atomic and molecular astrophysics,/wiki/atomic_and_molecular_astrophysics


In [74]:
import string
import nltk
from string import punctuation
from nltk.corpus import stopwords
nltk.download('stopwords')
words_to_remove = set(stopwords.words('english'))

## We get the bios
def get_text(link, root_website = "https://en.wikipedia.org"):    
    http = Http()
    status, response = http.request(root_website + link)
    body = BeautifulSoup(response, "lxml", parse_only=SoupStrainer("div", {"id":"mw-content-text"}))
    return BeautifulSoup.get_text(body.contents[1])

# TODO: copy your clean_string function from the previous homework
def clean_string(string):
    for p in punctuation + "1234567890":
        string = string.replace(p,'').lower()
    return string

 #TODO: copy your remove function from the previous homework
def remove(list_to_clean, element_to_remove=[None, ""]):
    list_cleaned = [x for x in list_to_clean if x not in element_to_remove]
    return list_cleaned

# TODO: copy your remove_one function from the previous homework
def remove_one(list_to_clean):
    list_to_clean_one = [x for x in list_to_clean if len(x) > 1]
    return list_to_clean_one

[nltk_data] Downloading package stopwords to /Users/BPD/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [75]:
from nltk.corpus import stopwords
words_to_remove = set(stopwords.words('english'))

# TODO: aggregate all the above function into one to return a list of words from each link
def clean_everything(df):
    return (df["link"].apply(get_text)
           .apply(clean_string)
           .str.split("\s")
           .apply(remove)
           .apply(remove, element_to_remove = words_to_remove)
           .apply(remove_one))

physics_df["physics_list"] = clean_everything(physics_df)
nobel_df["physics_list"] = clean_everything(nobel_df)
nobel_df

Unnamed: 0_level_0,link,physics_list
Laureate,Unnamed: 1_level_1,Unnamed: 2_level_1
Wilhelm Conrad Röntgen,/wiki/Wilhelm_R%C3%B6ntgen,"[wilhelm, röntgen, born, wilhelm, conrad, rönt..."
Hendrik Lorentz,/wiki/Hendrik_Lorentz,"[confused, hendrikus, albertus, lorentz, ludvi..."
Pieter Zeeman,/wiki/Pieter_Zeeman,"[pieter, zeeman, born, may, zonnemaire, nether..."
Antoine Henri Becquerel,/wiki/Henri_Becquerel,"[uses, see, becquerel, disambiguation, antoine..."
Pierre Curie,/wiki/Pierre_Curie,"[pierre, curie, born, may, paris, france, died..."
Maria Skłodowska-Curie,/wiki/Maria_Sk%C5%82odowska-Curie,"[article, polish, physicist, uses, see, marie,..."
Lord Rayleigh,"/wiki/John_Strutt,_3rd_Baron_Rayleigh","[lord, rayleigh, om, prs, born, november, lang..."
Philipp Eduard Anton von Lenard,/wiki/Philipp_Lenard,"[waterfall, effect, redirects, illusory, visua..."
Joseph John Thomson,/wiki/J._J._Thomson,"[article, nobel, laureate, physicist, moral, p..."
Albert Abraham Michelson,/wiki/Albert_Abraham_Michelson,"[confused, athlete, albert, michelsen, albert,..."


In [76]:
len(list(nobel_df["physics_list"][0]))

1508

In [77]:
# TODO: find all the words in nobel_df["physics_list"]
all_nobel_words = set(nobel_df["physics_list"].sum())

# TODO: find all the words in physics_df["physics_list"]
all_physics_words = set(physics_df["physics_list"].sum())

# TODO: find all the intersection of all_nobel_words and all_physics_words
physics_corpus = set(all_nobel_words).intersection(all_physics_words)

physics_corpus

{'thick',
 'filed',
 'ethan',
 'le',
 'computergenerated',
 'potentially',
 'triple',
 'perovskites',
 'class',
 'careful',
 'promptly',
 'mueller',
 'fortunately',
 'criticize',
 'galileo',
 'phys',
 'pairing',
 'quark',
 'niels',
 'alumnus',
 'preface',
 'tu',
 'usgs',
 'metric',
 'koshiba',
 'replaced',
 'tomonaga',
 'legislation',
 'dimensions',
 'honors',
 'bailey',
 'bishop',
 'jay',
 'cultural',
 'reception',
 'zhao',
 'international',
 'mineral',
 'originally',
 'laws',
 'illdefined',
 'innovators',
 'overviewedit',
 'showed',
 'necessary',
 'tam',
 'indefinite',
 'crusts',
 'imprint',
 'eventual',
 'archiveorg',
 'barack',
 'franck',
 'convection',
 'earthquake',
 'orlando',
 'brian',
 'haas',
 'comparing',
 'chile',
 'requisite',
 'flows',
 'served',
 'chip',
 'fresnel',
 'gunnar',
 'körper',
 'truth',
 'tales',
 'compound',
 'isotopes',
 'bibcodeapjss',
 'ballistics',
 'maine',
 'tennis',
 'diesel',
 'disputes',
 'singularity',
 'scales',
 'corpuscle',
 'greenhouse',
 'ecg',

In [78]:
print(len(physics_corpus), len(all_nobel_words), len(all_physics_words))

12619 31054 33479


In [79]:
# TODO: write a function that keep only specific words from a list
def keep_only(list_to_clean, corpus=physics_corpus):
    list_clean = [x for x in list_to_clean if x in corpus]
    return list(list_clean)
    
nobel_df["physics_list_clean"] = nobel_df["physics_list"].apply(keep_only)
physics_df["physics_list_clean"] = physics_df["physics_list"].apply(keep_only)


In [80]:

# TODO: compute the length of each list
#def compute(compute_list):
#    return len(set((compute_list)))
#nobel_df["length"] = nobel_df["physics_list_clean"].apply(compute)
#physics_df["length"] = physics_df["physics_list_clean"].apply(compute)
nobel_df["length"]=[len(x) for x in nobel_df["physics_list_clean"]]
physics_df["length"]=[len(x) for x in physics_df["physics_list_clean"]]
# TODO: Set this column to 1
nobel_df["group"] = 1
# TODO: Set this column to 0
physics_df["group"] = 0
nobel_df

Unnamed: 0_level_0,link,physics_list,physics_list_clean,length,group
Laureate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Wilhelm Conrad Röntgen,/wiki/Wilhelm_R%C3%B6ntgen,"[wilhelm, röntgen, born, wilhelm, conrad, rönt...","[wilhelm, röntgen, born, wilhelm, conrad, rönt...",1245,1
Hendrik Lorentz,/wiki/Hendrik_Lorentz,"[confused, hendrikus, albertus, lorentz, ludvi...","[confused, lorentz, lorenz, see, also, lorentz...",2710,1
Pieter Zeeman,/wiki/Pieter_Zeeman,"[pieter, zeeman, born, may, zonnemaire, nether...","[pieter, zeeman, born, may, netherlands, died,...",984,1
Antoine Henri Becquerel,/wiki/Henri_Becquerel,"[uses, see, becquerel, disambiguation, antoine...","[uses, see, becquerel, disambiguation, henri, ...",1156,1
Pierre Curie,/wiki/Pierre_Curie,"[pierre, curie, born, may, paris, france, died...","[pierre, curie, born, may, paris, france, died...",1428,1
Maria Skłodowska-Curie,/wiki/Maria_Sk%C5%82odowska-Curie,"[article, polish, physicist, uses, see, marie,...","[article, polish, physicist, uses, see, marie,...",4943,1
Lord Rayleigh,"/wiki/John_Strutt,_3rd_Baron_Rayleigh","[lord, rayleigh, om, prs, born, november, lang...","[lord, rayleigh, om, born, november, langford,...",1412,1
Philipp Eduard Anton von Lenard,/wiki/Philipp_Lenard,"[waterfall, effect, redirects, illusory, visua...","[effect, redirects, visual, motion, effect, se...",1242,1
Joseph John Thomson,/wiki/J._J._Thomson,"[article, nobel, laureate, physicist, moral, p...","[article, nobel, laureate, physicist, moral, p...",2889,1
Albert Abraham Michelson,/wiki/Albert_Abraham_Michelson,"[confused, athlete, albert, michelsen, albert,...","[confused, albert, albert, michelson, born, de...",2227,1


In [81]:
physics_df

Unnamed: 0_level_0,link,physics_list,physics_list_clean,length,group
Physics_domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
accelerator physics,/wiki/accelerator_physics,"[accelerator, physics, branch, applied, physic...","[accelerator, physics, branch, applied, physic...",853,0
acoustics,/wiki/acoustics,"[uses, see, acoustics, disambiguation, artific...","[uses, see, acoustics, disambiguation, artific...",2047,0
agrophysics,/wiki/agrophysics,"[article, includes, list, references, related,...","[article, includes, list, references, related,...",400,0
antimatter,/wiki/antimatter,"[uses, see, antimatter, disambiguation, antima...","[uses, see, antimatter, disambiguation, antima...",3342,0
applied physics,/wiki/applied_physics,"[see, also, outline, applied, physics, enginee...","[see, also, outline, applied, physics, enginee...",283,0
astrometry,/wiki/astrometry,"[illustration, use, interferometry, optical, w...","[illustration, use, interferometry, optical, w...",1133,0
astronomy,/wiki/astronomy,"[article, scientific, study, celestial, object...","[article, scientific, study, celestial, object...",5169,0
astrophysics,/wiki/astrophysics,"[article, use, physics, chemistry, determine, ...","[article, use, physics, chemistry, determine, ...",1958,0
atom,/wiki/atom,"[uses, see, atom, disambiguation, helium, atom...","[uses, see, atom, disambiguation, helium, atom...",6602,0
atomic and molecular astrophysics,/wiki/atomic_and_molecular_astrophysics,"[within, million, years, light, bright, stars,...","[within, million, years, light, bright, stars,...",490,0


In [82]:
# TODO: concatenate those two dataframe into the nodes_df dataframe. 
# keep only the "length" and "group" columns.
nodes_df =  pd.concat([nobel_df[['length','group']], physics_df[['length','group']]], axis = 0)

nodes_df.index.name = "id"
nodes_df

Unnamed: 0_level_0,length,group
id,Unnamed: 1_level_1,Unnamed: 2_level_1
Wilhelm Conrad Röntgen,1245,1
Hendrik Lorentz,2710,1
Pieter Zeeman,984,1
Antoine Henri Becquerel,1156,1
Pierre Curie,1428,1
Maria Skłodowska-Curie,4943,1
Lord Rayleigh,1412,1
Philipp Eduard Anton von Lenard,1242,1
Joseph John Thomson,2889,1
Albert Abraham Michelson,2227,1


In [83]:
nodes_list = list(nodes_df.reset_index().transpose().to_dict().values())
nodes_list

[{'group': 1, 'id': 'Wilhelm Conrad Röntgen', 'length': 1245},
 {'group': 1, 'id': 'Hendrik Lorentz', 'length': 2710},
 {'group': 1, 'id': 'Pieter Zeeman', 'length': 984},
 {'group': 1, 'id': 'Antoine Henri Becquerel', 'length': 1156},
 {'group': 1, 'id': 'Pierre Curie', 'length': 1428},
 {'group': 1, 'id': 'Maria Skłodowska-Curie', 'length': 4943},
 {'group': 1, 'id': 'Lord Rayleigh', 'length': 1412},
 {'group': 1, 'id': 'Philipp Eduard Anton von Lenard', 'length': 1242},
 {'group': 1, 'id': 'Joseph John Thomson', 'length': 2889},
 {'group': 1, 'id': 'Albert Abraham Michelson', 'length': 2227},
 {'group': 1, 'id': 'Gabriel Lippmann', 'length': 1472},
 {'group': 1, 'id': 'Guglielmo Marconi', 'length': 4077},
 {'group': 1, 'id': 'Karl Ferdinand Braun', 'length': 832},
 {'group': 1, 'id': 'Johannes Diderik van der Waals', 'length': 2187},
 {'group': 1, 'id': 'Wilhelm Wien', 'length': 741},
 {'group': 1, 'id': 'Nils Gustaf Dalén', 'length': 778},
 {'group': 1, 'id': 'Heike Kamerlingh-Onne

In [84]:
# TODO: create a data frame with the index of nodes_df as columns and physics_corpus as index
words_vector = pd.DataFrame(columns = nodes_df.index.values, index = physics_corpus)
words_vector

Unnamed: 0,Wilhelm Conrad Röntgen,Hendrik Lorentz,Pieter Zeeman,Antoine Henri Becquerel,Pierre Curie,Maria Skłodowska-Curie,Lord Rayleigh,Philipp Eduard Anton von Lenard,Joseph John Thomson,Albert Abraham Michelson,...,superfluid,supernova,superstring theory,supersymmetry,surface physics,theory of everything,universe,vacuum energy,vehicle dynamics,weak
thick,,,,,,,,,,,...,,,,,,,,,,
filed,,,,,,,,,,,...,,,,,,,,,,
ethan,,,,,,,,,,,...,,,,,,,,,,
le,,,,,,,,,,,...,,,,,,,,,,
computergenerated,,,,,,,,,,,...,,,,,,,,,,
potentially,,,,,,,,,,,...,,,,,,,,,,
triple,,,,,,,,,,,...,,,,,,,,,,
perovskites,,,,,,,,,,,...,,,,,,,,,,
class,,,,,,,,,,,...,,,,,,,,,,
careful,,,,,,,,,,,...,,,,,,,,,,


In [85]:
#TODO: write a function that take a list and return the a word count
def count_words(list_to_count):
    return pd.Series(list_to_count).value_counts()

words_vector.loc[:,nobel_df.index] = nobel_df["physics_list_clean"].apply(count_words).transpose()
words_vector.loc[:,physics_df.index] = physics_df["physics_list_clean"].apply(count_words).transpose()
words_vector

Unnamed: 0,Wilhelm Conrad Röntgen,Hendrik Lorentz,Pieter Zeeman,Antoine Henri Becquerel,Pierre Curie,Maria Skłodowska-Curie,Lord Rayleigh,Philipp Eduard Anton von Lenard,Joseph John Thomson,Albert Abraham Michelson,...,superfluid,supernova,superstring theory,supersymmetry,surface physics,theory of everything,universe,vacuum energy,vehicle dynamics,weak
thick,,,,1,,,,1,,,...,,1,,,,,,,,
filed,,,,,,,,,,,...,,,,,,,,,,
ethan,,,,,,,,,,,...,,,,,,,,,,
le,,2,,3,,,,,,,...,,,,,,,,,,
computergenerated,,,,,,,,,,,...,1,,,,,,,,,
potentially,,,,,,,,,,,...,,3,,,,,,,,
triple,,,,,,,,,,,...,1,,,,,,,,,
perovskites,,,,,,,,,,,...,,,,,,,,,,
class,,1,,1,,,,,,1,...,,7,,2,,,,,,
careful,,,1,,,,,,,,...,,,,,,,,,,


In [86]:
# TODO: fill the missing values
words_vector = words_vector.fillna(0)
words_vector

Unnamed: 0,Wilhelm Conrad Röntgen,Hendrik Lorentz,Pieter Zeeman,Antoine Henri Becquerel,Pierre Curie,Maria Skłodowska-Curie,Lord Rayleigh,Philipp Eduard Anton von Lenard,Joseph John Thomson,Albert Abraham Michelson,...,superfluid,supernova,superstring theory,supersymmetry,surface physics,theory of everything,universe,vacuum energy,vehicle dynamics,weak
thick,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
filed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ethan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
le,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
computergenerated,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
potentially,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
triple,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
perovskites,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
class,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,7.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
careful,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:

def compute_similarity(vect1, vect2):
    return vect1.dot(vect2) / (np.sqrt(vect1.pow(2).sum()) * np.sqrt(vect2.pow(2).sum()))
similarity_df = pd.DataFrame(columns=words_vector.columns, index=words_vector.columns, dtype=float)

# TODO: fill the similarity_df dataframe with the cosine similarity

T = words_vector.transpose()
W = words_vector

for i in T.index.values:
    row = T.loc[i]
    
    for j in list(W.columns):
        col= W.loc[:,j]
        
        similarity_df.loc[i,j]= compute_similarity(row,col)
        
similarity_df
    
# TODO: bonus points if you can compute this dataframe using matrix algebra 
###




Unnamed: 0,Wilhelm Conrad Röntgen,Hendrik Lorentz,Pieter Zeeman,Antoine Henri Becquerel,Pierre Curie,Maria Skłodowska-Curie,Lord Rayleigh,Philipp Eduard Anton von Lenard,Joseph John Thomson,Albert Abraham Michelson,...,superfluid,supernova,superstring theory,supersymmetry,surface physics,theory of everything,universe,vacuum energy,vehicle dynamics,weak
Wilhelm Conrad Röntgen,1.000000,0.185514,0.234914,0.246373,0.244619,0.262763,0.281385,0.332172,0.271751,0.232699,...,0.070323,0.062752,0.068571,0.082544,0.075820,0.096970,0.101666,0.063021,0.055913,0.099332
Hendrik Lorentz,0.185514,1.000000,0.370383,0.218719,0.179621,0.179307,0.204048,0.259972,0.179071,0.277052,...,0.122875,0.074342,0.217889,0.207698,0.073640,0.294815,0.168868,0.152484,0.060420,0.127891
Pieter Zeeman,0.234914,0.370383,1.000000,0.256890,0.209483,0.210876,0.216192,0.284003,0.208562,0.274891,...,0.095226,0.074907,0.071211,0.097682,0.074663,0.119827,0.092988,0.096831,0.045164,0.117097
Antoine Henri Becquerel,0.246373,0.218719,0.256890,1.000000,0.309496,0.277841,0.198786,0.246859,0.210715,0.216439,...,0.072759,0.086852,0.064611,0.089502,0.088283,0.098501,0.109708,0.089068,0.047054,0.100038
Pierre Curie,0.244619,0.179621,0.209483,0.309496,1.000000,0.799176,0.230929,0.214641,0.199779,0.199546,...,0.073302,0.061699,0.058904,0.087582,0.102917,0.097067,0.105868,0.067250,0.040900,0.097120
Maria Skłodowska-Curie,0.262763,0.179307,0.210876,0.277841,0.799176,1.000000,0.252236,0.244053,0.196322,0.233136,...,0.090008,0.068378,0.068399,0.100210,0.105148,0.118328,0.143206,0.064112,0.051933,0.114876
Lord Rayleigh,0.281385,0.204048,0.216192,0.198786,0.230929,0.252236,1.000000,0.231572,0.343288,0.349819,...,0.079095,0.058966,0.139149,0.121494,0.078674,0.173621,0.116832,0.068164,0.067301,0.116053
Philipp Eduard Anton von Lenard,0.332172,0.259972,0.284003,0.246859,0.214641,0.244053,0.231572,1.000000,0.315187,0.265662,...,0.138252,0.075404,0.118009,0.154903,0.102583,0.208230,0.162926,0.139365,0.066741,0.140390
Joseph John Thomson,0.271751,0.179071,0.208562,0.210715,0.199779,0.196322,0.343288,0.315187,1.000000,0.268244,...,0.089486,0.083433,0.109240,0.134410,0.103932,0.158587,0.141124,0.097206,0.077102,0.146513
Albert Abraham Michelson,0.232699,0.277052,0.274891,0.216439,0.199546,0.233136,0.349819,0.265662,0.268244,1.000000,...,0.087835,0.098555,0.077252,0.092430,0.072566,0.123213,0.134418,0.073895,0.064555,0.091772


In [94]:
# TODO: reset the index and melt the dataframe

similarity_df.reset_index(inplace=True)
melted_df = pd.melt(similarity_df, id_vars = "index")

melted_df.columns =["source", "target", "value"]
melted_df

Unnamed: 0,source,target,value
0,Wilhelm Conrad Röntgen,Wilhelm Conrad Röntgen,1.000000
1,Hendrik Lorentz,Wilhelm Conrad Röntgen,0.185514
2,Pieter Zeeman,Wilhelm Conrad Röntgen,0.234914
3,Antoine Henri Becquerel,Wilhelm Conrad Röntgen,0.246373
4,Pierre Curie,Wilhelm Conrad Röntgen,0.244619
5,Maria Skłodowska-Curie,Wilhelm Conrad Röntgen,0.262763
6,Lord Rayleigh,Wilhelm Conrad Röntgen,0.281385
7,Philipp Eduard Anton von Lenard,Wilhelm Conrad Röntgen,0.332172
8,Joseph John Thomson,Wilhelm Conrad Röntgen,0.271751
9,Albert Abraham Michelson,Wilhelm Conrad Röntgen,0.232699


In [95]:
melted_df = melted_df.sample(frac=1.).reset_index(drop=True)
melted_df

Unnamed: 0,source,target,value
0,vacuum energy,nuclear astrophysics,0.298538
1,Max Planck,Claude Cohen-Tannoudji,0.190538
2,dark energy,Brian David Josephson,0.120895
3,Charles K. Kao,antimatter,0.178398
4,quantum information science,Sheldon Lee Glashow,0.088142
5,quantum electronics,Hannes Olof Gösta Alfvén,0.116857
6,fluid dynamics,Nevill Francis Mott,0.072912
7,spin,chemical physics,0.135869
8,weak,photonics,0.059067
9,David J. Wineland,Frits Zernike,0.379630


In [96]:
# TODO: merge melted_df with itself

merged_df = melted_df.reset_index().merge(melted_df.reset_index(),left_on=["source", "target"], right_on=["target", "source"]) 

merged_df

Unnamed: 0,index_x,source_x,target_x,value_x,index_y,source_y,target_y,value_y
0,0,vacuum energy,nuclear astrophysics,0.298538,49184,nuclear astrophysics,vacuum energy,0.298538
1,1,Max Planck,Claude Cohen-Tannoudji,0.190538,74918,Claude Cohen-Tannoudji,Max Planck,0.190538
2,2,dark energy,Brian David Josephson,0.120895,96830,Brian David Josephson,dark energy,0.120895
3,3,Charles K. Kao,antimatter,0.178398,51545,antimatter,Charles K. Kao,0.178398
4,4,quantum information science,Sheldon Lee Glashow,0.088142,40132,Sheldon Lee Glashow,quantum information science,0.088142
5,5,quantum electronics,Hannes Olof Gösta Alfvén,0.116857,77053,Hannes Olof Gösta Alfvén,quantum electronics,0.116857
6,6,fluid dynamics,Nevill Francis Mott,0.072912,26039,Nevill Francis Mott,fluid dynamics,0.072912
7,7,spin,chemical physics,0.135869,14003,chemical physics,spin,0.135869
8,8,weak,photonics,0.059067,31592,photonics,weak,0.059067
9,9,David J. Wineland,Frits Zernike,0.379630,82323,Frits Zernike,David J. Wineland,0.379630


In [105]:
# TODO: find the index to drop

index_to_drop = pd.Series([max(x) for x in merged_df.loc[:,["index_x", "index_y"]].values.tolist()]).unique()
index_to_drop

# TODO: use the index_to_drop to subset the melted_df dataframe
#droping the index values
melted_df_sub = melted_df.drop(index_to_drop)
melted_df_sub

Unnamed: 0,source,target,value
0,vacuum energy,nuclear astrophysics,0.298538
1,Max Planck,Claude Cohen-Tannoudji,0.190538
2,dark energy,Brian David Josephson,0.120895
3,Charles K. Kao,antimatter,0.178398
4,quantum information science,Sheldon Lee Glashow,0.088142
5,quantum electronics,Hannes Olof Gösta Alfvén,0.116857
6,fluid dynamics,Nevill Francis Mott,0.072912
7,spin,chemical physics,0.135869
8,weak,photonics,0.059067
9,David J. Wineland,Frits Zernike,0.379630


In [114]:
# TODO: Group melted_df_sub by "source" using the groupby method and select the 10 
# targets that have the highest values using the nlargest method

largest_df =  melted_df_sub.groupby("source").value.nlargest(10)
pd.DataFrame(largest_df)
#largest_df[0]

# TODO: get the level 1 of the multiindex
index_to_keep = largest_df.index.get_level_values(1)
index_to_keep

links_df = melted_df_sub.loc[index_to_keep]
links_df

Unnamed: 0,source,target,value
35860,Aage Bohr,Niels Bohr,0.809122
44712,Aage Bohr,Ben Roy Mottelson,0.650058
100733,Aage Bohr,James Franck,0.456316
62522,Aage Bohr,Frederick Reines,0.428806
54999,Aage Bohr,Norman Foster Ramsey,0.426159
34575,Aage Bohr,John Robert Schrieffer,0.422783
35088,Aage Bohr,Bertram Brockhouse,0.418653
43706,Aage Bohr,Felix Bloch,0.417438
16082,Aage Bohr,Yoichiro Nambu,0.415112
20174,Aage Bohr,David J. Thouless,0.409298


In [115]:
# TODO: create the list of links
links_list =  list(links_df.T.to_dict().values())
links_list

[{'source': 'Aage Bohr', 'target': 'Niels Bohr', 'value': 0.8091223913917167},
 {'source': 'Aage Bohr',
  'target': 'Ben Roy Mottelson',
  'value': 0.6500584939040946},
 {'source': 'Aage Bohr',
  'target': 'James Franck',
  'value': 0.45631643947029027},
 {'source': 'Aage Bohr',
  'target': 'Frederick Reines',
  'value': 0.4288055080080481},
 {'source': 'Aage Bohr',
  'target': 'Norman Foster Ramsey',
  'value': 0.4261594264323901},
 {'source': 'Aage Bohr',
  'target': 'John Robert Schrieffer',
  'value': 0.4227829386774369},
 {'source': 'Aage Bohr',
  'target': 'Bertram Brockhouse',
  'value': 0.41865340466174716},
 {'source': 'Aage Bohr', 'target': 'Felix Bloch', 'value': 0.4174375937928075},
 {'source': 'Aage Bohr',
  'target': 'Yoichiro Nambu',
  'value': 0.4151124271867487},
 {'source': 'Aage Bohr',
  'target': 'David J. Thouless',
  'value': 0.4092982896013396},
 {'source': 'Abdus Salam',
  'target': 'Sheldon Lee Glashow',
  'value': 0.32982208928687273},
 {'source': 'Abdus Salam

In [124]:
network_dict = {"nodes": nodes_list,
                "links": links_list}

with open("./data/physicists.json","w") as f:
    json.dump(network_dict, f, indent=4)

In [125]:
import os
os.system("open -a /Applications/Safari.app ./index.html")

0