### Load the DataFrame and produce an adjacency matrix whose vertices are the patents (vertex i corresponds to row i of the dataset). An edge between vertices i and j indicates that one of the two patents cites the other in the original dataset, or that they share a cited patent, whether or not that cited patent is itself in the dataset

In [None]:
import pandas as pd
import scipy.sparse as sp
import scipy.stats
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the patent grants table at most once per session: when `df` is already
# defined (for example because this cell is being re-run), the HDF5 file is
# not read again.
datasetfname = "uspto_grants_all_H_sections.hdf5"
if "df" not in globals():
    df = pd.read_hdf(datasetfname)
# Renumber the rows 0..n-1 in place, discarding the previous index.
df.reset_index(drop=True, inplace=True)

In [None]:
import scipy.sparse as sp

# Build the sparse patent-to-cited-document incidence matrix B and, from it,
# the patent-by-patent matrix C whose entry (i, j) counts citations that
# patents i and j have in common.
patentNums = df["publishedPatentDocNumber"].tolist()
applicantCitations = df["applicantCitations"].tolist()
examinerCitations = df["examinerCitations"].tolist()

# Every document cited by any row of the dataset, de-duplicated.
uniqueCitations = set()
for citedDocs in applicantCitations:
    uniqueCitations.update(citedDocs)
for citedDocs in examinerCitations:
    uniqueCitations.update(citedDocs)

# Vertices 0..len(patentNums)-1 are the dataset rows; the cited documents
# are appended after them.
vertexList = list(patentNums)
vertexList.extend(uniqueCitations)

# Document number -> vertex index. Later entries of vertexList overwrite
# earlier ones, so a document that is both a dataset row and a cited document
# maps to its (later) cited-document index, not to its row index.
# NOTE(review): harmless for B * B.T below, but it means columns of B cannot
# be matched to dataset rows -- confirm this is intended.
docToVertexLUT = {doc: idx for idx, doc in enumerate(vertexList)}

# B = csr_matrix((data, indices, colIndexPointers)) will be a
# patents-by-numvertices adjacency matrix stored in CSR format.
# colIndexPointers[r]:colIndexPointers[r+1] delimits row r's slice of indices.
colIndexPointers = [0]
indices = []
for rowNum, (fromApplicant, fromExaminer) in enumerate(
        zip(applicantCitations, examinerCitations)):
    if (rowNum % 1000 == 0):
        print('Populating row {0} of {1}'.format(rowNum, len(patentNums)))
    # Append the two citation lists separately instead of extend()-ing the
    # applicant list in place: those list objects come from df, so mutating
    # them would corrupt the DataFrame and double-count examiner citations
    # when this cell is re-run.
    indices.extend(docToVertexLUT[doc] for doc in fromApplicant)
    indices.extend(docToVertexLUT[doc] for doc in fromExaminer)
    colIndexPointers.append(len(indices))

data = [1]*len(indices)
# Pass the shape explicitly so the number of columns does not depend on the
# largest index that happens to occur.
B = sp.csr_matrix((data, indices, colIndexPointers),
                  shape=(len(patentNums), len(vertexList)))
# Use the sparse matrix product; np.dot is not aware of scipy sparse matrices.
C = B.dot(B.T)
# B * B.T is already symmetric, so this doubles the stored values and leaves
# the nonzero pattern (the edge set) unchanged.
# NOTE(review): C only links patents that share a cited document; a direct
# "cites / is cited by" relation between two dataset rows is not added as an
# edge here -- confirm against the intended graph definition.
C = C + C.T

In [None]:
# Path of the text file that receives the exported edge list.
adjacencymatfname = "one_hop_adjacency_all_H.txt"

def export_sparsemat(spmat, outfname):
    """Write the off-diagonal nonzero entries of a sparse matrix as an edge list.

    Every nonzero entry (i, j) with i != j reported by scipy.sparse.find
    becomes one line "i j" in the output file. The entry values are not
    written, and an entry and its transpose are written as separate lines
    when both are present.

    Args:
        spmat: matrix accepted by scipy.sparse.find.
        outfname: path of the text file to create (overwritten if it exists).
    """
    rows, cols, _ = sp.find(spmat)
    numEntries = len(rows)
    with open(outfname, "w") as fout:
        # enumerate() rather than the Python-2-only xrange(), so the function
        # runs unchanged on Python 2 and Python 3 without building an index list.
        for idx, row in enumerate(rows):
            col = cols[idx]
            # Diagonal entries are self-loops and are left out of the export.
            if row == col:
                continue
            # Progress report; it is skipped when the entry at a reporting
            # position happens to be a diagonal one.
            if idx % 100000 == 0:
                print("Exporting edge {0} of {1}".format(idx+1, numEntries))
            fout.write("{0} {1}\n".format(row, col))
# Write the off-diagonal nonzero entries of C to adjacencymatfname, one "i j" pair per line.
export_sparsemat(C, adjacencymatfname)