In [72]:
import csv
import networkx as nx
import pandas as pd
import numpy as np

In [17]:
%%time

#############################################
# Case number to ID map
#############################################

casenum_id_file = 'mapping.csv'

# CASE NUM --> CASE ID
# Keys are the first CSV column parsed as int; values are the second column,
# kept as the raw string read from the file.
casenum_id_map = {}

# Use a context manager so the file handle is closed once the map is built
with open(casenum_id_file) as mapping_file:
    reader = csv.reader(mapping_file)

    # Skip header
    next(reader, None)

    for row in reader:
        casenum_id_map[int(row[0])] = row[1]

CPU times: user 26.6 ms, sys: 1.19 ms, total: 27.8 ms
Wall time: 27.1 ms


In [57]:
%%time

#############################################
# Case to author map
#############################################

case_author_file = 'final_feats_without_dummies_5.csv'
# NOTE(review): later cells read 'final_feats_without_dummies_3.csv' - confirm
# that the '_5' file is the intended source here.

# CASE ID <--> AUTHOR
# case_author_map: case id -> judge id (the first row seen for a case wins)
# author_case_map: judge id -> {case id: None}; the inner dict acts as a set of case ids
case_author_map = {}
author_case_map = {}

# Use a context manager so the file handle is closed once both maps are built
with open(case_author_file) as case_author_handle:
    reader = csv.reader(case_author_handle)

    # Skip header
    next(reader, None)

    for row in reader:

        # Columns 1 and 2 go through float() first, so values written like "12.0" are accepted
        casenum, judge_id = int(float(row[1])), int(float(row[2]))

        # Ignore cases that have no entry in the case number -> case id mapping
        if casenum not in casenum_id_map:
            continue

        # Convert case num to case id
        caseid = casenum_id_map[casenum]

        # Keep only the first author recorded for each case
        if caseid in case_author_map:
            continue

        case_author_map[caseid] = judge_id
        author_case_map.setdefault(judge_id, {})[caseid] = None
        

CPU times: user 177 ms, sys: 4.34 ms, total: 181 ms
Wall time: 179 ms


In [39]:
#############################################
# Citation graph
#############################################

def get_file_obj(filename):
    """Open `filename` for reading in text mode and return the file object.

    The caller owns the returned handle and is responsible for closing it.
    """
    handle = open(filename, mode='r')
    return handle


def get_year_map(filename):
    """Build a map from citing case to the year on its first row in the file.

    Each non-empty line of `filename` is expected to hold exactly three
    comma-separated fields: year, from_case, to_case. Only the first row
    seen for a given from_case contributes its year; later rows for the
    same from_case are ignored.

    Parameters:
        filename: path of the comma-separated edge list to read.

    Returns:
        dict mapping from_case (str) to year (int).

    Raises:
        ValueError: if a non-empty line does not have exactly three fields,
            or its year field is not an integer.
    """
    year_map = {}

    # The context manager closes the file even if a malformed line raises
    with get_file_obj(filename) as f:
        for line in f:
            # Strip only the line terminator; slicing off the last character
            # would eat real data on a final line that has no newline.
            line = line.rstrip('\r\n')

            if not line:
                continue

            year, from_case, to_case = line.split(',')

            # Membership must be checked against the map being built, not the
            # year string (which would be a substring test on e.g. "1990").
            if from_case not in year_map:
                year_map[from_case] = int(year)

    return year_map

In [33]:
# Path to the citation edge list; rows are parsed as (year, from_case, to_case)
citation_graph_data = 'CitationGraph/data/graph_stripped.csv'

# Year for each citing case, keyed by the from_case column of the edge list
year_map = get_year_map(citation_graph_data)

In [37]:
%%time

citation_graph_map = {}

# Directed citation graph: an edge from_case -> to_case means from_case cites to_case
g = nx.DiGraph()

# Use a context manager so the file handle is closed once the graph is built
with open(citation_graph_data) as graph_file:
    reader = csv.reader(graph_file)

    for row in reader:

        year, from_case, to_case = row[0], row[1], row[2]

        # Citing a future case is invalid
        if (to_case in year_map) and (from_case in year_map) and (year_map[from_case] < year_map[to_case]):
            continue

        # If they cite each other, it is most likely that both are wrong edges:
        # drop the reverse edge already in the graph and skip this one as well.
        # remove_edge() goes through the graph API instead of deleting from the
        # adjacency view, which is read-only in newer networkx releases.
        if g.has_edge(to_case, from_case):
            g.remove_edge(to_case, from_case)
            continue

        g.add_edge(from_case, to_case)

CPU times: user 27.1 s, sys: 927 ms, total: 28 s
Wall time: 28 s


In [67]:
#############################################
# Generate judge pairs
#############################################

# Read only the two judge-code columns rather than loading every column and
# dropping the rest in place. The selected columns keep the order they have in
# the source file, and the default row index is unchanged.
df = pd.read_csv(
    'final_feats_without_dummies_3.csv',
    usecols=['codej1', 'codej2'],
    low_memory=False,
)

# The row index is written as the first CSV column, ahead of the judge columns
df.to_csv('judge_pairs.csv')

In [70]:
%%time

#############################################
# Generate citation count for judge pairs
#############################################

judge_pairs_csv = 'judge_pairs.csv'

# cites[i] is the number of citation edges going from a case authored by j1
# to a case authored by j2, for the i-th data row of the judge-pairs file.
cites = []

# The count depends only on (j1, j2) and on maps that are not modified here,
# so it is computed once per distinct pair and reused for repeated rows.
pair_counts = {}

# Use a context manager so the file handle is closed once all rows are read
with open(judge_pairs_csv) as judge_pairs_file:
    reader = csv.reader(judge_pairs_file)

    # Skip header
    next(reader, None)

    for row in reader:

        # row[0] is the index column written alongside the pairs; it is not needed
        j1, j2 = int(float(row[1])), int(float(row[2]))

        pair = (j1, j2)

        if pair not in pair_counts:
            count = 0

            # Iterate over j1's cases, see how many are citing something of j2
            # (a judge with no known cases contributes a count of zero)
            for case in author_case_map.get(j1, ()):

                # Cases absent from the citation graph have no citations to inspect
                if case not in g:
                    continue

                for citation in g[case]:

                    # See if j2 is its author
                    if (citation in case_author_map) and (case_author_map[citation] == j2):
                        count += 1

            pair_counts[pair] = count

        cites.append(pair_counts[pair])

CPU times: user 9.68 s, sys: 43.1 ms, total: 9.72 s
Wall time: 9.73 s


In [73]:
#############################################
# Write out cites to file
#############################################

# `cites` is the list of per-row citation counts built from the judge-pairs
# file; fmt='%s' writes each count using its string form.
np.savetxt('cite_counts.csv', cites, fmt='%s')

In [90]:
%%time
# Append to original dataframe: reload the feature table, attach the citation
# counts as a `cite_count` column aligned on the row index, then save the result.
df = pd.read_csv('final_feats_without_dummies_3.csv', low_memory=False).assign(
    cite_count=lambda frame: pd.Series(cites, index=frame.index)
)
df.to_csv('final_feats_4.csv')

CPU times: user 55.9 s, sys: 2.96 s, total: 58.8 s
Wall time: 58.9 s
