# closedcasepdfs.py
#
# Finds newly closed cases on dockets.ccb.gov, downloads their dismissal
# orders, classifies why each case was closed, and writes closedcases.html,
# closedcases.csv and optoutrespondents.csv.
import csv
import html
from datetime import date

import bs4
import PyPDF2
import requests
# in a case list, docket number is in the 2nd column
def getdocketnum(row):
    """Return the docket number from a case-list table row.

    The row must provide ``find_all('td')``; the docket number is the
    whitespace-stripped text of the second cell.
    """
    secondcell = list(row.find_all('td'))[1]
    return str(secondcell.get_text(strip=True))
# in a document search, docket number is in the 1st column
def getdocketnumindoctypesearch(row):
    """Return the docket number from a document-search table row.

    The row must provide ``find_all('td')``; the docket number is the
    whitespace-stripped text of the first cell.
    """
    firstcell = list(row.find_all('td'))[0]
    return str(firstcell.get_text(strip=True))
def getnumamendclaims(docketurl, timeout=30):
    """Count the "Amended Claim" entries on a case docket page.

    Fetches ``docketurl + '?max=100'`` and counts the table rows whose third
    cell reads exactly "Amended Claim".

    Args:
        docketurl: URL of the case docket page, without a query string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The number of matching rows (0 when there are none).

    Raises:
        requests.RequestException: on a network failure, timeout, or HTTP
            error status.
    """
    # requests never times out by default, so a stalled connection would hang
    # the whole run; always pass an explicit timeout.
    # NOTE(review): '?max=100' presumably caps the page at 100 docket entries,
    # so amended claims beyond that would not be counted - confirm.
    response = requests.get(docketurl + '?max=100', timeout=timeout)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    numberofacs = 0
    for row in soup.find_all('tr'):
        celltexts = [cell.get_text(strip=True) for cell in row.find_all('td')]
        # Rows with fewer than three cells have no third column to inspect.
        if len(celltexts) > 2 and celltexts[2] == "Amended Claim":
            numberofacs += 1
    return numberofacs
def getoptouts(case, timeout=30):
    """Return the opt-out parties listed on a case's participants page.

    Fetches ``https://dockets.ccb.gov/case/participants/<case>`` and collects
    the text of every element whose ``headers`` attribute is
    ``'colHeaderOptOutParty rowHeaderOPT_OUT'``.

    Args:
        case: Docket number used to build the participants URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(count, names)`` where ``names`` is the list of stripped
        cell texts and ``count`` is its length.

    Raises:
        requests.RequestException: on a network failure, timeout, or HTTP
            error status.
    """
    partiesurl = 'https://dockets.ccb.gov/case/participants/' + case
    # requests never times out by default; pass an explicit timeout so a
    # stalled connection cannot hang the run.
    response = requests.get(partiesurl, timeout=timeout)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    optoutcells = soup.find_all(attrs={'headers' : 'colHeaderOptOutParty rowHeaderOPT_OUT'})
    respondents = [cell.get_text(strip=True) for cell in optoutcells]
    return (len(respondents), respondents)
def getdismissalpdfurl(docketurl, timeout=30):
    """Return the absolute URL of a case's dismissal/closing order PDF.

    Fetches ``docketurl + '?max=100'`` and looks for the first table row that
    has more than two cells, whose second cell text contains
    "Dismissing Claim" or "Closing Case", and whose second cell holds a link.

    Args:
        docketurl: URL of the case docket page, without a query string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``'https://dockets.ccb.gov'`` plus the link's ``href``, or the string
        ``'No dismissal order found'`` when no such row exists.

    Raises:
        requests.RequestException: on a network failure, timeout, or HTTP
            error status.
    """
    # requests never times out by default; pass an explicit timeout so a
    # stalled connection cannot hang the run.
    response = requests.get(docketurl + '?max=100', timeout=timeout)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= 2:
            continue
        title = cells[1].get_text(strip=True)
        if "Dismissing Claim" not in title and "Closing Case" not in title:
            continue
        link = cells[1].a
        href = link.get('href') if link is not None else None
        # A matching row without a usable link cannot yield a PDF URL; keep
        # scanning instead of failing on a missing <a> or href.
        if href:
            return 'https://dockets.ccb.gov' + href
    return 'No dismissal order found'
def getpdftext(filename):
    """Return the text PyPDF2 extracts from the first page of a PDF file.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The extracted text of page 0.
    """
    # NOTE(review): PdfFileReader/getPage/extractText are the older PyPDF2
    # names; confirm they still exist in the installed PyPDF2 version.
    # The reader pulls data from the file object, so extraction must happen
    # while the file is open; `with` closes the handle even if parsing fails.
    with open(filename, 'rb') as pdffileobj:
        pdfreader = PyPDF2.PdfFileReader(pdffileobj)
        return pdfreader.getPage(0).extractText()
def _readcsvbykey(path, keycolumn):
    """Load a CSV file into a dict of row-dicts keyed by one column's value.

    If several rows share a key, the last one wins.
    """
    # newline='' is what the csv module asks for when opening files; the
    # `with` block closes the file even if a row lacks the key column.
    with open(path, 'r', newline='') as csvfile:
        return {row[keycolumn]: row for row in csv.DictReader(csvfile)}


def _readdocketnums(path):
    """Return the first 11 characters of every line of a text file."""
    with open(path, 'r') as textfile:
        return [line[:11] for line in textfile]


# import data for all cases, as created by ccbcrawl2.py
print("Importing claims")
casedatadict = _readcsvbykey('casedata.csv', "Docket No.")

# import data about closed cases from last week
print("Importing closed cases data")
closedcasesdict = _readcsvbykey('closedcases.csv', "Docket No.")

# Import the data from the reasons/orders to amend report
print("Importing OTA info from CSV")
otasdict = _readcsvbykey('otareasons.csv', "Document No.")

# get list of last week's closed cases
caseswehave = list(closedcasesdict)

# get list of respondents who've opted out, updated by this script last time
with open('optoutrespondents.csv', 'r', newline='') as f:
    optoutrespondents = list(csv.reader(f))
# the first row is the header; keep it aside so it can be written back later
optoutheaders = optoutrespondents.pop(0)

# import cases with orders to amend or certify, as created by amendorcertify.py
# (each line starts with an 11-character docket number; a docket appears once
# per order, so duplicates are kept on purpose - they are counted later)
amends = _readdocketnums('amendfile.txt')
certs = _readdocketnums('certfile.txt')
# Get the case numbers from the closed case list, 100 per page. Keep paging
# until a page comes back short, so the list is not capped at a fixed number
# of pages that has to be extended by hand as the docket grows.
print("Getting closed cases")
allclosedcases = []
seencases = set()
offset = 0
while True:
    if offset == 0:
        closedlisturl = 'https://dockets.ccb.gov/search/closed?max=100'
    else:
        closedlisturl = 'https://dockets.ccb.gov/search/closed?&offset=' + str(offset) + '&max=100'
    # requests never times out by default; pass an explicit timeout so a
    # stalled connection cannot hang the run.
    res = requests.get(closedlisturl, timeout=30)
    res.raise_for_status()
    closedlistsoup = bs4.BeautifulSoup(res.text, 'lxml')
    closedtablebody = closedlistsoup.tbody
    # A page past the end of the list may have no table body at all.
    closedtablerows = closedtablebody.find_all('tr') if closedtablebody is not None else []
    pagecases = [getdocketnum(row) for row in closedtablerows]
    # Stop on an empty page, and also if the server keeps returning cases we
    # already have (guards against looping forever if the offset is ignored).
    if not any(pagecase not in seencases for pagecase in pagecases):
        break
    allclosedcases.extend(pagecases)
    seencases.update(pagecases)
    if len(closedtablerows) < 100:
        break
    offset += 100
print("Closed cases found: " + str(len(allclosedcases)))
allclosedcases.sort()
# set membership keeps the "is it new?" check cheap as the case list grows
alreadyhave = set(caseswehave)
newclosedcases = [closedcase for closedcase in allclosedcases if closedcase not in alreadyhave]
print("New closed cases:")
print(newclosedcases)
# start collecting info about new closed cases in the closed cases dictionary
for case in newclosedcases:
    closedcasesdict[case] = {
        "Docket No." : case,
        "Docket URL" : 'https://dockets.ccb.gov/case/detail/' + case,
        "Claimant law firm" : casedatadict[case]["Claimant law firm"],
    }

# add info to the dictionary about orders to amend and orders certifying claims; infer reasons where possible
print("Counting amended claims and opt outs")
for case in newclosedcases:
    print(case)
    record = closedcasesdict[case]
    amendorders = amends.count(case)
    certifyingorders = certs.count(case)
    record["Amend orders"] = amendorders
    record["Certifying orders"] = certifyingorders

    # the docket page is only fetched for cases that had an order to amend
    if amendorders > 0:
        record["Amended claims"] = getnumamendclaims(record["Docket URL"])
    else:
        record["Amended claims"] = "Didn't count"

    # opt outs are only looked up for cases with a certifying order
    if case in certs:
        optoutcount, optoutnames = getoptouts(case)
        record["Opt outs"] = optoutcount
        if optoutcount > 0:
            for optoutname in optoutnames:
                optoutrespondents.append([case, optoutname])
    else:
        record["Opt outs"] = "-"

    # "Opt outs" is only compared as a number when there is a certifying
    # order, which is exactly when it was set to a count above
    if certifyingorders > 0 and record["Opt outs"] > 0:
        inferredreason = "Respondent(s) opted out"
    elif record["Amended claims"] == 2 and amendorders == 2:
        inferredreason = "3 tries and still noncompliant"
    elif amendorders > 0 and certifyingorders == 0:
        inferredreason = "Failure to amend"
    else:
        inferredreason = "Unknown"
    record["Inferred reason"] = inferredreason
# Get PDF URLs for each closed case
print("Getting PDF URLs, designating (future) local filenames")
for case in newclosedcases:
    print(case)
    dismissalpdfurl = getdismissalpdfurl('https://dockets.ccb.gov/case/detail/' + case)
    closedcasesdict[case]["Dismissal PDF URL"] = dismissalpdfurl
    if dismissalpdfurl == "No dismissal order found":
        # the string 'None' (not the None object) marks "nothing to read"
        closedcasesdict[case]["PDF filename"] = 'None'
    else:
        closedcasesdict[case]["PDF filename"] = 'pdfs/' + case + 'dismissalorder.pdf'

# Save PDFs locally
print("Saving PDFs locally")
for case in newclosedcases:
    print(case)
    dismissalpdfurl = closedcasesdict[case]["Dismissal PDF URL"]
    if dismissalpdfurl == "No dismissal order found":
        print(dismissalpdfurl + " for case " + case)
        continue
    # requests never times out by default; pass an explicit timeout so a
    # stalled download cannot hang the run.
    res = requests.get(dismissalpdfurl, timeout=60)
    res.raise_for_status()
    # the file is opened only after a successful download, and `with` closes
    # it even if writing fails part-way
    with open('pdfs/' + case + 'dismissalorder.pdf', 'wb') as pdffile:
        for chunk in res.iter_content(100000):
            pdffile.write(chunk)
# Phrases looked for in a dismissal order, paired with the reason they imply.
# Order matters: the first entry with a matching phrase wins.
pdfreasonphrases = [
    (('opt-out',),
     "Respondent(s) opted out"),
    (('second amended claim',),
     "3 tries and still noncompliant"),
    # PDF text extraction may yield a typographic apostrophe, so accept both
    # the straight and the curly form of this phrase.
    (("did not receive the respondent's address",
      "did not receive the respondent\u2019s address"),
     "Failure to provide respondent address"),
    (('payment for the claim failed', 'payment for your claim failed'),
     "Payment for the claim failed"),
    (('request from the claimant', 'request to dismiss from'),
     "Request from claimant"),
    (('did not file a proof of service or waiver of service',),
     "Proof of service not filed"),
    (('No amended claim was filed in the time allowed',),
     "Failure to amend"),
    (('applied to register the copyright in the work and had filed a new',),
     "Work wasn't registered before; claimant has filed new claim"),
    (('Copyright Office refused', 'allegedly infringed work has been refused'),
     "Copyright registration refused by Copyright Office"),
    (('grants the request, dismisses the claim with prejudice',),
     "Settlement, dismissed with prejudice"),
    (('FINDING OF BAD FAITH',),
     "Bad-faith conduct"),
    (('not submit the second payment by',),
     "Second filing fee not paid"),
]


def _pdfreasonfromtext(ordertext):
    """Return the dismissal reason implied by an order's first-page text."""
    for phrases, reason in pdfreasonphrases:
        if any(phrase in ordertext for phrase in phrases):
            return reason
    return "Unknown/cannot extract"


# Get text from first page of PDF
print("Extracting text from dismissal order PDFs")
for case in newclosedcases:
    filename = closedcasesdict[case]["PDF filename"]
    if filename == "None":
        # no dismissal order was found for this case, so there is no PDF
        pdfreason = "Unknown/cannot extract"
    else:
        pdftext = getpdftext(filename)
        print(pdftext)
        pdfreason = _pdfreasonfromtext(pdftext)
    closedcasesdict[case]["PDF reason"] = pdfreason

    # Assign a reason for tallying at the end of the report: prefer what the
    # order itself says, fall back to the reason inferred from the docket
    if pdfreason != "Unknown/cannot extract":
        closedcasesdict[case]["Tallied reason"] = pdfreason
    else:
        closedcasesdict[case]["Tallied reason"] = closedcasesdict[case]["Inferred reason"]
def _td(value):
    """Render one table cell, HTML-escaping the value's text."""
    return '<td>' + html.escape(str(value), quote=False) + '</td>'


closedcasesdatalist = sorted(closedcasesdict.values(), key=lambda x: x['Docket No.'])

# Check to make sure numbers match
print("number of closed cases from initial allclosedcases list: " + str(len(allclosedcases)))
print("number of closed cases from closedcasesdict: " + str(len(closedcasesdict)))
print("Check to see if 107 is back in the closed case docket on CCB site; if so comment out line 249 etc")
# 22-CCB-0107 is force-included in the report even when the site's closed
# case list does not return it
if '22-CCB-0107' not in allclosedcases:
    allclosedcases.append('22-CCB-0107')
    allclosedcases.sort()

# Get cases with scheduling orders, oldest to newest. All data is gathered
# before the report file is opened, so a failed request leaves the previous
# report untouched.
print("Getting list of cases with scheduling orders (in closedcasepdfs.py)")
# requests never times out by default; pass an explicit timeout so a stalled
# connection cannot hang the run.
res = requests.get('https://dockets.ccb.gov/search/documents?search=&docTypeGroup=type%3A16&sort=submittedDate&order=asc&max=100', timeout=30)
res.raise_for_status()
schedulingordercasesoup = bs4.BeautifulSoup(res.text, 'lxml')
# find_all('tr') over the whole page also returns the header row, so drop the
# first row (slicing, unlike pop(0), is safe when no rows came back)
# NOTE(review): max=100 presumably limits this to the first 100 scheduling
# orders - confirm whether paging is needed here too.
schedulingordercaserows = schedulingordercasesoup.find_all('tr')[1:]
print("Scheduling orders found: " + str(len(schedulingordercaserows)))
# a case can have several scheduling orders; count each case once
caseswithschedulingorders = list({getdocketnumindoctypesearch(row) for row in schedulingordercaserows})
activecases = len(caseswithschedulingorders)

# tally of reasons for dismissal
allreasons = [closedcasesdict[case]["Tallied reason"] for case in allclosedcases]
dedupedreasons = sorted(set(allreasons), reverse=True)

# tally of reasons in orders to amend: (CSV column, label shown in the report)
otaproblems = [
    ("Foreign Respondent", 'Claim filed against a foreign respondent'),
    ("Registration", 'Infringement claim, lack of copyright registration'),
    ("Impermissible Claim", 'Impermissible claim, e.g. patent or contract'),
    ("Relief Sought", 'Problem with type or amount of relief sought'),
    ("Access", 'Infringement claim, insufficient information about Access to allegedly infringed work'),
    ("Substantial Similarity", 'Infringement claim, insufficient allegation of Substantial Similarity'),
    ("Ownership", 'Infringement claim, insufficient allegation of Legal or Beneficial Ownership by claimant'),
    ("Clarity", 'Clarity (about some element, or the claim generally)'),
]
otacounts = {}
for column, label in otaproblems:
    # a column value of '1' marks an order to amend that cited this problem
    otacounts[column] = sum(1 for ota in otasdict.values() if ota[column] == '1')

reportparts = []
reportparts.append('<!DOCTYPE html>' + '\n' + '<html lang="en">' + '\n' +
                   '<head><title>CCB data - closed cases</title>' + '\n' +
                   '<style>' + '\n' + 'table, th, td {' + '\n' + ' border: 1px solid #ddd;' + '\n' +
                   ' border-collapse: collapse;' + '\n' + ' }' +
                   '\n' + 'th, td {' + '\n' + ' padding: 6px;' + '\n' + ' }' +
                   '\n' + 'tr:nth-child(odd) {' + '\n' + ' background-color: #f9f9f9;' + '\n' + ' }' +
                   '\n' +
                   '</style>' + '\n' + '</head>' + '\n' + '<body>' + '\n')
reportparts.append('<p>Run date: ' + str(date.today()) + '</p>')

# summary totals
reportparts.append('<p>Number of <a href="https://dockets.ccb.gov/search/closed?max=100">closed cases</a>: ' +
                   str(len(closedcasesdatalist)) + '</p>')
reportparts.append('<p>Number of <a href="https://dockets.ccb.gov/search/documents?search=&docTypeGroup=type%3A16&max=100"> cases ' +
                   'where a scheduling order has been filed</a>: ' +
                   str(activecases) + '</p>')

# table of reasons for dismissal
reportparts.append('<p>Total number of claims dismissed/closed for each reason</p>')
reportparts.append('<table>' + '\n')
for reason in dedupedreasons:
    reportparts.append('<tr>' + _td(reason) + _td(allreasons.count(reason)) + '</tr> \n')
reportparts.append('</table> \n')

# table of reasons in orders to amend
reportparts.append('<p>Some common problems with claims from orders to amend (Note: more info is available in "otareasons" CSV files in ' +
                   '<a href="">Google Drive</a>.)</p>')
reportparts.append('<table>' + '\n' +
                   '<tr><th>Problem</th><th>Number of orders to amend citing the problem</th></tr>')
for column, label in otaproblems:
    reportparts.append('<tr>' + _td(label) + _td(otacounts[column]) + '</tr>')
reportparts.append('</table> \n')

# table of cases
reportparts.append('<p>Cases</p>')
reportparts.append('<table>' + '\n' +
                   '<tr><th>Docket</th><th>Caption</th><th>Orders to amend</th><th>Orders certifying claim</th><th>Opt outs</th>' +
                   '<th>Inferred reason</th><th>PDF reason</th><th>Tallied reason</th><th>Claimant law firm</th></tr>')
for case in allclosedcases:
    record = closedcasesdict[case]
    # only the docket cell holds a link; the other cells are plain text.
    # Captions and firm names come from scraped pages, so all text is escaped.
    reportparts.append('<tr>' +
                       '<td>' + '<a href="' + html.escape(record["Docket URL"]) + '">' +
                       html.escape(case, quote=False) + '</a></td>' +
                       _td(casedatadict[case]["Caption"]) +
                       _td(record["Amend orders"]) +
                       _td(record["Certifying orders"]) +
                       _td(record["Opt outs"]) +
                       _td(record["Inferred reason"]) +
                       _td(record["PDF reason"]) +
                       _td(record["Tallied reason"]) +
                       _td(record["Claimant law firm"]) +
                       '</tr> \n')
reportparts.append('</table> \n')
reportparts.append('\n' + '</body>' + '\n' + '</html>')

# write the finished report in one go; `with` closes the file even on error
with open("closedcases.html", 'w') as htmlreport:
    htmlreport.write(''.join(reportparts))
# Save the closed cases data for next week's run. The column names come from
# the first record and are read before the file is opened, so an empty list
# fails without truncating the existing CSV.
closedcasesfields = list(closedcasesdatalist[0].keys())
# newline='' is what the csv module asks for when opening files; without it
# extra blank lines appear between rows on platforms that write \r\n.
with open('closedcases.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames = closedcasesfields)
    writer.writeheader()
    writer.writerows(closedcasesdatalist)

# Save the opt-out respondents, de-duplicated and sorted, header row first.
# Rows are lists, so they are turned into tuples to be usable in a set.
dedupedoptouts = sorted({tuple(item) for item in optoutrespondents})
dedupedoptouts.insert(0, optoutheaders)
with open("optoutrespondents.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(dedupedoptouts)