/
otareasons.py
161 lines (151 loc) · 6.29 KB
/
otareasons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import csv
import os
import re

import bs4
import PyPDF2
import requests
# get document number, url, date filed, docket number, and docket url for each document row
def getorderinfo(row):
    """Pull the identifying details for one order out of a results-table row.

    The text of the row's ``td`` cells is read with surrounding whitespace
    stripped: cell 1 supplies the document number (commas removed), cell 5
    supplies the filing date (only its first 10 characters are kept), and
    cell 0 supplies the docket number.

    Returns a list of five strings in this order: document number, document
    download URL, date filed, docket number, docket detail URL.
    """
    tds = row.find_all('td')
    docnumber = tds[1].get_text(strip=True).replace(",", "")
    # Keep just the leading 10 characters of the "filed" cell
    filed = str(tds[5].get_text(strip=True))[:10]
    docket = str(tds[0].get_text(strip=True))
    return [
        docnumber,
        "https://dockets.ccb.gov/document/download/" + docnumber,
        filed,
        docket,
        "https://dockets.ccb.gov/case/detail/" + docket,
    ]
def getpdftext(filename):
    """Return the extracted text of every page of a PDF, joined in page order.

    filename -- path of the PDF file to read.

    Returns one string made of each page's extracted text with nothing
    inserted between pages. A PDF that reports zero pages yields an empty
    string. Errors from opening the file or from PyPDF2 are not caught.
    """
    # The reader pulls page data from the open file on demand, so every page
    # has to be extracted before the `with` block closes the handle.
    with open(filename, 'rb') as pdffile:
        pdfreader = PyPDF2.PdfFileReader(pdffile)
        pagetexts = [
            pdfreader.getPage(pagenum).extractText()
            for pagenum in range(pdfreader.numPages)
        ]
    return ''.join(pagetexts)
# Import the data from the last time we ran this script
print("Importing OTA info from CSV")
# Maps each document number (as a string) to that order's row of CSV columns
otasdict = {}
try:
    # newline='' is how the csv module asks for files to be opened, so that
    # line breaks inside quoted fields are read back correctly
    with open('otareasons.csv', 'r', newline='') as otareasonscsv:
        for dictionary in csv.DictReader(otareasonscsv):
            otasdict[dictionary["Document No."]] = dictionary
except FileNotFoundError:
    # First run: there is no earlier output yet, so every order counts as new
    print("No existing otareasons.csv found; starting from scratch")
# Document numbers we had as of the last run (a set keeps lookups cheap)
orderswehave = set(otasdict)
# Document numbers found on this run that still have to be processed
ordersweneed = []
# Get every page of the orders-to-amend search results, 100 rows per request
print("Getting list of all orders to amend")
searchurl = 'https://dockets.ccb.gov/search/documents?search=&docTypeGroup=type%3A52'
pagesize = 100
# Safety valve so an unexpected server response can never loop forever
maxpages = 1000
amendtablerows = []
for pagenum in range(maxpages):
    offset = pagenum * pagesize
    # The first page is requested without an offset parameter
    offsetparam = '&offset=' + str(offset) if offset else ''
    # A timeout keeps a stalled connection from hanging the script forever
    res = requests.get(searchurl + offsetparam + '&max=' + str(pagesize), timeout=60)
    res.raise_for_status()
    amendlistsoup = bs4.BeautifulSoup(res.text, 'lxml')
    # A page with no <tbody> is treated the same as a page with no rows
    pagerows = amendlistsoup.tbody.find_all('tr') if amendlistsoup.tbody else []
    if not pagerows:
        # Ran past the last page of results
        break
    amendtablerows.extend(pagerows)
print('Total number of OTAs found: ' + str(len(amendtablerows)))
# Get the basic info for each order to amend
print("Getting basic info for each order")
for row in amendtablerows:
    otainfo = getorderinfo(row)
    documentnum = otainfo[0]
    # Skip orders carried over from the previous run, and also rows that
    # repeat a document already queued on this run, so nothing is downloaded
    # or parsed twice
    if documentnum in otasdict:
        continue
    ordersweneed.append(documentnum)
    otasdict[documentnum] = {
        "Document No.": documentnum,
        "Document URL": otainfo[1],
        "Date filed": otainfo[2],
        "Docket No.": otainfo[3],
        "Docket URL": otainfo[4],
        "PDF filename": 'pdfs/ota' + documentnum + '.pdf',
    }
# Save PDFs locally
print("Saving PDFs locally")
# Make sure the download folder exists before the first file is opened
os.makedirs('pdfs', exist_ok=True)
for order in ordersweneed:
    print(order)
    otapdfurl = otasdict[order]["Document URL"]
    # A timeout keeps a stalled download from hanging the script forever
    res = requests.get(otapdfurl, timeout=60)
    res.raise_for_status()
    # `with` closes the file even if a write fails partway through
    with open('pdfs/ota' + order + '.pdf', 'wb') as pdffile:
        for chunk in res.iter_content(100000):
            pdffile.write(chunk)
# Get text from PDFs
print("Checking for reasons in PDF text")
# Each output column paired with the substrings that flag it. The substrings
# are written without spaces because spaces are removed from the PDF text
# before matching. Matching is case-sensitive. Entries are listed in the order
# the columns are added to each order's record.
reasonmarkers = (
    ("Foreign Respondent", ("ForeignRespondent",)),
    ("Gov Entity Respondent", ("FederalorState",)),
    ("OSP", ("OnlineServiceProvider",)),
    ("Impermissible Claim", ("ermissibleClaim",)),
    ("Relief Sought", ("ReliefSought", "PermissibleRemedies", "ReliefRequested")),
    ("Improper Pleading Form", ("ImproperPleadingForm",)),
    ("Clarity", ("Clarity",)),
    ("Access", ("Access",)),
    ("Ownership", ("BeneficialOwner", "CopyrightOwnership")),
    ("Registration", ("Registration",)),
    ("Substantial Similarity", ("SubstantialSimilarity",)),
    ("Misrep - False Statement", ("FalseStatement",)),
    ("Misrep - No Notice Sent", ("NoDMCA",)),
    ("Noninfringement - No Accusation", ("NoAccusationbyRespondent",)),
)
for order in ordersweneed:
    orderrecord = otasdict[order]
    filename = orderrecord["PDF filename"]
    print('Getting text of ' + filename)
    # Only space characters are removed; other whitespace is left in place
    pdftext = getpdftext(filename).replace(' ', '')
    for column, markers in reasonmarkers:
        # 1 when any of the column's substrings occurs in the text, else 0
        orderrecord[column] = 1 if any(marker in pdftext for marker in markers) else 0
# Output dictionaries as csv
otaslist = list(otasdict.values())
# Document numbers are strings, so this is a lexicographic sort
otaslist.sort(key=lambda x: x["Document No."])
if otaslist:
    # newline='' is how the csv module asks for files to be opened; without it
    # some platforms write an extra blank line after every row
    with open('otareasons.csv', 'w', newline='') as csvfile:
        # Column order is taken from the first row after sorting
        writer = csv.DictWriter(csvfile, fieldnames=otaslist[0].keys())
        writer.writeheader()
        writer.writerows(otaslist)
else:
    # Nothing to write: skip opening the file so it is not truncated
    print("No orders to write; otareasons.csv left untouched")