In [38]:
import requests
import pandas as pd

## Data preparation

1. Create a small sample list of companies, from different sectors.
2. Collect patent information via the PatentsView API, including each patent's abstract, classification codes, and other sector information.

In [33]:
# Sample of companies drawn from different sectors; each entry is passed
# to the PatentsView assignee query as an organization name.
asg_list = [
    'Ford motor',
    'Toyota',
    'Bayerische motoren werke',
    'International business machines',
    'Google',
    'Apple',
    'Intel Corporation',
    'Huawei',
    'Ericsson',
    'Qualcomm',
    'Nokia Corporation',
    'PepsiCo, Inc.',
    'The Coca-Cola Company',
    'Shell Oil Company',
]

In [51]:
#build query URLs for the API provided by PatentsView
def getURL(id, tag):
    """Build a PatentsView API query URL.

    Parameters
    ----------
    id : str
        For ``tag == 'asg'``: the text the assignee organization name must
        begin with. For ``tag == 'cpc'``: the patent number to look up.
        The value is inserted verbatim into the JSON query, so it must not
        contain double quotes.
    tag : str
        ``'asg'`` builds an assignees query restricted to utility patents
        dated 2000-01-01 through 2019-12-31 and requests the abstract, date
        and id of each patent. ``'cpc'`` builds a patents query for a single
        patent number and requests its CPC group, WIPO sector title and NBER
        subcategory title.

    Returns
    -------
    str
        The query URL.

    Raises
    ------
    ValueError
        If ``tag`` is neither ``'asg'`` nor ``'cpc'``.
    """
    if tag == 'asg':
        url = 'http://www.patentsview.org/api/assignees/query?q={"_and":[{"patent_type":"utility"},{"_gte":{"patent_date":"2000-01-01"}},{"_lte":{"patent_date":"2019-12-31"}},{"_begins":{"assignee_organization":"'
        url = url + id + '"}}]}&f=["patent_abstract","patent_date", "patent_id"]'
    elif tag == 'cpc':
        url = 'http://www.patentsview.org/api/patents/query?q={"patent_number":"'
        url = url + id + '"}&f=["patent_id","cpc_group_id","wipo_sector_title","nber_subcategory_title"]'
    else:
        # Previously an unknown tag fell through and failed with an
        # UnboundLocalError on `url`; fail with a clear message instead.
        raise ValueError("unknown tag %r: expected 'asg' or 'cpc'" % (tag,))
    return url

In [35]:
def getDataset(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Parameters
    ----------
    url : str
        The URL to request.
    timeout : float or tuple, optional
        Passed to ``requests.get``; without it a stalled connection would
        block the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    The parsed JSON payload of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing JSON/KeyError later.
    response.raise_for_status()
    return response.json()

In [36]:
def parseData(data, asg_name, info_list):
    """Extract patent records from an assignees query response.

    Parameters
    ----------
    data : dict
        Decoded response; must have an ``'assignees'`` key holding a list of
        assignee entries, each with a ``'patents'`` list. A ``None`` value
        for either is treated as "no results".
    asg_name : str
        Company name stored in the ``'name'`` field of every record.
    info_list : list
        List the records are appended to (modified in place).

    Returns
    -------
    list
        The same ``info_list`` object, extended with one dict per kept
        patent: ``{'pid', 'abstract', 'date', 'name'}``.
    """
    # `or []` guards against a null value, which used to crash on len(None).
    for assignee in data['assignees'] or []:
        for patent in assignee['patents'] or []:
            pid = patent['patent_id']
            # Keep only ids that start with a digit; an empty or missing id
            # is skipped instead of raising on pid[0].
            if pid and pid[0].isdigit():
                info_list.append({
                    "pid": pid,
                    "abstract": patent['patent_abstract'],
                    "date": patent['patent_date'],
                    "name": asg_name,
                })
    return info_list

In [37]:
#get the patent information for every company in the list
# For each company, getURL builds the assignee query, getDataset fetches it,
# and parseData appends that company's records to info_list (it returns the
# same list object, so the reassignment just keeps the name bound to it).
info_list = []
for asg in asg_list:
    data = getDataset(getURL(asg, 'asg'))
    info_list = parseData(data, asg, info_list)

In [39]:
# Turn the collected records into a table and save it as a tab-separated file.
patent_info_df = pd.DataFrame(data=info_list)
patent_info_df.to_csv(path_or_buf='company_patent_info.tsv', sep='\t', index=False)

In [41]:
# Preview the first five collected records.
patent_info_df.head(n=5)

Unnamed: 0,pid,abstract,date,name
0,7963585,A tonneau system for a vehicle bed includes a ...,2011-06-21,Ford motor
1,8277157,The present invention relates to a tie-down de...,2012-10-02,Ford motor
2,8831857,Methods and systems are provided for operating...,2014-09-09,Ford motor
3,8833341,"In one example, a method for controlling a fue...",2014-09-16,Ford motor
4,9732689,Methods and systems are provided for operating...,2017-08-15,Ford motor


In [44]:
# Gather the patent ids that need classification information
# and display how many there are.
patent_list = patent_info_df['pid'].tolist()
len(patent_list)

266368

Collecting classification information through the API, one patent at a time, would take a long time, so we instead download the bulk dataset provided by PatentsView and filter it down to the classification categories of the patents in our list.

In [161]:
# Start the classification table as an independent copy of the 'pid' column.
patent_class_df = patent_info_df[['pid']].copy()
patent_class_df.head()

Unnamed: 0,pid
0,7963585
1,8277157
2,8831857
3,8833341
4,9732689


In [155]:
# Load the bulk CPC classification table (tab-separated).
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrapper is
# needed. patent_id is read as text so every id keeps its exact written form
# and the column is not left to per-chunk type inference; it is later joined
# against the string 'pid' values.
cpc_df = pd.read_csv('../../cpc_current.tsv', sep='\t', dtype={'patent_id': str})

In [159]:
# Make every patent_id a str so it can be matched against the string 'pid' key.
cpc_df['patent_id'] = cpc_df['patent_id'].map(str)

In [163]:
# Attach CPC rows to the sampled patents; the inner join keeps only ids
# present in both tables. This rebinds patent_class_df, so rebuild it from
# patent_info_df before re-running this cell.
patent_class_df = pd.merge(
    patent_class_df,
    cpc_df,
    how='inner',
    left_on='pid',
    right_on='patent_id',
)

In [166]:
# Discard the CPC bookkeeping columns that are not needed downstream
# (patent_id duplicates the 'pid' merge key).
patent_class_df = patent_class_df.drop(
    columns=[
        'uuid',
        'patent_id',
        'section_id',
        'subsection_id',
        'subgroup_id',
        'category',
        'sequence',
    ],
)

In [178]:
# Expose the CPC group id under the shorter column name 'cpc'.
patent_class_df = patent_class_df.rename({'group_id': 'cpc'}, axis='columns')

In [179]:
# Save the patent-to-CPC table as a tab-separated file without the index.
patent_class_df.to_csv(path_or_buf='patent_cpc_samples.tsv', sep='\t', index=False)

In [81]:
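#preprocess (disabled): alternative labelling using the WIPO field tables; the code below is commented out and does not run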
# wipo_df = pd.DataFrame(pd.read_csv('../../wipo.tsv', sep='\t'))
# nber_df = pd.DataFrame(pd.read_csv('../../nber.tsv', sep='\t'))
# wipo_field_df = pd.DataFrame(pd.read_csv('../../wipo_field.tsv', sep='\t'))
# nber_cg_df = pd.DataFrame(pd.read_csv('../../nber_subcategory.tsv', sep='\t'))
# wipo_df['patent_id'] = wipo_df['patent_id'].apply(lambda x: str(x))
# patent_class_df = pd.DataFrame(columns=['pid'])
# patent_class_df['pid'] = patent_info_df['pid'].copy()
# patent_class_df.head()
# len(patent_class_df)

# patent_class_df = patent_class_df.merge(wipo_df, left_on='pid', right_on='patent_id', how='inner')
# patent_class_df['field_id'] = patent_class_df['field_id'].apply(lambda x: str(x))
# patent_class_df = patent_class_df.merge(wipo_field_df, left_on='field_id', right_on='id', how='inner')
# patent_class_df = patent_class_df.drop(columns=['patent_id', 'sequence', 'id', 'field_id'], axis=1)
# patent_class_df.to_csv('patent_wipo_label.tsv', index=False, sep='\t')