# INSTALL REQUIRED PACKAGES

In [1]:
! pip3 install --quiet --upgrade pandas-gbq 'google-cloud-bigquery[bqstorage,pandas]' scikit-learn
%pip install underthesea

Note: you may need to restart the kernel to use updated packages.


## Set project ID

In [2]:
PROJECT_ID = "intern-project-415606"  # @param {type:"string"}

# set the project id
! gcloud config set project $PROJECT_ID

Updated property [core/project].


## Load data from BigQuery

In [3]:
from google.cloud import bigquery

# Create a BigQuery client bound explicitly to the notebook's project, so the
# cell does not depend on whichever default project the environment resolves.
client = bigquery.Client(project=PROJECT_ID)

# Read the whole source table. The project ID comes from PROJECT_ID instead of
# being hard-coded a second time, and the table path is backtick-quoted so the
# hyphenated project ID is unambiguously parsed as part of the identifier.
query = f"""
    SELECT *
    FROM `{PROJECT_ID}.Criminal_Dataset.criminal_data`
"""

# Run the query and get the result
query_job = client.query(query)

# Convert the result into a Pandas DataFrame
df = query_job.to_dataframe()

In [4]:
# Preview the first rows of the loaded DataFrame to sanity-check its columns.
df.head()

Unnamed: 0,JLR_LINK,TRANS_TYPE_OF_CASE,TRANS_LEGAL_RELATIONSHIP,PDF_TEXT,EXTRACT,ID,NAME,Year,Month,Day,GENDER,BIRTH
0,https://congbobanan.toaan.gov.vn/2ta827827t1cv...,採取行政處理措施的決定,放入強制性戒毒機構,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,Họ và tên: Nguyễn Văn C – Giới tính: Nam Sinh...,No_Id,Nguyễn Văn C,1994,1,25,Male,1994-01-25 00:00:00
1,https://congbobanan.toaan.gov.vn/2ta889668t1cv...,刑事,犯罪非法持有毒品,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,"Họ và tên: Trần Ngọc T, sinh ngày 08/12/1993....",No_Id,Trần Ngọc T,1993,12,8,Male,1993-12-08 00:00:00
2,https://congbobanan.toaan.gov.vn/2ta153804t1cv...,刑事,盜竊財產罪,<Page:1>TOÀ ÁN NHÂN DÂN CỘNG HOÀ XÃ HỘI CHỦ NG...,"Bùi Văn T, sinh ngày 09/12/1989 Tại: Xóm T – ...",No_Id,Bùi Văn T,1989,12,9,Male,1989-12-09 00:00:00
3,https://congbobanan.toaan.gov.vn/2ta1126341t1c...,刑事,犯罪非法持有毒品,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,"Nguyễn Văn Đ, sinh ngày 04/8/1978 tại huyện H...",No_Id,Nguyễn Văn Đ,1978,8,4,Male,1978-08-04 00:00:00
4,https://congbobanan.toaan.gov.vn/2ta1066372t1c...,刑事,欺詐佔有財產罪,<Page:1>TÒA ÁN NHÂN DÂN CỘNG HÒA XÃ HỘI CHỦ NG...,"Cao Thị B, sinh ngày 01/01/1975, tại huyện C,...",No_Id,Cao Thị B,1975,1,1,Female,1975-01-01 00:00:00


## Build Table

In [5]:
# Specify your Google Cloud Platform project ID
project_id = 'intern-project-415606'

# Specify the dataset ID
dataset_id = 'Criminal_Dataset'

# Specify the table ID
table_id = 'criminal_data_ner'

# Schema of the token-level output table: one row per token, keyed by the
# sentence it came from (extract_id), with the two underthesea tags and the
# label assigned by this notebook (self_label).
schema = [
    bigquery.SchemaField("extract_id", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("text", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("ner_underthesea", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("tag_underthesea", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("self_label", "STRING", mode="REQUIRED")
]

# Describe the table locally; nothing is sent to BigQuery yet
table = bigquery.Table(f"{project_id}.{dataset_id}.{table_id}", schema=schema)

# Send the table creation request to the BigQuery API.
# exists_ok=True makes the cell idempotent: re-running the notebook when the
# table is already there is a no-op instead of an "already exists" error.
try:
    table = client.create_table(table, exists_ok=True)  # API request
    print(f"Table {table_id} is ready.")
except Exception as e:
    # Any remaining failure (permissions, missing dataset, ...) is a real
    # problem: report it and stop, because the cells below need this table.
    print(f"Error creating table: {e}")
    raise

Table criminal_data_ner created successfully.


## Run NER and label names and numeric tokens

In [8]:
from underthesea import ner
# Specify your Google Cloud Platform project ID
project_id = 'intern-project-415606'

# Specify the dataset ID
dataset_id = 'Criminal_Dataset'

# Specify the table ID
table_id = 'criminal_data_ner'

# Fully-qualified destination table, built once instead of on every insert
table_ref = f"{project_id}.{dataset_id}.{table_id}"

# Upper bound on the number of sentences to process; the loop is additionally
# capped by the DataFrame length so a shorter table cannot raise a KeyError.
MAX_SENTENCES = 1000


def _self_label(token_text, pos_tag, provided_name):
    """Return the notebook's own label for a single token.

    Args:
        token_text: the token string produced by ``ner``.
        pos_tag: the tag stored in ``tag_underthesea`` for that token.
        provided_name: the stripped NAME value of the row, or '' when missing.

    Returns:
        'N' when the token equals the provided name, 'M' when the token is
        tagged 'M' or looks like a d/m/y date (exactly two slashes),
        otherwise 'other'.
    """
    # self label provided name as N
    if provided_name and token_text.strip() == provided_name:
        return 'N'
    # self label numeric data as M
    if pos_tag == 'M' or token_text.count('/') == 2:
        return 'M'
    return 'other'


extracts = df['EXTRACT']
names = df['NAME']

# carry out ner and insert into bigquery
for i in range(min(MAX_SENTENCES, len(df))):
    # Positional access: does not rely on the index labels being 0..n-1
    extract = extracts.iloc[i]
    if not isinstance(extract, str) or not extract.strip():
        # Missing or blank text cannot be tagged; skip instead of crashing
        print(f"Skipped sentence {i}: EXTRACT is empty or missing.")
        continue

    # A missing NAME simply means no token can be labelled 'N' for this row
    name = names.iloc[i]
    provided_name = name.strip() if isinstance(name, str) else ''

    # One output row per token; the tuple positions follow the original
    # mapping (0 -> text, 1 -> tag_underthesea, 3 -> ner_underthesea).
    rows = [
        {
            "extract_id": str(i),  # the position 'i' is used as the sentence ID
            "text": entity[0],
            "ner_underthesea": entity[3],
            "tag_underthesea": entity[1],
            "self_label": _self_label(entity[0], entity[1], provided_name),
        }
        for entity in ner(extract)
    ]

    if not rows:
        # Nothing to send; avoid issuing an insert request that carries no rows
        print(f"No tokens found for sentence {i}; nothing inserted.")
        continue

    # Stream this sentence's rows into BigQuery; the call returns a list of
    # per-row errors, which is empty when every row was accepted.
    errors = client.insert_rows_json(table_ref, rows)

    if not errors:
        print(f"Inserted successfully for sentence {i}.")
    else:
        print(f"Errors encountered while inserting rows for sentence {i}: {errors}")

Inserted successfully for sentence 0.
Inserted successfully for sentence 1.
Inserted successfully for sentence 2.
Inserted successfully for sentence 3.
Inserted successfully for sentence 4.
Inserted successfully for sentence 5.
Inserted successfully for sentence 6.
Inserted successfully for sentence 7.
Inserted successfully for sentence 8.
Inserted successfully for sentence 9.
Inserted successfully for sentence 10.
Inserted successfully for sentence 11.
Inserted successfully for sentence 12.
Inserted successfully for sentence 13.
Inserted successfully for sentence 14.
Inserted successfully for sentence 15.
Inserted successfully for sentence 16.
Inserted successfully for sentence 17.
Inserted successfully for sentence 18.
Inserted successfully for sentence 19.
Inserted successfully for sentence 20.
Inserted successfully for sentence 21.
Inserted successfully for sentence 22.
Inserted successfully for sentence 23.
Inserted successfully for sentence 24.
Inserted successfully for sentence 

In [None]:
# Manually recorded wall-clock times, stored as strings so the cell is valid
# Python (a bare 4:33pm is a SyntaxError and would abort "Run All").
# NOTE(review): presumably the start/end of the NER + insert cell above — confirm.
start_time = "4:33pm"
end_time = "5:17pm"

In [10]:
# Fetch only the tokens whose self_label is 'M' or 'N', i.e. everything that
# was not left with the default 'other' label.
query = """
    SELECT *
    FROM intern-project-415606.Criminal_Dataset.criminal_data_ner
    WHERE self_label = 'M' or self_label = 'N'
"""
query_job = client.query(query)          # submit the query job
query_result = query_job.to_dataframe()  # materialise the rows as a DataFrame

# Display the first 20 labelled tokens
query_result.head(20)

Unnamed: 0,extract_id,text,ner_underthesea,tag_underthesea,self_label
0,0,Nguyễn Văn C,B-PER,Np,N
1,0,25/01/1994,O,N,M
2,0,12,I-LOC,M,M
3,1,Trần Ngọc T,B-PER,Np,N
4,1,08/12/1993,I-LOC,M,M
5,1,x1,O,M,M
6,1,28/7/2021,I-LOC,M,M
7,2,Bùi Văn T,B-PER,Np,N
8,2,09/12/1989,I-LOC,M,M
9,2,1964,I-LOC,M,M
