## 1. Setting up the Glue Spark Environment

#### [Note]: Open CloudFormation, check the outputs of SagemakerDeployStack, copy the value of the SagemakerARN key, and replace `{iam_role}` in the cell below with it

In [9]:
%session_id_prefix common-crawl-etl
%iam_role {iam_role}

%additional_python_modules warcio,smart_open,goose3,bs4

%idle_timeout 600 
%number_of_workers 25
%worker_type G.2X
%glue_version 4.0

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.38.4 
Setting session ID prefix to common-crawl-etl
Current iam_role is None
iam_role has been set to arn:aws:iam::106839800180:role/SagemakerDeployStack-sagemakerexecutionrole002C13C-16L5I1TTPLRXF.
Additional python modules to be included:
warcio
smart_open
goose3
bs4
Current idle_timeout is 2880 minutes.
idle_timeout has been set to 600 minutes.
Previous number of workers: 5
Setting new number of workers to: 25
Previous worker type: G.1X
Setting new worker type to: G.2X
Setting Glue version to: 4.0


In [None]:
%list_sessions

In [1]:
from pyspark.sql import SparkSession

# Get the Spark session for this notebook under the application name "warc".
spark = (
    SparkSession.builder
    .appName("warc")
    .getOrCreate()
)

Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::106839800180:role/SagemakerDeployStack-sagemakerexecutionrole002C13C-16L5I1TTPLRXF
Trying to create a Glue session for the kernel.
Worker Type: G.2X
Number of Workers: 25
Session ID: common-crawl-etl-530d10fa-441e-45e3-984a-9ee97b71d07e
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.38.4
--enable-glue-datacatalog true
--additional-python-modules warcio,smart_open,goose3,bs4
Waiting for session common-crawl-etl-530d10fa-441e-45e3-984a-9ee97b71d07e to get into ready status...
Session common-crawl-etl-530d10fa-441e-45e3-984a-9ee97b71d07e has been created.



## 2. Find out all related warc files in CommonCrawl

#### 2.1 Create MetaInfo CCIndex for CommonCrawl Data 

In [None]:
# Create the database that will hold the Common Crawl index table (no-op if it already exists).
spark.sql("""CREATE DATABASE if not exists ccindex;""")

In [None]:
# Register the Common Crawl columnar URL index as an external Parquet table.
# The data stays in the public bucket at the LOCATION below; the table is
# partitioned by `crawl` and `subset`, which the later query filters on.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS ccindex.ccindex (
  url_surtkey                   STRING, 
  url                           STRING, 
  url_host_name                 STRING, 
  url_host_tld                  STRING, 
  url_host_2nd_last_part        STRING,
  url_host_3rd_last_part        STRING,
  url_host_4th_last_part        STRING,
  url_host_5th_last_part        STRING,
  url_host_registry_suffix      STRING, 
  url_host_registered_domain    STRING, 
  url_host_private_suffix       STRING,
  url_host_private_domain       STRING, 
  url_protocol                  STRING, 
  url_port                      INT, 
  url_path                      STRING, 
  url_query                     STRING,
  fetch_time                    TIMESTAMP, 
  fetch_status                  SMALLINT,
  content_digest                STRING, 
  content_mime_type             STRING, 
  content_mime_detected         STRING, 
  content_charset               STRING, 
  content_languages             STRING, 
  warc_filename                 STRING, 
  warc_record_offset            INT, 
  warc_record_length            INT, 
  warc_segment                  STRING) 
PARTITIONED BY (
  crawl                         STRING, 
  subset                        STRING)
STORED AS parquet
LOCATION 's3://commoncrawl/cc-index/table/cc-main/warc/';
""")

In [None]:
# Scan the table location and register the partitions found there, so that
# queries filtering on `crawl` / `subset` can see the data.
spark.sql("""MSCK REPAIR TABLE ccindex.ccindex;""")

#### 2.2 Filter for the target WARC files for specific knowledge

In [3]:
# Registered domain to look up in the index (compared with url_host_registered_domain).
target_domain = "nbcnews.com"
# URL prefix that matching pages must start with (first part of the LIKE pattern).
url_host_domain = "https://www.nbcnews.com/"
# Keyword that must appear somewhere in the URL after the prefix.
topic = 'ufo'
# Bucket that receives the extracted text files (passed to write_to_s3 later on).
s3_bucket = '106839800180-23-08-03-13-25-48-bucket'




In [4]:
# SQL LIKE pattern: URLs that start with the host prefix and contain the topic keyword.
url_pattern = f"{url_host_domain}%{topic}%"

# Select the index rows for the target domain in the CC-MAIN-2023-23 crawl
# ('warc' subset, pages detected as text/html) whose URL matches the pattern.
# NOTE(review): values are interpolated directly into the SQL text, so they
# must come from trusted notebook configuration only.
filtered_data = spark.sql(
"""select * from ccindex.ccindex WHERE url_host_registered_domain = '{}' 
and crawl ='CC-MAIN-2023-23' and subset ='warc' and content_mime_detected = 'text/html'
and url like '{}'
""".format(target_domain, url_pattern)
 )




**Note: please be patient; this step may take a few minutes.**

In [5]:
# Several matching URLs can be stored in the same WARC file, so de-duplicate
# the file names on the cluster before collecting them to the driver; only
# the distinct file paths are needed by the following steps.
filename_list = [
    "s3://commoncrawl/" + row.warc_filename
    for row in filtered_data.select("warc_filename").distinct().collect()
]




#### 2.3 check all related warc files

In [6]:
filename_list

['s3://commoncrawl/crawl-data/CC-MAIN-2023-23/segments/1685224648850.88/warc/CC-MAIN-20230602172755-20230602202755-00314.warc.gz', 's3://commoncrawl/crawl-data/CC-MAIN-2023-23/segments/1685224644309.7/warc/CC-MAIN-20230528150639-20230528180639-00428.warc.gz', 's3://commoncrawl/crawl-data/CC-MAIN-2023-23/segments/1685224656869.87/warc/CC-MAIN-20230609233952-20230610023952-00381.warc.gz', 's3://commoncrawl/crawl-data/CC-MAIN-2023-23/segments/1685224648695.4/warc/CC-MAIN-20230602140602-20230602170602-00361.warc.gz', 's3://commoncrawl/crawl-data/CC-MAIN-2023-23/segments/1685224644867.89/warc/CC-MAIN-20230529141542-20230529171542-00297.warc.gz', 's3://commoncrawl/crawl-data/CC-MAIN-2023-23/segments/1685224645810.57/warc/CC-MAIN-20230530131531-20230530161531-00076.warc.gz', 's3://commoncrawl/crawl-data/CC-MAIN-2023-23/segments/1685224645417.33/warc/CC-MAIN-20230530063958-20230530093958-00756.warc.gz', 's3://commoncrawl/crawl-data/CC-MAIN-2023-23/segments/1685224653608.76/warc/CC-MAIN-2023060

## 3. Parse, extract and clean all these warc Files and write them into S3 bucket

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType, lit, udf, col
from pyspark.sql.types import StructType, StructField, StringType, MapType, ArrayType
from smart_open import open
from warcio.archiveiterator import ArchiveIterator
from goose3 import Goose
import pandas as pd
import re
from bs4 import BeautifulSoup

def post_process(content):
    """Normalize line endings and squeeze repeated newlines.

    CRLF sequences are first turned into a single LF, then every run of
    consecutive LF characters is collapsed into one.

    Args:
        content: Text to normalize.

    Returns:
        The normalized text.
    """
    unix_newlines = content.replace('\r\n', '\n')
    return re.sub(r'\n+', '\n', unix_newlines)

# Output schema of the generate_rows pandas UDF: one row per extracted page.
# Every column is a nullable string; the WARC-* / Content-Length columns carry
# the record headers of the same name, and WARC-S3Path is the file the record
# was read from.
schema = StructType([
    StructField('URL', StringType(), True),
    StructField('MimeType', StringType(), True),
    StructField('RawHTML', StringType(), True),
    StructField('ContentProcessed', StringType(), True),
    StructField('Title', StringType(), True),
    StructField('WARC-Date', StringType(), True),
    StructField('WARC-Record-ID', StringType(), True),
    StructField('Content-Length', StringType(), True),
    StructField('WARC-Filename', StringType(), True),
    StructField('WARC-Warcinfo-ID', StringType(), True),
    StructField('WARC-IP-Address', StringType(), True),
    StructField('WARC-Concurrent-To', StringType(), True),
    StructField('WARC-Payload-Digest', StringType(), True),
    StructField('WARC-Block-Digest', StringType(), True),
    StructField('WARC-Truncated', StringType(), True),
    StructField('WARC-S3Path', StringType(), True)
])

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def generate_rows(pdf):
    """Extract the target pages from one WARC file.

    Used as a grouped-map pandas UDF over a frame grouped by ``url``, so each
    call receives the rows for a single WARC file path and streams that file
    once.

    Args:
        pdf: pandas DataFrame for one group; its ``url`` column holds the
            path of the WARC file to read.

    Returns:
        pandas DataFrame with one row per kept ``response`` record and exactly
        the columns declared in ``schema``. The frame may have zero rows but
        always has the full set of columns.
    """
    # Positional access: do not rely on the group's index containing label 0.
    url = pdf['url'].iloc[0]
    extractor = Goose()
    rows = []

    # `open` is smart_open.open here (imported above); the context manager
    # makes sure the stream is closed even if parsing fails part-way.
    with open(url, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'response':
                continue

            page_uri = record.rec_headers.get_header('WARC-Target-URI', "")

            # Keep only the pages the index query in section 2.2 selected:
            # the URL starts with `url_host_domain` and contains `topic`
            # after that prefix (same intent as the SQL LIKE pattern).
            # A whole WARC file holds many unrelated pages, so this filter is
            # what narrows the output to the pages of interest.
            if not page_uri.startswith(url_host_domain):
                continue
            if topic not in page_uri[len(url_host_domain):]:
                continue

            raw_html = record.content_stream().read()
            # Decode once and reuse: the schema declares RawHTML as a string
            # and the downstream cleaning UDF expects text, not bytes.
            html_content = str(raw_html, 'utf-8', errors="ignore")
            article = extractor.extract(raw_html=html_content)
            http_headers = record.http_headers

            rows.append({
                'URL': page_uri,
                'MimeType': http_headers.get_header('Content-Type') if http_headers else None,
                'RawHTML': html_content,
                'ContentProcessed': post_process(article.cleaned_text),
                'Title': article.title,
                'WARC-Date': record.rec_headers.get_header('WARC-Date'),
                'WARC-Record-ID': record.rec_headers.get_header('WARC-Record-ID'),
                'Content-Length': record.rec_headers.get_header('Content-Length'),
                'WARC-Filename': record.rec_headers.get_header('WARC-Filename'),
                'WARC-Warcinfo-ID': record.rec_headers.get_header('WARC-Warcinfo-ID'),
                'WARC-IP-Address': record.rec_headers.get_header('WARC-IP-Address'),
                'WARC-Concurrent-To': record.rec_headers.get_header('WARC-Concurrent-To'),
                'WARC-Payload-Digest': record.rec_headers.get_header('WARC-Payload-Digest'),
                'WARC-Block-Digest': record.rec_headers.get_header('WARC-Block-Digest'),
                'WARC-Truncated': record.rec_headers.get_header('WARC-Truncated'),
                'WARC-S3Path': url
            })

    # Passing the column list keeps the result aligned with `schema` even when
    # no record matched; a column-less empty frame would not match the
    # declared return schema.
    return pd.DataFrame(rows, columns=schema.fieldNames())

def clean_with_goose_or_bs(raw_html):
    """Return the visible text of an HTML document.

    Despite the name, only BeautifulSoup is used: the text is whatever
    ``BeautifulSoup(..., 'html.parser').get_text()`` yields. Goose is not
    invoked here because its extraction result was never used, so running it
    only added cost per row.

    Args:
        raw_html: HTML markup, or None.

    Returns:
        The extracted text, or None when ``raw_html`` is None.
    """
    # A null column value reaches the UDF as None; pass it through as null
    # instead of failing inside the parser.
    if raw_html is None:
        return None

    return BeautifulSoup(raw_html, 'html.parser').get_text()




In [8]:
# One row per WARC file path; grouping by that path hands each file's rows to
# generate_rows as its own group.
urls_df = spark.createDataFrame(
    [(url,) for url in filename_list],
    ["url"],
)
result_df = urls_df.groupby("url").apply(generate_rows)

# Add a plain-text column derived from each page's raw HTML.
clean_udf = udf(clean_with_goose_or_bs, StringType())
result_df = result_df.withColumn(
    "CleanText_Goose_or_BS",
    clean_udf(result_df["RawHTML"]),
)



In [9]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def remove_special_characters(input_string):
    """Turn free text into a token that is safe to use in a file name.

    Every character other than ASCII letters, digits and whitespace is
    removed, then each remaining whitespace character is replaced by '_'.

    Args:
        input_string: Text to sanitize; None or an empty string is accepted.

    Returns:
        The sanitized string ('' for None or empty input).
    """
    if not input_string:
        return ''

    # Pattern matching any character that is not a letter, digit or whitespace.
    pattern = r'[^a-zA-Z0-9\s]'

    # Replace the matched special characters with the empty string.
    cleaned_string = re.sub(pattern, '', input_string)

    # Map every whitespace character (tabs and newlines too, not only ' ')
    # to '_' so none of them can end up inside the resulting file name.
    return re.sub(r'\s', '_', cleaned_string)

def write_to_s3(title, content, bucket):
    """Write one document's text to ``s3://<bucket>/ai-content/<name>.txt``.

    The object name is the title passed through remove_special_characters.
    When that yields an empty name (missing title, or a title made only of
    stripped characters), a name derived from a hash of the content is used
    so the object never ends up as just '.txt'.

    Args:
        title: Document title used to build the object name; may be None.
        content: Text to write; None is written as an empty file.
        bucket: Name of the destination bucket.

    Returns:
        The full path that was written.
    """
    # Local import so no notebook-level import cell has to change.
    import hashlib

    text = content or ''
    new_title = remove_special_characters(title or '')
    if not new_title:
        digest = hashlib.sha1(text.encode('utf-8')).hexdigest()[:12]
        new_title = 'untitled_{}'.format(digest)

    s3_path = "s3://{}/ai-content/{}.txt".format(bucket, new_title)
    # `open` is smart_open.open (imported earlier in the notebook). The
    # encoding is fixed explicitly so the output does not depend on the
    # locale of the machine running the UDF.
    with open(s3_path, 'w', encoding='utf-8') as s3_file:
        s3_file.write(text)

    return s3_path




In [10]:
# Wrap the S3 writer as a UDF; the new column holds the path each page's
# cleaned text is written to. Nothing is written until an action runs.
process_row_udf = udf(write_to_s3, StringType())
final_df = result_df.withColumn(
    "s3_path",
    process_row_udf(
        col('Title'),
        col('CleanText_Goose_or_BS'),
        lit(s3_bucket),
    ),
)




**Note: please be patient; this step may take a few minutes.**

In [None]:
# Trigger the pipeline (this is what actually writes the files) but bring only
# the written paths back to the driver; collecting every column would also
# pull each page's raw HTML and cleaned text into driver memory.
s3_files = final_df.select("s3_path").collect()

**Note: After it finishes, all content will be ingested into OpenSearch automatically.**

In [None]:
spark.stop()