In [1]:
from pyspark.sql.functions import udf
import spacy
import string

nlp = spacy.load('en_core_web_lg')

In [None]:
ROOT = '../dataset/combined'

In [2]:
raw_df = spark.read.json(f'{ROOT}/combined_raw')
raw_df.show()

+--------------------+
|             article|
+--------------------+
|U.S. investment b...|
|Mexican coffee pr...|
|Oil companies are...|
|Adam Dunn … you a...|
|The surprising re...|
|» County’s annual...|
|Dropcam will prob...|
|Wild Insects need...|
|British scientist...|
|The Big Bang Theo...|
|A nurse made the ...|
|Mother-of-three D...|
|The Scots were ki...|
|He may not be com...|
|The wreckage of t...|
|Inter Milan were ...|
|18-year-old Aditi...|
|Tiger Woods grima...|
|Danny Cipriani to...|
|US jobs numbers a...|
+--------------------+
only showing top 20 rows



In [3]:
raw_df.count()

40954102

In [4]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove stopwords and short words with spaCy
    doc = nlp(text)
    text = ' '.join([token.text for token in doc if not token.is_stop and len(token.text) > 2])
    
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Apply spaCy's language model to generate text embeddings
    doc = nlp(text)
    embeddings = doc.vector.tolist()
    
    return embeddings

In [5]:
preprocess_udf = udf(preprocess_text)
processed_df = raw_df.withColumn('article', preprocess_udf(raw_df['article']))
processed_df.show()

+--------------------+
|             article|
+--------------------+
|[-0.1075509786605...|
|[-0.4619120657444...|
|[0.03337377682328...|
|[-0.2698644399642...|
|[0.41297334432601...|
|[-1.0883550643920...|
|[0.91630727052688...|
|[0.30551484227180...|
|[-0.1121695041656...|
|[0.72363460063934...|
|[-0.1270813345909...|
|[-0.4511327743530...|
|[-0.6705150008201...|
|[-0.5344442725181...|
|[-0.2508292496204...|
|[-1.3492286205291...|
|[-0.9457947611808...|
|[-0.6164143085479...|
|[-0.2500665485858...|
|[-0.2789138257503...|
+--------------------+
only showing top 20 rows



In [None]:
processed_df.write.mode('overwrite').json(f'{ROOT}/combined_processed_spacy')