In [1]:
#Buscando Wiki data :)
display(dbutils.fs.ls("dbfs:///databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/"))

path,name,size
dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2-part-00000,enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2-part-00000,168870620
dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2-part-00000,enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2-part-00000,1121580271
dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles11.xml-p001325001p001825000.bz2-part-00000,enwiki-latest-pages-articles11.xml-p001325001p001825000.bz2-part-00000,1169954684
dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles12.xml-p001825001p002425000.bz2-part-00000,enwiki-latest-pages-articles12.xml-p001825001p002425000.bz2-part-00000,1287433013
dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles13.xml-p002425002p003124997.bz2-part-00000,enwiki-latest-pages-articles13.xml-p002425002p003124997.bz2-part-00000,1231144795
dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles14.xml-p003125001p003924999.bz2-part-00000,enwiki-latest-pages-articles14.xml-p003125001p003924999.bz2-part-00000,1238353174
dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles15.xml-p003925001p004824998.bz2-part-00000,enwiki-latest-pages-articles15.xml-p003925001p004824998.bz2-part-00000,1204996932
dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles16.xml-p004825005p006024996.bz2-part-00000,enwiki-latest-pages-articles16.xml-p004825005p006024996.bz2-part-00000,1487427360
dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles17.xml-p006025001p007524997.bz2-part-00000,enwiki-latest-pages-articles17.xml-p006025001p007524997.bz2-part-00000,1561578307
dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles18.xml-p007525004p009225000.bz2-part-00000,enwiki-latest-pages-articles18.xml-p007525004p009225000.bz2-part-00000,1636217321


In [2]:
#para visualizar los datos, hay que crear un RDD a partir del XML
data = sc.textFile("dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2-part-00000")
data.take(2)

In [3]:
%sql
drop table wiki2

In [4]:
%sql
CREATE TABLE wiki2 (id string, text string)
USING com.databricks.spark.xml
OPTIONS (path "dbfs:/databricks-datasets/wikipedia-datasets/data-001/en_wikipedia/perline-xml/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2-part-00000", rowTag "revision")

In [5]:
wiki_df = spark.sql("select id, text from wiki2") #creación de dataFrame

In [6]:
#Creamos un DataFrame usando la tabla wiki2
wiki_df = spark.sql("select id, text from wiki2")
wiki_df.show()

In [7]:
#1. Convertimos todo el texto a minúscula y elimminamos caracteres especiales
from pyspark.sql.functions import lower, regexp_replace
wiki_lower = wiki_df.select('id', (lower(
  regexp_replace('text', "[^a-zA-Z\\s]", " ")).alias('text')))
wiki_lower.show()

In [8]:
#2. Removemos espacios en blanco
import pyspark.sql.functions as F
from pyspark.sql.functions import col

def remove_all_whitespace(col):
    return F.regexp_replace(col, "\\s+", " ")
wiki_espacios = wiki_lower.withColumn("text_sin_esp",remove_all_whitespace(col("text")))
wiki_espacios.show()

In [9]:
#3. Tokenizamos el texto
from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='text_sin_esp', outputCol='text_token')
wiki_token = tokenizer.transform(wiki_espacios).select('id', 'text_token')
wiki_token.show()

In [10]:
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover()
stopwords = remover.getStopWords()
stopwords[:] #Revisamos la lista actual de Stopwords

In [11]:
#4. Eliminamos stopwords
from pyspark.ml.feature import StopWordsRemover
stopwordList = ["redirect","uses", "http", "title", "www", "date", "web", "org","name","r","camelcase"] 
remover = StopWordsRemover(inputCol="text_token", outputCol="filtered", stopWords=stopwordList)
wiki_stop = remover.transform(wiki_token)
wiki_stop.show()

In [12]:
temp_table_name = "wiki_cleaned"
wiki_stop.createOrReplaceTempView(temp_table_name)

In [13]:
%sql
select count(*) from wiki_cleaned

count(1)
6269


In [14]:
%sql
select id,filtered from wiki_cleaned limit 1

id,filtered
631144794,"List(, computer, accessibility, redr, move, from, up)"


In [15]:
#Iniciamos con la función CountVectorizer
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="filtered", outputCol="rawFeatures", vocabSize = 10000)
model = cv.fit(wiki_stop)
result = model.transform(wiki_stop)
result.show()

In [16]:
#Calculamos el TF-IDF
from pyspark.ml.feature import IDF
vocab_broadcast = sc.broadcast(vocab)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(result)
result_tfidf = idfModel.transform(result) # TFIDF
result_tfidf.show()

In [17]:
from pyspark.ml.feature import HashingTF
ht = HashingTF(inputCol="filtered", outputCol="rawFeaturesTF")
result = ht.transform(wiki_stop)
result.show()