In [43]:
import re, string
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import *
from pyspark.sql.functions import *
from pyspark.ml import Pipeline

In [44]:
from sklearn.datasets import fetch_20newsgroups

In [45]:
# Start (or reuse) the Spark session for this notebook.
# NOTE: despite the name, `sc` is a SparkSession, not a SparkContext.
# "spark.logConf" = "true" asks Spark to log its effective configuration.
sc = (
    SparkSession.builder
    .appName("tp3")
    .config("spark.logConf", "true")
    .getOrCreate()
)

In [46]:
# Fetch the whole 20 newsgroups corpus (subset 'all'), asking scikit-learn
# to strip headers, footers and quoted text from every post.
dataset = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Funções

Reference — Spark ML feature extraction and transformation guide: https://spark.apache.org/docs/latest/ml-features

In [47]:
def clean_dataset(dataset):
    """Blank out rows whose first field is exactly a single newline.

    Each element of ``dataset`` is treated as a row whose position 0 holds
    the text. When that text equals one newline character it is replaced by
    the empty string; every other row is left untouched.

    Mutable rows (e.g. lists) are updated in place. Tuple rows cannot be
    assigned into, so the row is replaced inside ``dataset`` by a new plain
    tuple carrying the cleaned text and the remaining fields unchanged.

    Parameters
    ----------
    dataset : mutable sequence of rows
        Rows are indexable; position 0 is the text field.

    Returns
    -------
    The same ``dataset`` object that was passed in, after cleaning.
    """
    for index, row in enumerate(dataset):
        if row[0] != '\n':
            continue
        cleaned = row[0].replace('\n', '')
        if isinstance(row, tuple):
            # Item assignment on a tuple raises TypeError, so swap the row.
            dataset[index] = (cleaned,) + tuple(row[1:])
        else:
            row[0] = cleaned
    return dataset

# Context
This dataset is a collection of newsgroup documents. The 20 newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering.

In [48]:
# Pair every document text with its label; `tolist()` turns the target
# container into a plain Python list before zipping.
newsgroup_aux = [
    (text, label)
    for text, label in zip(dataset.data, dataset.target.tolist())
]

In [49]:
# Drop documents that contain nothing once newline characters are removed.
# Slice assignment rewrites the existing list object instead of rebinding it.
newsgroup_aux[:] = [
    pair
    for pair in newsgroup_aux
    if pair[0].replace('\n', '') != ''
]

In [50]:
# Build the Spark DataFrame: one row per (text, label) pair.
df = sc.createDataFrame(
    newsgroup_aux,
    schema=['raw_data', 'real_value'],
)

In [51]:
# Add an `id` column; Spark documents these ids as unique and increasing,
# but not necessarily consecutive.
df = df.withColumn(
    'id',
    monotonically_increasing_id(),
)

In [52]:
# Pipeline stage 1: split the raw text column into a `words` array column.
tokenizer = Tokenizer(
    inputCol='raw_data',
    outputCol='words',
)

In [53]:
# Pipeline stage 2: turn `words` into term-count vectors in `features`.
# NOTE(review): vocabSize=3 keeps only three vocabulary terms; documents
# containing none of them would get an all-zero vector, which MinHashLSH
# cannot transform -- confirm this value is intentional.
vectorizer = CountVectorizer(
    inputCol='words',
    outputCol="features",
    vocabSize=3,
)

In [54]:
# Pipeline stage 3: MinHash LSH over the `features` vectors, writing five
# hash tables per row into `hashes`.
# The hash functions are drawn at random, so the seed is pinned to keep the
# hashes reproducible across kernel restarts.
hashLSH = MinHashLSH(
    inputCol="features",
    outputCol="hashes",
    numHashTables=5,
    seed=42,
)

In [55]:
# Chain the three stages in order: tokenize -> count-vectorize -> MinHash LSH.
pipeline = Pipeline(
    stages=[
        tokenizer,
        vectorizer,
        hashLSH,
    ],
)

In [56]:
# Fit the pipeline on `df`, then apply the fitted model to that same frame.
post_process = (
    pipeline
    .fit(dataset=df)
    .transform(df)
)

In [68]:
# Preview the first ten count vectors produced by the pipeline.
features_preview = post_process.select("features")
features_preview.show(n=10)

+--------------------+
|            features|
+--------------------+
|(3,[0,1,2],[12.0,...|
|(3,[0,1],[19.0,1.0])|
|(3,[0,1,2],[194.0...|
|(3,[0,1,2],[5.0,1...|
|(3,[0,1,2],[14.0,...|
| (3,[0,1],[6.0,1.0])|
|       (3,[0],[3.0])|
|(3,[0,1,2],[11.0,...|
|(3,[0,1,2],[7.0,3...|
|(3,[0,1,2],[11.0,...|
+--------------------+
only showing top 10 rows



## Estrutura do df:
id: bigint

raw_data: string

real_value: bigint

words: array<string>
    
features: vector
    
hashes: array<vector>