In [None]:
import re, string
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import *
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Create (or reuse) the Spark session for this notebook, with `spark.logConf`
# enabled. Note that `sc` holds a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .appName("tp3")
    .config("spark.logConf", "true")
    .getOrCreate()
)

In [None]:
# Fetch the 20 Newsgroups corpus (subset 'all'), asking the loader to remove
# headers, footers and quotes from each post.
dataset = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Funções

https://spark.apache.org/docs/latest/ml-features

In [None]:
def clean_dataset(dataset):
    """Blank out every row whose text is exactly a single newline.

    Args:
        dataset: Mutable sequence (e.g. a list) of rows whose first element is
            the document text, such as ``(text, label)`` tuples or
            ``[text, label]`` lists.

    Returns:
        The same ``dataset`` object, updated in place. List rows are mutated
        directly; tuple rows are replaced by a new tuple with ``''`` as the
        first element and the remaining elements unchanged.
    """
    for index, row in enumerate(dataset):
        if row[0] != '\n':
            continue
        if isinstance(row, tuple):
            # Tuples do not support item assignment, so swap in a rebuilt row
            # instead of raising TypeError.
            dataset[index] = ('',) + row[1:]
        else:
            row[0] = ''
    return dataset

In [None]:
#https://stackoverflow.com/questions/46975929/how-can-i-calculate-the-jaccard-similarity-of-two-lists-containing-strings-in-py
def jacDistance(set1, set2):
    """Return the Jaccard similarity of two collections.

    Despite the name, the value returned is the similarity
    ``len(A & B) / len(A | B)`` (1.0 for identical sets, 0.0 for disjoint
    ones), not the distance ``1 - similarity``.

    Args:
        set1: Iterable of hashable items; duplicates are ignored.
        set2: Iterable of hashable items; duplicates are ignored.

    Returns:
        A float between 0.0 and 1.0. Two empty inputs are treated as identical
        and yield 1.0 instead of raising ZeroDivisionError.
    """
    s1 = set(set1)
    s2 = set(set2)
    union = s1 | s2
    if not union:
        # Both inputs are empty: the ratio is 0/0, so define it as "identical".
        return 1.0
    return len(s1 & s2) / float(len(union))

# Context
This dataset is a collection of newsgroup documents. The 20 Newsgroups collection has become a popular data set for experiments in text applications of machine learning techniques, such as text classification and text clustering.

In [None]:
# Pair each document with its target label as (text, label) tuples.
newsgroup_aux = [
    (text, label)
    for text, label in zip(dataset.data, dataset.target.tolist())
]

In [None]:
# Drop documents with no visible content, updating the list in place.
# Whitespace-only texts (spaces, tabs), not just newline-only ones, are removed
# too: they would give the downstream tokenizer nothing to work with.
newsgroup_aux[:] = [row for row in newsgroup_aux if row[0].strip()]

In [None]:
# Build a Spark DataFrame with the text in `raw_data` and the label in
# `real_value`.
df = sc.createDataFrame(
    newsgroup_aux,
    schema=['raw_data', 'real_value'],
)

In [None]:
# Attach an `id` column generated by monotonically_increasing_id().
df = df.withColumn(
    'id',
    monotonically_increasing_id(),
)

In [None]:
# Tokenizer stage: reads `raw_data`, writes `words`.
tokenizer = Tokenizer(
    inputCol='raw_data',
    outputCol='words',
)

In [None]:
# Bag-of-words stage: reads `words`, writes term counts to `features`.
# The vocabulary is left at CountVectorizer's default size rather than the
# previous vocabSize=3. With a three-term vocabulary, any document containing
# none of those terms becomes an all-zero vector, and the MinHashLSH stage
# cannot hash such vectors (it requires at least one non-zero entry).
vectorizer = CountVectorizer(inputCol='words', outputCol='features')

In [None]:
# MinHash LSH stage: reads `features`, writes `hashes`, using 5 hash tables.
hashLSH = MinHashLSH(
    inputCol='features',
    outputCol='hashes',
    numHashTables=5,
)

In [None]:
# Chain the stages in order: tokenize -> count terms -> MinHash.
pipeline = Pipeline(
    stages=[
        tokenizer,
        vectorizer,
        hashLSH,
    ]
)

In [None]:
# Fit the pipeline on `df`, then apply the fitted model to that same frame.
post_process = (
    pipeline
    .fit(dataset=df)
    .transform(df)
)

In [None]:
# Preview the first 10 rows of the `features` column.
(
    post_process
    .select('features')
    .show(n=10)
)

## Estrutura do df:
id: bigint

raw_data: string

real_value: bigint

words: array<string>
    
features: vector
    
hashes: array<vector>

A ideia de usar a distância de Jaccard aqui é medir a similaridade entre dois conjuntos e poupar o nosso KNN de comparar amostras muito diferentes.

In [None]:
# Keep only the columns used from here on.
selected_data = post_process.select(
    'id',
    'real_value',
    'features',
    'hashes',
)

In [None]:
# Split into train/test with weights 7:3 and a fixed seed.
X_train, X_test = selected_data.randomSplit(
    weights=[7.0, 3.0],
    seed=7,
)

In [None]:
# Peek at a handful of training rows. Collecting the whole split would pull
# every row to the driver and dump all of it into the cell output.
X_train.take(5)