In [1]:
from sklearn.datasets import fetch_20newsgroups
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, MinHashLSH
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as f
import numpy as np
import pandas as pd
from tqdm import tqdm
from functools import reduce

O Scikit-Learn já tem o parser da coleção ``20 NewsGroups`` que já está separado 60% em treino e 40% em teste. Portanto, foi-se usado esse dataset.

In [26]:
proc_data = fetch_20newsgroups(subset='all')

In [27]:
raw_groups = list(zip(proc_data.data, proc_data.target.tolist()))
raw_groups = [[tupla[0],tupla[1]] for tupla in raw_groups]

In [28]:
#raw_groups[:] = [[x[0].replace('\n', '').replace('\t', '').replace(';', '').replace('\r',' '), x[1]] for x in raw_groups]

In [29]:
for i in range(len(raw_groups)):
    raw_groups[i][1] = proc_data.target_names[raw_groups[i][1]]

In [30]:
for i in range(len(raw_groups)):
    raw_groups[i][0] = raw_groups[i][0].replace('\n',' ')
    raw_groups[i][0] = raw_groups[i][0].replace('\t',' ')
    raw_groups[i][0] = raw_groups[i][0].replace('\r',' ')
    raw_groups[i][0] = raw_groups[i][0].replace(';','')

### Salvar treino em um .csv

In [33]:
df_train = pd.DataFrame(raw_groups[:int(len(raw_groups) * 0.85)], columns=['data','target'])

In [139]:
df_train

Unnamed: 0,data,target
0,I am sure some bashers of Pens fans are pret...,rec.sport.hockey
1,My brother is in the market for a high-perform...,comp.sys.ibm.pc.hardware
2,Finally you said what you dream about. Me...,talk.politics.mideast
3,Think! It's the SCSI card doing the DMA tran...,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,comp.sys.mac.hardware
...,...,...
16014,A Unix tool of cryptographic significance is a...,sci.crypt
16015,^^^^^^^^^^^^^^^^^^^^^^^^ ...,rec.sport.hockey
16016,If the Anne Frank exhibit makes it to your ...,alt.atheism
16017,Hi I'm having a problem with TrueType fonts in...,comp.os.ms-windows.misc


In [34]:
df_train.to_csv('newsgroups_train.csv', index=True, sep=';', index_label='id')

### Salvar teste em um .csv

In [35]:
df_test = pd.DataFrame(raw_groups[int(len(raw_groups) * 0.85) + 1:], columns=['data','target'])

In [36]:
df_test.to_csv('newsgroups_test.csv', index=True, sep=';', index_label='id')

### Carregar o .csv em dataframe

In [2]:
spark = SparkSession.builder.appName('Trabalho III').config('spark.some.config.option','some-value').getOrCreate()

In [3]:
sc = spark.sparkContext

# Sem Stopwords

### Treino

In [37]:
data_train = spark.read.load('newsgroups_train.csv', format='csv', sep=';', header=True)
data_train = data_train.fillna({'data':''})

### Teste

In [38]:
data_test = spark.read.load('newsgroups_test.csv', format='csv', sep=';', header=True)
data_test = data_test.where(f.col("data").isNotNull())
data_test = data_test.fillna({'data':''})


In [6]:
data_train.show()

+---+--------------------+--------------------+
| id|                data|              target|
+---+--------------------+--------------------+
|  0|  I am sure some ...|    rec.sport.hockey|
|  1|My brother is in ...|comp.sys.ibm.pc.h...|
|  2|"     Finally you...|talk.politics.mid...|
|  3| Think!  It's the...|comp.sys.ibm.pc.h...|
|  4|1)    I have an o...|comp.sys.mac.hard...|
|  5|  Back in high sc...|     sci.electronics|
|  6|  AE is in Dallas...|comp.sys.mac.hard...|
|  7| [stuff deleted] ...|    rec.sport.hockey|
|  8|   Yeah, it's the...|    rec.sport.hockey|
|  9|" If a Christian ...|  talk.religion.misc|
| 10|the blood of the ...|  talk.religion.misc|
| 11|" >say they have ...|           sci.crypt|
| 12|"930418  Do what ...|  talk.religion.misc|
| 13|" How about Kirli...|             sci.med|
| 14|   There is no no...|         alt.atheism|
| 15|"In the following...|talk.politics.mid...|
| 16|"Many thanks to t...|     sci.electronics|
| 17|......... I, some...|     sci.elect

### Tokenizer

In [7]:
# treino
tokenizer_train = Tokenizer(inputCol='data', outputCol='tokens')
#data_train_no_stopwords = tokenizer.transform(data_train)

# teste
tokenizer_test = Tokenizer(inputCol='data', outputCol='tokens')
#data_test_no_stopwords = tokenizer.transform(data_test)

In [11]:
data_test_no_stopwords.show()

+---+--------------------+--------------------+--------------------+
| id|                data|              target|              tokens|
+---+--------------------+--------------------+--------------------+
|  0|From: v064mb9k@ub...|           rec.autos|[from:, v064mb9k@...|
|  1|"From: Rick Mille...|      comp.windows.x|["from:, rick, mi...|
|  2|From: mathew <mat...|         alt.atheism|[from:, mathew, <...|
|  3|"From: bakken@cs....|talk.politics.mid...|["from:, bakken@c...|
|  4|"From: livesey@so...|  talk.religion.misc|["from:, livesey@...|
|  5|"From: banschbach...|             sci.med|["from:, banschba...|
|  6|From: PETCH@gvg47...|soc.religion.chri...|[from:, petch@gvg...|
|  7|"From: fortmann@s...|soc.religion.chri...|["from:, fortmann...|
|  8|From: kartik@hls....|      comp.windows.x|[from:, kartik@hl...|
|  9|From: tmc@spartan...|       comp.graphics|[from:, tmc@spart...|
| 10|From: Greg.Reinac...|comp.os.ms-window...|[from:, greg.rein...|
| 11|From: sirosh@cs.u...|      co

### Count Vectorizer

In [8]:
#treino
cv_train = CountVectorizer(inputCol='tokens',outputCol='rawFeatures', vocabSize=130110)
#cvmodel = cv.fit(data_train_no_stopwords)
#data_train_no_stopwords = cvmodel.transform(data_train_no_stopwords)

# teste
cv_test = CountVectorizer(inputCol='tokens',outputCol='rawFeatures', vocabSize=130110)
#cvmodel = cv.fit(data_test_no_stopwords)
#data_test_no_stopwords = cvmodel.transform(data_test_no_stopwords)

In [107]:
data_test_no_stopwords.show()

+---+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|                data|              target|              tokens|         rawFeatures|            features|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|From: v064mb9k@ub...|           rec.autos|[from:, v064mb9k@...|(130110,[0,1,2,3,...|(130110,[0,1,2,3,...|
|  1|"From: Rick Mille...|      comp.windows.x|["from:, rick, mi...|(130110,[0,1,2,3,...|(130110,[0,1,2,3,...|
|  2|From: mathew <mat...|         alt.atheism|[from:, mathew, <...|(130110,[0,1,2,3,...|(130110,[0,1,2,3,...|
|  3|"From: bakken@cs....|talk.politics.mid...|["from:, bakken@c...|(130110,[0,1,2,3,...|(130110,[0,1,2,3,...|
|  4|"From: livesey@so...|  talk.religion.misc|["from:, livesey@...|(130110,[0,2,3,6,...|(130110,[0,2,3,6,...|
|  5|"From: banschbach...|             sci.med|["from:, banschba...|(130110,[0,1,2,3,...|(130110,[0,1,2,3,...|
|

### IDF

In [9]:
# treino
idf_train = IDF(inputCol='rawFeatures',outputCol='features')
#idfModel = idf.fit(data_train_no_stopwords)
#data_train_no_stopwords = idfModel.transform(data_train_no_stopwords)

# teste
idf_test = IDF(inputCol='rawFeatures',outputCol='features')
#idfModel = idf.fit(data_test_no_stopwords)
#data_test_no_stopwords = idfModel.transform(data_test_no_stopwords)

### MinHashLSH

In [39]:
pipeline_train = Pipeline(stages=[tokenizer_train, cv_train, idf_train])

In [40]:
pipeline_model = pipeline_train.fit(data_train)

In [41]:
df_train = pipeline_model.transform(data_train)

In [19]:
df_train

DataFrame[id: string, data: string, target: string, tokens: array<string>, rawFeatures: vector, features: vector]

In [42]:
pipeline_test = Pipeline(stages=[tokenizer_test, cv_test, idf_test])

In [43]:
pipeline_model = pipeline_test.fit(data_test)

In [44]:
df_test = pipeline_model.transform(data_test)

In [23]:
df_test.show()

+---+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|                data|              target|              tokens|         rawFeatures|            features|
+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0|"I am using the G...|      comp.windows.x|["i, am, using, t...|(69649,[0,1,2,3,4...|(69649,[0,1,2,3,4...|
|  1|   There's docume...|      comp.windows.x|[, , , there's, d...|(69649,[0,1,2,3,4...|(69649,[0,1,2,3,4...|
|  2|"The only reason ...|soc.religion.chri...|["the, only, reas...|(69649,[0,1,2,3,4...|(69649,[0,1,2,3,4...|
|  3|" Steve,      It'...|  talk.politics.guns|[", steve,, , , ,...|(69649,[0,1,2,3,4...|(69649,[0,1,2,3,4...|
|  4|"I have some bran...|        misc.forsale|["i, have, some, ...|(69649,[0,1,2,3,4...|(69649,[0,1,2,3,4...|
|  5|  I wonder how ha...|comp.sys.mac.hard...|[, , i, wonder, h...|(69649,[0,1,2,3,4...|(69649,[0,1,2,3,4...|
|

In [45]:
# somente necessário para o treino
mh_train = MinHashLSH(inputCol='features', outputCol='hashes', numHashTables=10)
model = mh_train.fit(df_train)

In [None]:
collect_features = df_test.select('features').collect()
predict_list = []

for i in tqdm(range(df_test.count())):
    #model.approxNearestNeighbors(data_train_no_stopwords, collect_features[i][0], 5).cache()
    df_predict = model.approxNearestNeighbors(df_train, collect_features[i][0], 5)
    select_target = df_predict.select('target').take(1)
    predict_list.append(select_target)





  0%|          | 0/2826 [00:00<?, ?it/s][A[A[A[A



  0%|          | 1/2826 [00:01<1:09:01,  1.47s/it][A[A[A[A



  0%|          | 2/2826 [00:02<1:09:40,  1.48s/it][A[A[A[A



  0%|          | 3/2826 [00:04<1:14:19,  1.58s/it][A[A[A[A



  0%|          | 4/2826 [00:06<1:12:01,  1.53s/it][A[A[A[A



  0%|          | 5/2826 [00:07<1:12:12,  1.54s/it][A[A[A[A



  0%|          | 6/2826 [00:09<1:19:00,  1.68s/it][A[A[A[A



  0%|          | 7/2826 [00:12<1:31:45,  1.95s/it][A[A[A[A



  0%|          | 8/2826 [00:14<1:40:37,  2.14s/it][A[A[A[A



  0%|          | 9/2826 [00:17<1:42:51,  2.19s/it][A[A[A[A



  0%|          | 10/2826 [00:19<1:42:42,  2.19s/it][A[A[A[A



  0%|          | 11/2826 [00:22<1:50:39,  2.36s/it][A[A[A[A



  0%|          | 12/2826 [00:24<1:48:41,  2.32s/it][A[A[A[A



  0%|          | 13/2826 [00:26<1:46:39,  2.28s/it][A[A[A[A



  0%|          | 14/2826 [00:29<1:50:19,  2.35s/it][A[A[A[A



  1%|      