# Curso Big Data #12 - Sistema de recomendacion

#### 1. Inicializamos la SparkSession

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('NLP').getOrCreate()

#### 2. Loading the dataset

In [3]:
df = spark.read.csv('C:/Users/pc/pruebas/SMSSpamCollection', inferSchema=True, sep='\t')
#renombrar las etiquetas de las columnas
df = df.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')
df.show(5)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
+-----+--------------------+
only showing top 5 rows



#### 3. Ingenieria de las caracteristicas

In [5]:
from pyspark.sql.functions import length
df = df.withColumn('length', length(df['text']))
df.show(5)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
+-----+--------------------+------+
only showing top 5 rows



In [6]:
df.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



#### 4. Formateamos las columnas de texto

In [7]:
from pyspark.ml.feature import StringIndexer, Tokenizer, StopWordsRemover, CountVectorizer, IDF, VectorAssembler
ham_spam_to_numeric = StringIndexer(inputCol='class', outputCol='label')
# df_1 = ham_spam_to_numeric.fit(df).transform(df)
# df_1.show()
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
# df_2 = tokenizer.transform(df_1)
# df_2.show()
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='token_stop')
# df_3 = stop_remove.transform(df_2)
# df_3.show()
count_vec = CountVectorizer(inputCol='token_stop', outputCol='count_vec')
# df_4 = count_vec.fit(df_3).transform(df_3)
# df_4.show()

# TF-IDF Term with document y
idf = IDF(inputCol='count_vec', outputCol='tf-idf')
# df_5 = idf.fit(df_4).transform(df_4)
# df_5.show()
transformed_df = VectorAssembler(inputCols=['length', 'tf-idf'], outputCol='features')
# df_6 = transformed_df.transform(df_5)
# df_6.show()

#### 5. Realizamos una pipeline

In [9]:
from pyspark.ml import Pipeline
df_pipe = Pipeline(stages=[ham_spam_to_numeric,
                           tokenizer,
                           stop_remove,
                           count_vec,
                           idf,
                           transformed_df])
final_df = df_pipe.fit(df).transform(df).select('label', 'features')
final_df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[0,8,12,32...|
|  0.0|(13424,[0,1,25,29...|
|  1.0|(13424,[0,3,14,20...|
|  0.0|(13424,[0,1,71,81...|
|  0.0|(13424,[0,37,135,...|
+-----+--------------------+
only showing top 5 rows



#### 6. Splitting the dataset

In [10]:
train_data, test_data = final_df.randomSplit([0.7, 0.3])

#### 7. Construimos el modelo ML

In [11]:
from pyspark.ml.classification import NaiveBayes
classifier = NaiveBayes(featuresCol='features', labelCol='label', predictionCol='prediction')
fittied_classifer = classifier.fit(train_data)

#### 8. Predecimos el test data

In [12]:
preds = fittied_classifer.transform(test_data)
preds.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,2,3,4...|[-1068.3818480198...|[1.0,4.3911827060...|       0.0|
|  0.0|(13424,[0,1,2,10,...|[-540.15840938280...|[1.0,2.1771371733...|       0.0|
|  0.0|(13424,[0,1,2,12,...|[-868.09511796942...|[1.0,6.2159304457...|       0.0|
|  0.0|(13424,[0,1,2,16,...|[-665.15449447695...|[1.0,2.4829360910...|       0.0|
|  0.0|(13424,[0,1,2,22,...|[-752.02142842782...|[1.0,9.3010665196...|       0.0|
|  0.0|(13424,[0,1,2,24,...|[-1327.0847206418...|[1.0,2.4085604079...|       0.0|
|  0.0|(13424,[0,1,2,32,...|[-342.44358033794...|[1.0,1.3871262760...|       0.0|
|  0.0|(13424,[0,1,2,44,...|[-617.94456184795...|[0.99639187330920...|       0.0|
|  0.0|(13424,[0,1,2,73,...|[-672.90776021646...|[1.0,1.5834862349...|       0.0|
|  0.0|(13424,[0

#### 9. Evaluamos the model

In [14]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label')
area_under_curve = evaluator.evaluate(preds)
accuracy = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='label')
accuracy = accuracy.evaluate(preds)
print('Area bajo la curva:', area_under_curve)
print('Accuracy:', accuracy)

Area bajo la curva: 0.9363118746708794
Accuracy: 0.9206541490006057
