In [1]:
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
# Reuse the already-running SparkContext when there is one, so that re-running
# this cell does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("spark_ml"))

## Clustering: Kmeans (example)

In [6]:
from pyspark.mllib.clustering import KMeans, KMeansModel
import numpy as np
from math import sqrt

In [8]:
# Load the data file and parse every line into a NumPy vector of floats
# (values on a line are separated by single spaces).
data = sc.textFile("./datasets/kmeans_data.txt")
# NumPy is imported in this notebook as `np`; a bare `array` is not defined.
parsedData = data.map(lambda line: np.array([float(x) for x in line.split(" ")]))

In [10]:
# Build the model (cluster the data into 2 groups)
# NOTE(review): `runs` is presumably ignored or rejected by newer Spark
# releases — confirm against the Spark version in use.
clusters = KMeans.train(parsedData, 2, maxIterations=10, runs=10, initializationMode="random")

In [None]:
# Evaluate clustering by computing Within Set Sum of Squared Errors
def error(point):
    """Return the squared Euclidean distance from `point` to its assigned center.

    The center is looked up through the trained `clusters` model: the index
    returned by `clusters.predict(point)` selects an entry of `clusters.centers`.
    """
    center = clusters.centers[clusters.predict(point)]
    # No square root: WSSSE is a sum of *squared* distances. Taking the root
    # first would report the sum of plain distances under the wrong label.
    return sum([x**2 for x in point-center])

# Add up the per-point squared errors over the whole data set
WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x,y: x+y)

print("Within Set Sum of Squared Errors = ", WSSSE)

In [None]:
# Save the trained KMeans model to the path "myModelPath"
# (this cell only saves; the model is not loaded back here).
clusters.save(sc, "myModelPath")

## Classification: DecisionTree (example)

In [11]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

In [None]:
# Read the LIBSVM-formatted sample file into an RDD of LabeledPoint
data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
# Randomly split the rows with weights 0.7 (training) and 0.3 (test)
trainingData, testData = data.randomSplit([0.7, 0.3])

In [None]:
# Train a decision-tree classifier (2 classes, gini impurity) on the training split.
# An empty categoricalFeaturesInfo dict indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=5, maxBins=32)


In [None]:
# Evaluate model on test instances and compute test error
# Predictions are computed on the features RDD and zipped back with the labels,
# giving an RDD of (label, prediction) pairs.
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Each element reaches the lambda as ONE tuple; a two-parameter lambda
# (`lambda v, p: ...`) raises TypeError under Python 3, so index into the pair.
testErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count()/float(testData.count())

print('Test Error =', testErr)
print('Learned classification tree model:', model.toDebugString())

In [None]:
# Save the trained decision-tree model, then load it back from the same path.
# NOTE(review): the KMeans cell earlier in this notebook also saves to
# "myModelPath"; presumably the second save fails if that path already
# exists — confirm and use distinct paths if so.
model.save(sc, "myModelPath")

sameModel = DecisionTreeModel.load(sc, "myModelPath")

## Procesamiento iterativo con Spark: K-means

In [20]:
# Read the device-status log as an RDD of text lines and peek at the first record.
data = sc.textFile("/tmp/curso/devicestatus.txt")
data.take(1)

['2014-03-15:10:10:20|Sorrento F41L|8cc3b47e-bd01-4482-b500-28f2342679af|7|24|39|enabled|disabled|connected|55|67|12|33.6894754264|-117.543308253']

In [39]:
def _parse_coords(line):
    """Return fields 12 and 13 of a '|'-delimited record as a pair of floats."""
    fields = line.split("|")
    return (float(fields[12]), float(fields[13]))

# Keep every record whose coordinate pair is not (0, 0)
points = data.map(_parse_coords).filter(lambda coords: coords != (0,0))
points.cache() # keep in memory: the RDD is reused many times below
points.take(1)

[(33.6894754264, -117.543308253)]

In [27]:
# Given a point p and a sequence of points, return the index of the entry nearest to p
def closestPoint(p, points):
    """Return the index in `points` of the element with the smallest squared distance to `p`.

    Ties keep the earliest index. If `points` is empty, 0 is returned.
    """
    winner = 0
    smallest = float("+inf")
    # Scan every candidate, remembering the strictly smallest distance seen so far
    for idx, candidate in enumerate(points):
        d = distanceSquared(p, candidate)
        if d < smallest:
            smallest, winner = d, idx
    return winner

# Squared Euclidean distance between two 2-D points
def distanceSquared(p1,p2):
    """Return (p1[0]-p2[0])**2 + (p1[1]-p2[1])**2 for two indexable points."""
    dx = p1[0] - p2[0]
    dy = p1[1] - p2[1]
    return dx ** 2 + dy ** 2

# Component-wise sum of two 2-D points
def addPoints(p1,p2):
    """Return a new two-element list holding the sums of the first and second coordinates."""
    x_total = p1[0] + p2[0]
    y_total = p1[1] + p2[1]
    return [x_total, y_total]

In [29]:
K = 5 # number of means (cluster center points) to find
convergeDist = 0.1 # the iteration loop stops once the summed squared movement of the centers is no longer above this value

In [71]:
# starting points: draw K distinct points from the cached `points` RDD
# takeSample(withReplacement, num, seed = None) -- the seed is fixed for reproducibility
# (`loc` is not defined anywhere in this notebook; the RDD is called `points`.)
kPoints = points.takeSample(False, K, 42)
kPoints

[(34.0830381107, -117.960562808),
 (34.2480006224, -117.931551969),
 (34.3719071909, -117.850561452),
 (38.4399201611, -121.019109788),
 (33.6093366014, -111.769407277)]

In [72]:
# Iterate until the centers stop moving: `dist` accumulates the squared distance
# each updated center moved during the last pass and is compared with convergeDist.
dist = float("+inf")
while dist > convergeDist:
    # for each point, find the index of the closest kpoint.  map to (index, (point,1))
    closest = points.map(lambda point : (closestPoint(point, kPoints), (point, 1)))
    # for each key (k-point index), reduce by adding the coordinates and number of points
    point_stats = closest.reduceByKey(lambda accum, n: (addPoints(accum[0],n[0]),accum[1]+n[1]))
    # for each key (k-point index), find a new point by calculating the average of each closest point
    newPoints = point_stats.map(lambda x: (x[0],[x[1][0][0]/x[1][1],x[1][0][1]/x[1][1]])).collect()
    # calculate the total of the distance between the current points and new points
    dist=0
    for i, point in newPoints: 
        dist += distanceSquared(kPoints[i],point)
    print("Distance between iterations:", dist)
    # Copy the new points to the kPoints array for the next iteration
    # (only indices present in newPoints are overwritten; a center that attracted
    # no points keeps its previous position)
    for i, point in newPoints: 
        kPoints[i] = point
        
print("Final center points: ",kPoints)

Distance between iterations: 4.168687670254237
Distance between iterations: 3.745890322578019
Distance between iterations: 1.666217245297977
Distance between iterations: 0.5097386038620183
Distance between iterations: 0.8317221662820514
Distance between iterations: 1.5694681458063235
Distance between iterations: 2.3535047857216482
Distance between iterations: 0.9390267476897196
Distance between iterations: 0.06568718669542292
Final center points:  [[34.272325629615345, -117.8304037284927], [38.056132558472484, -121.205019936655], [36.70143101357855, -114.65485840414931], [43.924776157124676, -121.37436192818824], [33.687196560692406, -111.04680557861003]]


## Spark MLlib RDD-based API (to be deprecated in Spark 3.0) vs Spark MLlib DataFrame-based API

##### Sample pipeline and evaluation:

In [2]:
# MLlib RDD-based API (Spark MLlib)
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics
# Sample observations as LabeledPoint(label, features). The original snippet had
# unbalanced brackets around a literal `...` placeholder and could not be parsed.
obs = sc.parallelize([
    LabeledPoint(1.0, [1.0, 0.0, 10.0]),
    # ... add the remaining observations here ...
    LabeledPoint(1.0, [1.0, 0.0, -0.5]),
])
training, test = obs.randomSplit([0.6,0.4]) # random
# regression
# Train on the training split, then score the held-out rows as (prediction, label) pairs.
# NOTE(review): LogisticRegressionWithSGD comes from pyspark.mllib.classification, so the
# regression metrics below are computed on a classifier's output — confirm this is intended.
model = LogisticRegressionWithSGD.train(training)
valuesAndPreds = test.map(lambda p: (float(model.predict(p.features)), p.label))
metrics = RegressionMetrics(valuesAndPreds)
print("MAE = ", metrics.meanAbsoluteError)
print("MSE = ", metrics.meanSquaredError)
print("RMSE = ", metrics.rootMeanSquaredError)
print("R-squared = ", metrics.r2)
print("Explained variance = ", metrics.explainedVariance)
# classification
# Retrain with L-BFGS and build (prediction, label) pairs on the test split.
model = LogisticRegressionWithLBFGS.train(training)
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))
# binary classification metrics (area under the PR curve and under the ROC curve)
metrics = BinaryClassificationMetrics(predictionAndLabels)
print("AUC CP = ", metrics.areaUnderPR)
print("AUC ROC = ", metrics.areaUnderROC)
# accuracy: fraction of test rows whose prediction equals the label.
# The comparison results are collected to the driver and counted there;
# NOTE(review): an empty test split would make nTests 0 and raise ZeroDivisionError.
tests = predictionAndLabels.map(lambda pair:pair[0]==pair[1]).collect()
nTests = len(tests)
nHits = tests.count(True)
accuracy = float(nHits)/nTests
print("Accuracy = ", accuracy)
# multiclass metrics, reported for the class labelled 1.0
metrics = MulticlassMetrics(predictionAndLabels)
print("Precision = ", metrics.precision(1.0))
print("Recall = ", metrics.recall(1.0))
print("F1 = ", metrics.fMeasure(1.0, beta=1.0))
# predict a single new observation given as a plain feature list
newobs = [1.0, 0.0, 10.0]
model.predict(newobs)

# MLlib DataFrame-based API (Spark Ml)
from pyspark.sql import SQLContext
# The DataFrame-based estimators expect pyspark.ml.linalg vectors; vectors from
# pyspark.mllib.linalg are a different type and are rejected by `fit`.
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# `sqlContext` is not defined earlier in the notebook; create it from `sc`.
sqlContext = SQLContext(sc)
# Sample observations as (label, features) rows. The original snippet used a
# literal `...` placeholder without a trailing comma, which fails at runtime.
obsdf = sqlContext.createDataFrame([
    (1.0, Vectors.dense([1.0, 0.0, 10.0])),
    # ... add the remaining observations here ...
    (1.0, Vectors.dense([1.0, 0.0, -0.5])),
    ], ["label", "features"])
training, test = obsdf.randomSplit([0.6, 0.4])
# regression
# Fit on the training split and append predictions to the test rows.
lr = LogisticRegression()
lrModel = lr.fit(training)
predictedobsdf = lrModel.transform(test)
predictionAndLabels = predictedobsdf.select("prediction", "label")
evaluator = RegressionEvaluator()
# The Param attribute is `metricName` (capital N); `metricname` raises AttributeError.
MAE = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "mae"})
MSE = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "mse"})
RMSE = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "rmse"})
# classification
lr = LogisticRegression()
lrModel = lr.fit(training)
predictedobsdf = lrModel.transform(test)
predictionAndLabels = predictedobsdf.select("probability", "label")
# binary classification metrics, computed from the "probability" column
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability")
AUPR = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "areaUnderPR"})
AUROC = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "areaUnderROC"})
# accuracy: fraction of rows whose prediction equals the label (counted on the driver)
predictionAndLabels = predictedobsdf.select("prediction", "label")
tests = predictionAndLabels.rdd.map(lambda pair: pair[0]==pair[1]).collect()
nTests = len(tests)
nHits = tests.count(True)
accuracy = float(nHits)/nTests
# multiclass
predictionAndLabels = predictedobsdf.select("prediction", "label")
# MulticlassClassificationEvaluator takes `predictionCol`; it has no `rawPredictionCol`.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
# Spark 2.x+ names these metrics "weightedPrecision" / "weightedRecall";
# plain "precision" / "recall" are not accepted there.
precision = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "weightedRecall"})
F = evaluator.evaluate(predictionAndLabels, {evaluator.metricName: "f1"})
# Each row must be a tuple: the trailing comma makes a one-element tuple
# (without it the parentheses are just grouping and the row is a bare vector).
newobsdf = sqlContext.createDataFrame([
    (Vectors.dense([1.0, 0.0, 10.0]),)
    ], ["features"])
lrModel.transform(newobsdf).select("prediction").show() # predict

##### Regression algorithms - Linear Regression:

In [None]:
# MLlib RDD-based API (Spark MLlib): https://spark.apache.org/docs/latest/mllib-linear-methods.html
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
# RegressionMetrics lives in pyspark.mllib.evaluation, not in the package root.
from pyspark.mllib.evaluation import RegressionMetrics
# load file
csvfile = "file:///.../student-mat.csv"
data = sc.textFile(csvfile)
# numeric transformation: plain text substitution applied to every whole line.
# NOTE(review): these replacements hit every occurrence of the substring on the
# line (not only the intended column values) — verify against the actual file.
data = data.map(lambda x: x.replace("M", "1"))
data = data.map(lambda x: x.replace("F", "2"))
data = data.map(lambda x: x.replace("yes", "1"))
data = data.map(lambda x: x.replace("no", "0"))
# split each line into its ';'-separated fields
datacsv = data.map(lambda x: x.split(";"))
    # non binary version
# LabeledPoint generation: field 8 is used as the label, fields 0-6 as the features
labeleddata = datacsv.map(lambda x: LabeledPoint(x[8], x[:7]))
# train-test sets generation (random split with weights 0.8 / 0.2)
training, test = labeleddata.randomSplit([0.8, 0.2])
    # binary version
def binarize(v):
    """Return a copy of row `v` whose field 8 is replaced by a binary label.

    Field 8 is parsed with int(); values greater than 10 become 0, all other
    values become 1. The input row is left untouched, so the function is safe
    to apply to rows that are reused or cached elsewhere.
    """
    out = list(v)
    out[8] = 0 if int(v[8]) > 10 else 1
    return out
# apply binarize to every row so that field 8 holds a 0/1 label
datacsvbin = datacsv.map(binarize)
# LabeledPoint generation: field 8 is the label, fields 0-6 are the features
labeleddatabin = datacsvbin.map(lambda x: LabeledPoint(x[8], x[:7]))
# train-test sets generation (random split with weights 0.8 / 0.2)
trainingbin, testbin = labeleddatabin.randomSplit([0.8, 0.2])
# training on the non-binary split
model = LinearRegressionWithSGD.train(training) # with Stochastic Gradient Descent
print(model.weights)
# evaluation: build (prediction, label) pairs on the test split and report the mean absolute error
valuesAndPreds = test.map(lambda p: (float(model.predict(p.features)), p.label))
metrics = RegressionMetrics(valuesAndPreds)
print("MAE = ", metrics.meanAbsoluteError)


# MLlib DataFrame-based API (Spark Ml): 
from pyspark.sql import Row, SQLContext
# The DataFrame-based estimators expect pyspark.ml.linalg vectors; vectors from
# pyspark.mllib.linalg are a different type and are rejected by `fit`.
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
# load file
csvfile = "file:///.../student-mat.csv"
sqlContext = SQLContext(sc)
# non binary version
# dataframe creation: field 8 is the label, fields 0-6 are the feature vector
dfs = datacsv.map(lambda x: (float(x[8]), Vectors.dense(x[0:7])))
datadf = sqlContext.createDataFrame(dfs, ["label", "features"])
# train-test sets generation (random split with weights 0.8 / 0.2)
traindf, testdf = datadf.randomSplit([0.8, 0.2])
    # binary version
def binarize(v):
    """Return a copy of row `v` whose field 8 is replaced by a binary label.

    Field 8 is parsed with int(); values greater than 10 become 0, all other
    values become 1. The input row is left untouched, so the function is safe
    to apply to rows that are reused or cached elsewhere.
    """
    out = list(v)
    out[8] = 0 if int(v[8]) > 10 else 1
    return out
# apply binarize to every row so that field 8 holds a 0/1 label
datacsvbin = datacsv.map(binarize)
# dataframe creation: field 8 is the label, fields 0-6 are the feature vector
dfsbin = datacsvbin.map(lambda x: (float(x[8]), Vectors.dense(x[0:7])))
datadfbin = sqlContext.createDataFrame(dfsbin, ["label", "features"])
# train-test sets generation (random split with weights 0.8 / 0.2)
traindfbin, testdfbin = datadfbin.randomSplit([0.8, 0.2])
# training
# The estimator is configured through keyword parameters only; the training
# DataFrame is passed to fit(), not to the constructor.
trainer = LinearRegression()
model = trainer.fit(traindfbin)
print("Coefficients: ", str(model.coefficients))
# evaluation
predicteddf = model.transform(testdfbin)
predictionAndLabels = predicteddf.select("prediction", "label")
evaluator = RegressionEvaluator()
# The param map belongs inside the evaluate() call; with the parenthesis closed
# early, MAE became a (value, dict) tuple computed with the default metric.
MAE = evaluator.evaluate(predictionAndLabels, {evaluator.metricName:"mae"})

##### Regression algorithms - Decision Trees:

In [None]:
# MLlib RDD-based API (Spark MLlib): https://spark.apache.org/docs/latest/mllib-linear-methods.html
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.evaluation import RegressionMetrics
# model
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={}, impurity='variance')
print(model.toDebugString()) # shows DecisionTreeModel info
# evaluation
predictions = model.predict(test.map(lambda x: x.features)) # need to do it separately, does not support a map with a prediction inside
# RegressionMetrics expects (prediction, observation) pairs, so predictions go
# first; this matches how valuesAndPreds is built in the other RDD examples.
valuesAndPreds = predictions.zip(test.map(lambda lp: lp.label))
metrics = RegressionMetrics(valuesAndPreds)
print("MAE = ", metrics.meanAbsoluteError)

# MLlib DataFrame-based API (Spark Ml): 
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
# model
# NOTE(review): the label is passed through a StringIndexer here, while the
# evaluator below compares "prediction" against the raw "label" column —
# confirm that indexing the label is really wanted for a regression task.
labelIndexer = StringIndexer(inputCol = "label", outputCol = "indexedLabel").fit(datadf)
featureIndexer = VectorIndexer(inputCol = "features", outputCol = "indexedFeatures", maxCategories=21).fit(datadf)
# The learner is the DecisionTreeRegressor imported above (not a second
# StringIndexer), and its keyword is `featuresCol`.
trainer = DecisionTreeRegressor(labelCol = "indexedLabel", featuresCol = "indexedFeatures")
pipeline = Pipeline(stages = [labelIndexer, featureIndexer, trainer])
# `traindf` / `testdf` are the splits created from `datadf` earlier in the notebook
model = pipeline.fit(traindf)
# evaluation
predicteddf = model.transform(testdf)
predictionAndLabels = predicteddf.select("prediction", "label")
evaluator = RegressionEvaluator()
MAE = evaluator.evaluate(predictionAndLabels, {evaluator.metricName : "mae"})

##### Regression algorithms - Decision Tree Ensembles (Random Forest & Gradient Boosting):

In [None]:
# MLlib RDD-based API (Spark MLlib): https://spark.apache.org/docs/latest/mllib-linear-methods.html
# RandomForest is used below and therefore has to be imported as well.
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel, RandomForest
from pyspark.mllib.evaluation import RegressionMetrics
# model
modelRF = RandomForest.trainRegressor(training, categoricalFeaturesInfo={}, numTrees=3)
modelFB = GradientBoostedTrees.trainRegressor(training, categoricalFeaturesInfo={}, numIterations=3)
# Inspect and evaluate BOTH ensembles; the original cell used a leftover
# `model` variable from a previous cell instead of the models trained here.
for name, ensemble in (("Random Forest", modelRF), ("Gradient Boosting", modelFB)):
    print(name)
    print(ensemble.toDebugString()) # shows the ensemble's trees
    # evaluation
    predictions = ensemble.predict(test.map(lambda x: x.features)) # need to do it separately, does not support a map with a prediction inside
    # RegressionMetrics expects (prediction, observation) pairs
    valuesAndPreds = predictions.zip(test.map(lambda lp: lp.label))
    metrics = RegressionMetrics(valuesAndPreds)
    print("MAE = ", metrics.meanAbsoluteError)

##### Classification algorithms - Support Vector Machines:

In [None]:
# MLlib RDD-based API (Spark MLlib): https://spark.apache.org/docs/latest/mllib-linear-methods.html
# The class is spelled SVMWithSGD (capital W); `SVMwithSGD` raises ImportError.
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.evaluation import MulticlassMetrics
# model
# SVMWithSGD just supports binary problems
model = SVMWithSGD.train(trainingbin, iterations=100)
print(model.weights)
# evaluation
# MulticlassMetrics expects (prediction, label) pairs, in that order.
predictionAndLabels = testbin.map(lambda p: (float(model.predict(p.features)), p.label))
metrics = MulticlassMetrics(predictionAndLabels)
# NOTE(review): calling fMeasure() without a label is presumably version
# dependent — confirm against the Spark release in use.
print("F1 = ", metrics.fMeasure())

##### Classification algorithms - Bayes:

In [None]:
# MLlib RDD-based API (Spark MLlib): https://spark.apache.org/docs/latest/mllib-linear-methods.html
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.evaluation import MulticlassMetrics
# model (second argument is the smoothing parameter)
model = NaiveBayes.train(training, 1.0)
print(model.pi) # model class probs
print(model.theta) # model feature probs
# evaluation
# Evaluate on the split that matches the training data: the model is trained
# on `training`, so it is scored on `test` (not on the binarized `testbin`).
# MulticlassMetrics expects (prediction, label) pairs, in that order.
predictionAndLabels = test.map(lambda p: (float(model.predict(p.features)), p.label))
metrics = MulticlassMetrics(predictionAndLabels)
# NOTE(review): calling fMeasure() without a label is presumably version
# dependent — confirm against the Spark release in use.
print("F1 = ", metrics.fMeasure())

## Recommender systems

##### ALS algorithm (Alternating Least Squares)

In [13]:
from pyspark.mllib.recommendation import Rating, ALS

In [8]:
# Recommendation input uses a dedicated record type: Rating(user, product, rating)
rating_rows = [
    (1, 1, 1.0),
    (1, 2, 1.0),
    (2, 1, 1.0),
    (2, 3, 1.0),
    (3, 2, 1.0),
]
ratings = sc.parallelize([Rating(user, product, score) for user, product, score in rating_rows])

In [10]:
# training: fit an ALS model on the ratings using 2 latent factors (rank=2)
model = ALS.train(ratings, rank=2)

In [12]:
# prediction
model.recommendProducts(1,3) # 3 products for user 1
# Careful: the result also contains products the user has already rated
# (see products 1 and 2 in the output), so the application has to filter them out.

[Rating(user=1, product=1, rating=0.9955445398917746),
 Rating(user=1, product=2, rating=0.9843777028424281),
 Rating(user=1, product=3, rating=-0.3855545946576022)]

## Saving a model

In [None]:
# Saving and re-loading a model. String literals need straight quotes (the
# typographic quotes in the original are a syntax error), and loading is done
# through the model class: LinearRegressionWithSGD is the trainer and has no load().
# NOTE(review): assumes `model` is a LinearRegressionModel at this point — use
# the matching model class when saving a different kind of model.
from pyspark.mllib.regression import LinearRegressionModel
model.save(sc, "rutaDeMiArchivoDeModelo")
model = LinearRegressionModel.load(sc, "rutaDeMiArchivoDeModelo")

## Pipelines 
#### Spark MLlib DataFrame-based API

In [8]:
from pyspark.sql import SQLContext, Row, DataFrame
from pyspark.ml import Pipeline
from pyspark.ml.feature import Bucketizer
from pyspark.ml.regression import LinearRegression
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [7]:
# Load the data
sqlContext = SQLContext(sc)

training = sqlContext.createDataFrame([
            (29, 1.85, 1.0),
            (26, 1.96, 3.0),
            (24, 2.05, 4.0),
            (30, 1.98, 3.0),
            (29, 1.91, 2.0),
            (19, 2.17, 5.0)],
            ["age", "height", "label"])

# Configure a three-stage pipeline: bucketizer, assembler, lr
# - bucketizer maps "height" into three buckets split at 1.90 and 2.00
# - assembler packs "age" and the bucketed height into the "features" vector
bucketizer = Bucketizer(splits=[-float("inf"), 1.90, 2.00, float("inf")], inputCol="height", outputCol="bucketHeight")
assembler = VectorAssembler(inputCols=["age", "bucketHeight"],outputCol="features")
lr = LinearRegression()
pipeline = Pipeline(stages=[bucketizer, assembler, lr])

# Training
model = pipeline.fit(training)

# Prediction data: same columns as the training data, without "label"
test = sqlContext.createDataFrame([(26, 1.87),(28, 1.99)],["age", "height"])

# Prediction
prediction = model.transform(test)
selected = prediction.select("age", "height", "prediction")
for row in selected.collect():
    print(row)

Row(age=26, height=1.87, prediction=1.469387755102058)
Row(age=28, height=1.99, prediction=2.591836734693874)


## Text classifiers

In [11]:
from pyspark.ml.feature import Tokenizer

In [14]:
# Create a DataFrame with a single "text" column
df = sqlContext.createDataFrame([("a B c 1 he's",)], ["text"])
# Create the tokenizer, giving its source and destination columns
tokenizer = Tokenizer(inputCol="text", outputCol="words")
# There is no fit() step here, only transform()
tokenizer.transform(df).head() # tokens are lower-cased and the apostrophe is not treated as a separator

Row(text="a B c 1 he's", words=['a', 'b', 'c', '1', "he's"])

In [16]:
from pyspark.ml.feature import RegexTokenizer

In [17]:
# Create the tokenizer (default pattern)
reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
# Create a DataFrame
df = sqlContext.createDataFrame([("Tengo 2, y tú no",)], ["text"])
# NOTE(review): the original comment said "the accent was not recognized"; the
# recorded output keeps 'tú' intact and leaves the comma attached to '2,'.
reTokenizer.transform(df).head()

Row(text='Tengo 2, y tú no', words=['tengo', '2,', 'y', 'tú', 'no'])

In [29]:
# Create a new DataFrame
df = sqlContext.createDataFrame([('correo correo@ejemplo.com',)], ["text"])

# Default regular expression
reTokenizer = RegexTokenizer(inputCol="text", outputCol="words")
print(reTokenizer.transform(df).head())

# Regular expression = whitespace, "@" and "."
# (raw string: `\s` and `\.` are invalid escape sequences in a normal string
# literal and trigger a warning; the pattern value itself is unchanged)
reTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern=r"[\s@\.]+", gaps=True)
print(reTokenizer.transform(df).head())

# Same regular expression, but with gaps=False the matches themselves
# (i.e. the separators) are returned as tokens
reTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern=r"[\s@\.]+", gaps=False)
print(reTokenizer.transform(df).head())

Row(text='correo correo@ejemplo.com', words=['correo', 'correo@ejemplo.com'])
Row(text='correo correo@ejemplo.com', words=['correo', 'correo', 'ejemplo', 'com'])
Row(text='correo correo@ejemplo.com', words=[' ', '@', '.'])


In [24]:
from pyspark.sql import Row
from pyspark.ml.feature import NGram

In [35]:
# Build a one-row DataFrame holding a list of tokens
df = sqlContext.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])])

# NGram transformer producing sequences of length 2
ngram = NGram(n=2, inputCol="inputTokens", outputCol="nGrams")
bigram_row = ngram.transform(df).head()
print(bigram_row)

# Switch the same transformer to sequences of length 4 and run it again
ngram.setParams(n=4)
fourgram_row = ngram.transform(df).head()
print(fourgram_row)


Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b', 'b c', 'c d', 'd e'])
Row(inputTokens=['a', 'b', 'c', 'd', 'e'], nGrams=['a b c d', 'b c d e'])


In [36]:
from pyspark.ml.feature import StopWordsRemover

In [37]:
# Create a DataFrame
df = sqlContext.createDataFrame([Row(inputTokens=["a", "b", "c", "d", "e"])])
# Create a StopWordsRemover with an explicit stop-word list
stopwords = StopWordsRemover(inputCol="inputTokens", outputCol="outputTokens", stopWords=["a","b"])
print(stopwords.transform(df).head())

# These objects ship with a default stop-word list for English
# (shown as the default of `stopWords` in the parameter listing)
print(stopwords.explainParams())

Row(inputTokens=['a', 'b', 'c', 'd', 'e'], outputTokens=['c', 'd', 'e'])
caseSensitive: whether to do a case sensitive comparison over the stop words (default: False)
inputCol: input column name. (current: inputTokens)
inputCols: input column names. (undefined)
locale: locale of the input. ignored when case sensitive is true (default: en_GB)
outputCol: output column name. (default: StopWordsRemover_727d836dd65e__output, current: outputTokens)
outputCols: output column names. (undefined)
stopWords: The words to be filtered out (default: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 

In [41]:
# One-row DataFrame with the sentence to process
df = sqlContext.createDataFrame([("You are a friend of mine",)], ["text"])

# Tokenize the input text
tokenizer = Tokenizer(inputCol="text", outputCol="words")
df2=tokenizer.transform(df)
df2.show()

# Remove the words found in the stop-word list (no list is passed, so the default one is used)
stopwords = StopWordsRemover(inputCol="words", outputCol="filteredWords")
print(stopwords.transform(df2).head())

+--------------------+--------------------+
|                text|               words|
+--------------------+--------------------+
|You are a friend ...|[you, are, a, fri...|
+--------------------+--------------------+

Row(text='You are a friend of mine', words=['you', 'are', 'a', 'friend', 'of', 'mine'], filteredWords=['friend', 'mine'])


**Indexación y sistemas de pesos**

In [45]:
from pyspark.ml.feature import CountVectorizer

# Create a DataFrame of texts given as word sequences, each with a label
df = sqlContext.createDataFrame([(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],["label", "raw"])

# Define the CountVectorizer with its parameters
cv = CountVectorizer(inputCol="raw", outputCol="vectors")

# Training = building the vocabulary
model = cv.fit(df)

# Transforming = representing each row with respect to that vocabulary
model.transform(df).show(truncate=False)

# Print the vocabulary. A bare expression in the middle of a cell is evaluated
# but never displayed, so it has to be printed explicitly.
print(sorted(map(str, model.vocabulary)))

print(cv.explainParams())


+-----+---------------+-------------------------+
|label|raw            |vectors                  |
+-----+---------------+-------------------------+
|0    |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1    |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+-----+---------------+-------------------------+

binary: Binary toggle to control the output vector values. If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts. Default False (default: False)
inputCol: input column name. (current: raw)
maxDF: Specifies the maximum number of different documents a term could appear in to be included in the vocabulary. A term that appears more than the threshold will be ignored. If this is an integer >= 1, this specifies the maximum number of documents the term could appear in; if this is a double in [0,1), then this specifies the maximum fraction of documents the term could appear in. Defa

In [46]:
from pyspark.ml.feature import HashingTF
# Build a one-row DataFrame whose single column holds a list of words
word_rows = [(["a", "b", "c", "c"],)]
df = sqlContext.createDataFrame(word_rows, ["words"])
# HashingTF configured with 10 features
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=10)
first_row = hashingTF.transform(df).head()
first_row.features

SparseVector(10, {5: 1.0, 7: 1.0, 8: 2.0})

In [62]:
# Compute IDF weights over a collection of document vectors
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import IDF, IDFModel
# Three documents, each given as a 2-dimensional term-frequency vector
df = sqlContext.createDataFrame([(DenseVector([1.0, 2.0]),),(DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)], ["tf"])
idf = IDF(minDocFreq=3, inputCol="tf", outputCol="idf")
model = idf.fit(df)
# Show the IDF-weighted vector of the first document
model.transform(df).head().idf

DenseVector([0.0, 0.0])

## Text classifier - Sentiment analysis

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row
from pyspark.sql import SQLContext, Row, DataFrame

sqlContext = SQLContext(sc)

# Create a collection of documents given as (id, text, label) triples
training = sqlContext.createDataFrame([
        (0, "me gusta el iphone", 1.0),
        (1, "odio el samsung galaxy", 0.0),
        (2, "me gusta el galaxy", 1.0),
        (3, "no me gusta el nokia communicator", 0.0)], ["id", "text", "label"])

# Configure a three-stage pipeline: tokenizer, hashingTF, and lr.
# hashingTF reads the column the tokenizer writes to.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Train the pipeline on the training data
model = pipeline.fit(training)

In [8]:
# Unlabelled test documents as (id, text) pairs
test_rows = [
    (4, "me gusta spark"),
    (5, "odio spark"),
    (6, "no me gusta la fruta"),
    (7, "odio la fruta"),
]
test = sqlContext.createDataFrame(test_rows, ["id", "text"])

# Classify with the trained pipeline (the `model` object)
prediction = model.transform(test)
selected = prediction.select("id", "text", "prediction")
for row in selected.collect():
    print(row)

Row(id=4, text='me gusta spark', prediction=1.0)
Row(id=5, text='odio spark', prediction=0.0)
Row(id=6, text='no me gusta la fruta', prediction=1.0)
Row(id=7, text='odio la fruta', prediction=0.0)
