In [1]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.ml.linalg import SparseVector

# Create a SparseVector for a text classification task (e.g., spam classification)
print("Creating a SparseVector for text classification:")

# Label indicating whether the email is spam (1.0) or not (0.0)
label = 1.0  # Indicates spam

# Features represent word counts in an email
# For example, the first word is "discount," the second word is "free," and so on.
# [0, 2] means the words "discount" and "free" are present,
# and [1.0, 3.0] means "discount" occurs once, and "free" occurs three times.
print("Creating features:")
features = SparseVector(5, [0, 2], [1.0, 3.0])

# Display the label and features
print(f'Label: {label}')
print(f'Features (SparseVector):{features}')

Creating a SparseVector for text classification:
Creating features:
Label: 1.0
Features (SparseVector):(5,[0,2],[1.0,3.0])


In [2]:
# Import libraries
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# Simulated Example: Email Spam Detection
## Create a labeled point with a positive label and a dense feature vector
pos_label = 1.0
# Features represent: (word 'hello' count, word 'offer' count, number of links etc.,)
pos_features_dense = [1.0, 0.0, 3.0]
pos = LabeledPoint(pos_label, pos_features_dense)
print(f'Positive Example {pos}:')

## Create a labeled point with a negative label and a sparse feature vector
neg_label = 0.0
# Features represent: (word 'hello' count, word 'offer' count, number of links etc.,)
neg_features_sparse = SparseVector(3, [0, 2], [1.0, 3.0])
neg = LabeledPoint(neg_label, neg_features_sparse)
print(f'Negative Example: {neg}')

Positive Example (1.0,[1.0,0.0,3.0]):
Negative Example: (0.0,(3,[0,2],[1.0,3.0]))


In [4]:
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the Dataset
print("Loading the Amazon Product Reviews dataset")
df = spark.read.csv("Amazon_Reviews.csv", header=True, inferSchema=True)

# Tokenize the text
print("Tokenizing the text")
tokenizer = Tokenizer(inputCol="Review", outputCol="words")
words = tokenizer.transform(df)

# Apply the `HashingTF` transformer
print("Applying HashingTF transformation to convert words into features")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(words)
featurizedData.show()

Loading the Amazon Product Reviews dataset
Tokenizing the text
Applying HashingTF transformation to convert words into features
+------+--------------------+--------------------+--------------------+--------------------+
|Rating|               Title|              Review|               words|         rawFeatures|
+------+--------------------+--------------------+--------------------+--------------------+
|     1|Stuning even for ...|This sound track ...|[this, sound, tra...|(20,[0,1,3,4,5,6,...|
|     1|The best soundtra...|I'm reading a lot...|[i'm, reading, a,...|(20,[0,1,2,3,4,5,...|
|     1|            Amazing!|"This soundtrack ...|["this, soundtrac...|(20,[0,2,3,4,5,6,...|
|     1|Excellent Soundtrack|I truly like this...|[i, truly, like, ...|(20,[0,1,2,3,4,5,...|
|     1|Remember, Pull Yo...|If you've played ...|[if, you've, play...|(20,[0,1,2,3,4,5,...|
|     1|an absolute maste...|I am quite sure a...|[i, am, quite, su...|(20,[0,1,2,3,4,5,...|
|     0|        Buyer beware|"This 

# TF-IDF (term frequency-inverse document frequency)

In [5]:
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the Dataset
print("Step1: Loading the Amazon Product Reviews dataset")
df = spark.read.csv("Amazon_Reviews.csv", header=True, inferSchema=True)

# Tokenize the text
print("Step 2: Tokenizing the text")
tokenizer = Tokenizer(inputCol="Review", outputCol="words")
words = tokenizer.transform(df)

# Apply the `HashingTF` Transformer
print("Step 3: Applying HashingTF transformation to convert words into features")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(words)

# Apply IDF to rescale the raw TF
print("Step 4: Applying IDF to rescale the raw features")
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)

# Transform the featurized data
print("Step 5: Transforming the rescaled featured data")
rescaledData = idfModel.transform(featurizedData)

# Select the ID and features columns
print("Displaying the 'Rating' and 'features' columns")
rescaledData.select("Rating", "features").show()

Step1: Loading the Amazon Product Reviews dataset
Step 2: Tokenizing the text
Step 3: Applying HashingTF transformation to convert words into features
Step 4: Applying IDF to rescale the raw features
Step 5: Transforming the rescaled featured data
Displaying the 'Rating' and 'features' columns
+------+--------------------+
|Rating|            features|
+------+--------------------+
|     1|(20,[0,1,3,4,5,6,...|
|     1|(20,[0,1,2,3,4,5,...|
|     1|(20,[0,2,3,4,5,6,...|
|     1|(20,[0,1,2,3,4,5,...|
|     1|(20,[0,1,2,3,4,5,...|
|     1|(20,[0,1,2,3,4,5,...|
|     0|(20,[0,1,2,3,4,5,...|
|     1|(20,[0,1,2,3,4,5,...|
|     1|(20,[0,1,2,3,4,5,...|
|     1|(20,[0,1,2,3,4,5,...|
|     0|(20,[0,1,3,4,6,7,...|
|     1|(20,[0,1,2,3,4,5,...|
|     1|(20,[0,1,2,3,5,6,...|
|     0|(20,[0,1,2,3,4,5,...|
|     0|(20,[0,1,2,3,4,5,...|
|     0|(20,[0,1,2,3,4,5,...|
|     1|(20,[0,1,2,3,4,5,...|
|     1|(20,[0,1,2,3,4,5,...|
|     1|(20,[0,1,2,3,4,5,...|
|     0|(20,[0,1,3,4,5,6,...|
+------+-------

# Word2Vec

In [6]:
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, Word2Vec

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the Dataset
print("Step 1: Loading the Amazon Product Reviews dataset")
df = spark.read.csv("Amazon_Reviews.csv", header=True, inferSchema=True)

# Tokenize the text
print("Step 2: Tokenizing the text")
tokenizer = Tokenizer(inputCol="Review", outputCol="words")
words = tokenizer.transform(df)

# Create a Word2Vec and fit the Word2Vec model
print("Step 3: Create a Word2Vec feature extraction")
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol="words", outputCol="result")
print("Fit the model")
model = word2Vec.fit(words)

# Transform the data using the trained Word2Vec model
print("Step 4: Transform the model")
result = model.transform(words)

# Print the transformed data
print("Step 5: Print the tranformed data")
for row in result.select("Review", "result").collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (text, vector))

Step 1: Loading the Amazon Product Reviews dataset
Step 2: Tokenizing the text
Step 3: Create a Word2Vec feature extraction
Fit the model


24/10/01 19:09:35 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/10/01 19:09:35 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


Step 4: Transform the model
Step 5: Print the tranformed data
Text: [This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^] => 
Vector: [-0.0015845583334642775,-0.025008490451922022,-0.007626214157789946,0.0008982458959993286,-0.014928634186896186,0.01996753474076589,0.004927067707758397,0.011501849337946624,-0.014161042856673401,-0.011615786196974417,0.004535496182894955,0.013640607584578296,0.01672013845915596,0.016148326626668374,-0.003099621023866348,-0.01865020653853814,0.015940518599624438,-0.01721643411864837,0.0070323015206183,-0.0020925465140802166,-0.003506271531805396,-0.01803680199198425,0.013294978168172142,0.010091892949615917,0.00338227

# CountVectorizer

In [7]:
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, CountVectorizer

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the Dataset
print("Step 1: Loading the Amazon Product Reviews dataset")
df = spark.read.csv("Amazon_Reviews.csv", header=True, inferSchema=True)

# Tokenize the text
print("Step 2: Tokenizing the text")
tokenizer = Tokenizer(inputCol="Review", outputCol="words")
words = tokenizer.transform(df)

# Create a CountVectorizer and fit the CountVectorizer model
print("Step 3: Create a CountVectorizer feature extraction")
cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=0.1)
print("Fit the model")
model = cv.fit(words)

# Transform the data using the trained CountVectorizer model
print("Step 4: Transform the model")
result = model.transform(words)

# Print the transformed data
print("Print the tranformed data")
result.show()

Step 1: Loading the Amazon Product Reviews dataset
Step 2: Tokenizing the text
Step 3: Create a CountVectorizer feature extraction
Fit the model
Step 4: Transform the model
Print the tranformed data
+------+--------------------+--------------------+--------------------+--------------------+
|Rating|               Title|              Review|               words|            features|
+------+--------------------+--------------------+--------------------+--------------------+
|     1|Stuning even for ...|This sound track ...|[this, sound, tra...|(3,[0,1,2],[4.0,2...|
|     1|The best soundtra...|I'm reading a lot...|[i'm, reading, a,...|(3,[0,1,2],[3.0,3...|
|     1|            Amazing!|"This soundtrack ...|["this, soundtrac...| (3,[0,1],[6.0,2.0])|
|     1|Excellent Soundtrack|I truly like this...|[i, truly, like, ...|(3,[0,1,2],[4.0,9...|
|     1|Remember, Pull Yo...|If you've played ...|[if, you've, play...| (3,[0,1],[8.0,3.0])|
|     1|an absolute maste...|I am quite sure a...|[i, am,

# Tokenization


In [8]:
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the dataset
print("Loading the Amazon Product Reviews dataset")
df = spark.read.csv("Amazon_Reviews.csv", header=True, inferSchema=True)

# Tokenize the text
print("Tokenizing the text")
tokenizer = Tokenizer(inputCol="Review", outputCol="words")
words = tokenizer.transform(df)

# Loop through the words DataFrame and print tokenized words
print("Tokenized words:")
for row in words.collect():
    print(row["words"])

Loading the Amazon Product Reviews dataset
Tokenizing the text
Tokenized words:
['this', 'sound', 'track', 'was', 'beautiful!', 'it', 'paints', 'the', 'senery', 'in', 'your', 'mind', 'so', 'well', 'i', 'would', 'recomend', 'it', 'even', 'to', 'people', 'who', 'hate', 'vid.', 'game', 'music!', 'i', 'have', 'played', 'the', 'game', 'chrono', 'cross', 'but', 'out', 'of', 'all', 'of', 'the', 'games', 'i', 'have', 'ever', 'played', 'it', 'has', 'the', 'best', 'music!', 'it', 'backs', 'away', 'from', 'crude', 'keyboarding', 'and', 'takes', 'a', 'fresher', 'step', 'with', 'grate', 'guitars', 'and', 'soulful', 'orchestras.', 'it', 'would', 'impress', 'anyone', 'who', 'cares', 'to', 'listen!', '^_^']
["i'm", 'reading', 'a', 'lot', 'of', 'reviews', 'saying', 'that', 'this', 'is', 'the', 'best', "'game", "soundtrack'", 'and', 'i', 'figured', 'that', "i'd", 'write', 'a', 'review', 'to', 'disagree', 'a', 'bit.', 'this', 'in', 'my', 'opinino', 'is', 'yasunori', "mitsuda's", 'ultimate', 'masterpiece.

# Scaling and normalization

In [9]:
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StandardScaler

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the dataset
print("Loading the Amazon Product Reviews dataset")
df = spark.read.csv("Amazon_Reviews.csv", header=True, inferSchema=True)

# Tokenize the text
print("Tokenizing the text")
tokenizer = Tokenizer(inputCol="Review", outputCol="words")
words = tokenizer.transform(df)

# Apply the `HashingTF` Transformer
print("Applying HashingTF transformation to convert words into features")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(words)

# Apply IDF to rescale the raw TF
print("Applying IDF to rescale the raw features")
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)

# Transform the featurized data
print("Transforming the rescaled featured data")
rescaledData = idfModel.transform(featurizedData)

# StandardScaler
print("StandardScaler normalization of the rescaled features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)
scalerModel = scaler.fit(rescaledData)
scaledData = scalerModel.transform(rescaledData)

# Show the first row of scaled data
scaledData.show(1)

Loading the Amazon Product Reviews dataset
Tokenizing the text
Applying HashingTF transformation to convert words into features
Applying IDF to rescale the raw features
Transforming the rescaled featured data
StandardScaler normalization of the rescaled features
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Rating|               Title|              Review|               words|         rawFeatures|            features|      scaledFeatures|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     1|Stuning even for ...|This sound track ...|[this, sound, tra...|(20,[0,1,3,4,5,6,...|(20,[0,1,3,4,5,6,...|(20,[0,1,3,4,5,6,...|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



# MinMaxScaler

In [10]:
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, MinMaxScaler

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the dataset
print("Loading the Amazon Product Reviews dataset")
df = spark.read.csv("Amazon_Reviews.csv", header=True, inferSchema=True)

# Tokenize the text
print("Tokenizing the text")
tokenizer = Tokenizer(inputCol="Review", outputCol="words")
words = tokenizer.transform(df)

# Apply the `HashingTF` transformer
print("Applying HashingTF transformation to convert words into features")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(words)

# Apply IDF to rescale the raw TF
print("Applying IDF to rescale the raw features")
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)

# Transform the featurized data
print("Transforming the rescaled featured data")
rescaledData = idfModel.transform(featurizedData)

# MinMaxScaler
print("MinMax scaling of the rescaled features")
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(rescaledData)
scaledData = scalerModel.transform(rescaledData)

# Show the scaled data
scaledData.show(1)

Loading the Amazon Product Reviews dataset
Tokenizing the text
Applying HashingTF transformation to convert words into features
Applying IDF to rescale the raw features
Transforming the rescaled featured data
MinMax scaling of the rescaled features
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Rating|               Title|              Review|               words|         rawFeatures|            features|      scaledFeatures|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     1|Stuning even for ...|This sound track ...|[this, sound, tra...|(20,[0,1,3,4,5,6,...|(20,[0,1,3,4,5,6,...|[0.09090909090909...|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



# PCA

In [11]:
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, Tokenizer, PCA

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the Dataset
print("Loading the Amazon Product Reviews dataset")
df = spark.read.csv("Amazon_Reviews.csv", header=True, inferSchema=True)

# Tokenize the text
print("Tokenizing the text")
tokenizer = Tokenizer(inputCol="Review", outputCol="words")
words = tokenizer.transform(df)

# Apply the `HashingTF` Transformer
print("Applying HashingTF transformation to convert words into features")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(words)

# PCA
# The 'k' parameter specifies the number of principal components to retain.
pca = PCA(k=3, inputCol="rawFeatures", outputCol="pcaFeatures")
pca_model = pca.fit(featurizedData)
pca_model_trans = pca_model.transform(featurizedData)
pca_model_trans.select('rawFeatures', 'pcaFeatures').show()

Loading the Amazon Product Reviews dataset
Tokenizing the text
Applying HashingTF transformation to convert words into features
+--------------------+--------------------+
|         rawFeatures|         pcaFeatures|
+--------------------+--------------------+
|(20,[0,1,3,4,5,6,...|[-17.823655459129...|
|(20,[0,1,2,3,4,5,...|[-20.928457501274...|
|(20,[0,2,3,4,5,6,...|[-15.300050768085...|
|(20,[0,1,2,3,4,5,...|[-28.180169415074...|
|(20,[0,1,2,3,4,5,...|[-19.164134814672...|
|(20,[0,1,2,3,4,5,...|[-31.176578722534...|
|(20,[0,1,2,3,4,5,...|[-26.480221753454...|
|(20,[0,1,2,3,4,5,...|[-23.550147802014...|
|(20,[0,1,2,3,4,5,...|[-23.665200962500...|
|(20,[0,1,2,3,4,5,...|[-12.557865066993...|
|(20,[0,1,3,4,6,7,...|[-7.1407035848593...|
|(20,[0,1,2,3,4,5,...|[-19.390719577852...|
|(20,[0,1,2,3,5,6,...|[-14.059596061138...|
|(20,[0,1,2,3,4,5,...|[-21.363443450150...|
|(20,[0,1,2,3,4,5,...|[-16.960758182097...|
|(20,[0,1,2,3,4,5,...|[-10.317168157493...|
|(20,[0,1,2,3,4,5,...|[-10.693039192

24/10/01 19:11:47 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/10/01 19:11:47 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


# OneHotEncoder

In [13]:
# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the Dataset
print("Loading the Amazon Product Reviews dataset")
df = spark.read.csv("Amazon_Reviews.csv", header=True, inferSchema=True)

# Use StringIndexer to convert the "region" column into categorical indices
indexer = StringIndexer(inputCol="Region", outputCol="region_indexed")
indexed = indexer.fit(df).transform(df)

# Use OneHotEncoder to encode the indexed categorical features
encoder = OneHotEncoder(inputCol="region_indexed", outputCol="region_encoded")
encodedData = encoder.fit(indexed).transform(indexed)

# Show the original "region" column along with the encoded "region_encoded" column
encodedData.select("region", "region_encoded").show(10)

Loading the Amazon Product Reviews dataset


Py4JJavaError: An error occurred while calling o697.fit.
: org.apache.spark.SparkException: Input column Region does not exist.
	at org.apache.spark.ml.feature.StringIndexerBase.$anonfun$validateAndTransformSchema$2(StringIndexer.scala:128)
	at scala.collection.TraversableLike.$anonfun$flatMap$1(TraversableLike.scala:293)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.flatMap(TraversableLike.scala:293)
	at scala.collection.TraversableLike.flatMap$(TraversableLike.scala:290)
	at scala.collection.mutable.ArrayOps$ofRef.flatMap(ArrayOps.scala:198)
	at org.apache.spark.ml.feature.StringIndexerBase.validateAndTransformSchema(StringIndexer.scala:123)
	at org.apache.spark.ml.feature.StringIndexerBase.validateAndTransformSchema$(StringIndexer.scala:115)
	at org.apache.spark.ml.feature.StringIndexer.validateAndTransformSchema(StringIndexer.scala:145)
	at org.apache.spark.ml.feature.StringIndexer.transformSchema(StringIndexer.scala:252)
	at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:71)
	at org.apache.spark.ml.feature.StringIndexer.fit(StringIndexer.scala:237)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
