# Screencast Code

The following code is the same code used in the "Numeric Features" screencast. Run each code cell to see how it works.

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, VectorAssembler, Normalizer, StandardScaler
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

import re

In [2]:
# Build (or reuse) the SparkSession. This setup step was left out of the screencast.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/12 19:25:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/12 19:25:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/01/12 19:25:33 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


# Read in the Data Set

In [3]:
# File name of the Stack Overflow questions data set (JSON) that is loaded below
stack_overflow_data = 'Train_onetag_small.json'

In [4]:
# Read the JSON file into a Spark DataFrame; every later cell builds on this `df`
df = spark.read.json(stack_overflow_data)

                                                                                

In [5]:
# Show the first record as a Row to check which columns were loaded
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php')

In [6]:
# Preview the first five rows as a pandas DataFrame (renders as a readable table)
df.limit(5).toPandas()

Unnamed: 0,Body,Id,Tags,Title,oneTag
0,<p>I'd like to check if an uploaded file is an...,1,php image-processing file-upload upload mime-t...,How to check if an uploaded file is an image w...,php
1,"<p>In my favorite editor (vim), I regularly us...",2,firefox,How can I prevent firefox from closing when I ...,firefox
2,<p>I am import matlab file and construct a dat...,3,r matlab machine-learning,R Error Invalid type (list) for variable,r
3,"<p>This is probably very simple, but I simply ...",4,c# url encoding,How do I replace special characters in a URL?,c#
4,<pre><code>function modify(.......)\n{\n $mco...,5,php api file-get-contents,How to modify whois contact details?,php


# Tokenization

Tokenization splits strings into separate words. Spark has a [Tokenizer](https://spark.apache.org/docs/latest/ml-features.html#tokenizer) class as well as RegexTokenizer, which allows for more control over the tokenization process.

In [7]:
# Split the Body text into separate words: every non-word character (\W) acts
# as a delimiter, and the resulting tokens are stored in a new "words" column.
regexTokenizer = RegexTokenizer(
    inputCol="Body",
    outputCol="words",
    pattern=r"\W",
)
df = regexTokenizer.transform(df)

# Inspect the first row to confirm the "words" column was added
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [8]:
# Preview the first five rows, now including the tokenized "words" column
df.limit(5).toPandas()

Unnamed: 0,Body,Id,Tags,Title,oneTag,words
0,<p>I'd like to check if an uploaded file is an...,1,php image-processing file-upload upload mime-t...,How to check if an uploaded file is an image w...,php,"[p, i, d, like, to, check, if, an, uploaded, f..."
1,"<p>In my favorite editor (vim), I regularly us...",2,firefox,How can I prevent firefox from closing when I ...,firefox,"[p, in, my, favorite, editor, vim, i, regularl..."
2,<p>I am import matlab file and construct a dat...,3,r matlab machine-learning,R Error Invalid type (list) for variable,r,"[p, i, am, import, matlab, file, and, construc..."
3,"<p>This is probably very simple, but I simply ...",4,c# url encoding,How do I replace special characters in a URL?,c#,"[p, this, is, probably, very, simple, but, i, ..."
4,<pre><code>function modify(.......)\n{\n $mco...,5,php api file-get-contents,How to modify whois contact details?,php,"[pre, code, function, modify, mcontact, file_g..."


In [9]:
# Add a BodyLength column holding the number of tokens in each row's "words" array
body_length = udf(lambda tokens: len(tokens), IntegerType())
df = df.withColumn("BodyLength", body_length(df["words"]))

In [10]:
# Preview the first five rows, now including the BodyLength column
df.limit(5).toPandas()

Traceback (most recent call last):                                  (0 + 1) / 1]
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

Unnamed: 0,Body,Id,Tags,Title,oneTag,words,BodyLength
0,<p>I'd like to check if an uploaded file is an...,1,php image-processing file-upload upload mime-t...,How to check if an uploaded file is an image w...,php,"[p, i, d, like, to, check, if, an, uploaded, f...",83
1,"<p>In my favorite editor (vim), I regularly us...",2,firefox,How can I prevent firefox from closing when I ...,firefox,"[p, in, my, favorite, editor, vim, i, regularl...",71
2,<p>I am import matlab file and construct a dat...,3,r matlab machine-learning,R Error Invalid type (list) for variable,r,"[p, i, am, import, matlab, file, and, construc...",3161
3,"<p>This is probably very simple, but I simply ...",4,c# url encoding,How do I replace special characters in a URL?,c#,"[p, this, is, probably, very, simple, but, i, ...",115
4,<pre><code>function modify(.......)\n{\n $mco...,5,php api file-get-contents,How to modify whois contact details?,php,"[pre, code, function, modify, mcontact, file_g...",148


In [11]:
# Count the number of paragraphs and links in each post body by counting the
# closing tags (</p> and </a>) in the raw HTML.

def _count_closing_tags(html, tag):
    """Return how many times the closing tag ``</tag>`` occurs in ``html``.

    Args:
        html: The HTML text to scan, or None. Spark hands SQL NULL values to a
            Python UDF as None.
        tag: Tag name without brackets, e.g. "p" or "a".

    Returns:
        The number of non-overlapping occurrences of ``</tag>``; 0 when
        ``html`` is None, so a missing body does not fail the whole Spark job.
    """
    if html is None:
        return 0
    # The closing tag is a fixed literal, so a plain substring count gives the
    # same result as len(re.findall(...)) without involving the regex engine.
    return html.count("</{}>".format(tag))


number_of_paragraphs = udf(lambda x: _count_closing_tags(x, "p"), IntegerType())
number_of_links = udf(lambda x: _count_closing_tags(x, "a"), IntegerType())

In [12]:
# Append both tag-count features, each computed from the raw HTML in Body
df = (
    df.withColumn("NumParagraphs", number_of_paragraphs(df["Body"]))
      .withColumn("NumLinks", number_of_links(df["Body"]))
)

In [13]:
# Show the first two records (a list of Row objects) to check the new columns
df.head(2)

Traceback (most recent call last):                                  (0 + 1) / 1]
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [14]:
# Preview the first two rows, now including NumParagraphs and NumLinks
df.limit(2).toPandas()

Unnamed: 0,Body,Id,Tags,Title,oneTag,words,BodyLength,NumParagraphs,NumLinks
0,<p>I'd like to check if an uploaded file is an...,1,php image-processing file-upload upload mime-t...,How to check if an uploaded file is an image w...,php,"[p, i, d, like, to, check, if, an, uploaded, f...",83,2,0
1,"<p>In my favorite editor (vim), I regularly us...",2,firefox,How can I prevent firefox from closing when I ...,firefox,"[p, in, my, favorite, editor, vim, i, regularl...",71,2,0


# VectorAssembler

Combine the body length, number of paragraphs, and number of links columns into a single feature vector.

In [15]:
# Pack the three numeric feature columns into one vector column, NumFeatures
assembler = VectorAssembler(
    inputCols=["BodyLength", "NumParagraphs", "NumLinks"],
    outputCol="NumFeatures",
)
df = assembler.transform(df)

In [16]:
# Show the first record as a Row to check the assembled NumFeatures column
df.head()

Traceback (most recent call last):                                  (0 + 1) / 1]
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [17]:
# Preview the first two rows, now including the NumFeatures vector
df.limit(2).toPandas()

Traceback (most recent call last):                                  (0 + 1) / 1]
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

Unnamed: 0,Body,Id,Tags,Title,oneTag,words,BodyLength,NumParagraphs,NumLinks,NumFeatures
0,<p>I'd like to check if an uploaded file is an...,1,php image-processing file-upload upload mime-t...,How to check if an uploaded file is an image w...,php,"[p, i, d, like, to, check, if, an, uploaded, f...",83,2,0,"[83.0, 2.0, 0.0]"
1,"<p>In my favorite editor (vim), I regularly us...",2,firefox,How can I prevent firefox from closing when I ...,firefox,"[p, in, my, favorite, editor, vim, i, regularl...",71,2,0,"[71.0, 2.0, 0.0]"


# Normalize the Vectors

In [18]:
# Rescale each row's NumFeatures vector to unit length. p=2.0 is Normalizer's
# default (Euclidean norm) and is spelled out here only for readability.
scaler = Normalizer(
    p=2.0,
    inputCol="NumFeatures",
    outputCol="ScaledNumFeatures",
)
df = scaler.transform(df)

In [19]:
# Show the first two records (a list of Row objects) to check ScaledNumFeatures
df.head(2)

Traceback (most recent call last):                                  (0 + 1) / 1]
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [20]:
# Preview the first two rows, now including the normalized ScaledNumFeatures vector
df.limit(2).toPandas()

Traceback (most recent call last):                                  (0 + 1) / 1]
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

Unnamed: 0,Body,Id,Tags,Title,oneTag,words,BodyLength,NumParagraphs,NumLinks,NumFeatures,ScaledNumFeatures
0,<p>I'd like to check if an uploaded file is an...,1,php image-processing file-upload upload mime-t...,How to check if an uploaded file is an image w...,php,"[p, i, d, like, to, check, if, an, uploaded, f...",83,2,0,"[83.0, 2.0, 0.0]","[0.9997098084674824, 0.02408939297512006, 0.0]"
1,"<p>In my favorite editor (vim), I regularly us...",2,firefox,How can I prevent firefox from closing when I ...,firefox,"[p, in, my, favorite, editor, vim, i, regularl...",71,2,0,"[71.0, 2.0, 0.0]","[0.999603489278623, 0.028157844768411915, 0.0]"


# Scale the Vectors

In [21]:
# Fit a StandardScaler on NumFeatures, then apply the fitted model.
# withStd=True scales each feature by its standard deviation; withMean is left
# at its default.
scaler2 = StandardScaler(
    inputCol="NumFeatures",
    outputCol="ScaledNumFeatures2",
    withStd=True,
)
scalerModel = scaler2.fit(df)  # estimator step: computes the statistics from df
df = scalerModel.transform(df)  # transformer step: adds ScaledNumFeatures2

                                                                                

In [22]:
# Show the first two records (a list of Row objects) to check ScaledNumFeatures2
df.head(2)

Traceback (most recent call last):                                  (0 + 1) / 1]
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/lib/python3.9/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [23]:
# Preview the first two rows, now including the standardized ScaledNumFeatures2 vector
df.limit(2).toPandas()

                                                                                

Unnamed: 0,Body,Id,Tags,Title,oneTag,words,BodyLength,NumParagraphs,NumLinks,NumFeatures,ScaledNumFeatures,ScaledNumFeatures2
0,<p>I'd like to check if an uploaded file is an...,1,php image-processing file-upload upload mime-t...,How to check if an uploaded file is an image w...,php,"[p, i, d, like, to, check, if, an, uploaded, f...",83,2,0,"[83.0, 2.0, 0.0]","[0.9997098084674824, 0.02408939297512006, 0.0]","[0.4324585815323706, 0.703697372634422, 0.0]"
1,"<p>In my favorite editor (vim), I regularly us...",2,firefox,How can I prevent firefox from closing when I ...,firefox,"[p, in, my, favorite, editor, vim, i, regularl...",71,2,0,"[71.0, 2.0, 0.0]","[0.999603489278623, 0.028157844768411915, 0.0]","[0.36993444926263025, 0.703697372634422, 0.0]"
