# Pyspark

In [None]:
# SparkContext is the main entry point for creating RDDs
# SparkSession provides a single point of entry for working with Spark DataFrames
# In the PySpark shell the SparkSession is available as `spark`

In [None]:
# Spark version of the running SparkContext
sc.version

In [None]:
# Python version used by the SparkContext
sc.pythonVer

In [None]:
# Master URL that the SparkContext is connected to
sc.master

In [None]:
# Loading data in PySpark
# parallelize() turns an in-memory Python collection into an RDD
rdd = sc.parallelize([1, 2, 3, 4, 5])
# textFile() reads a text file into an RDD with one element per line
rdd2 = sc.textFile("test.txt")

# RDD 

In [None]:
# Report three attributes of the shell's SparkContext: version, Python version, master

# Print the Spark version of the SparkContext
print("The version of Spark Context in the PySpark shell is", sc.version)

# Print the Python version used by the SparkContext
print("The Python version of Spark Context in the PySpark shell is", sc.pythonVer)

# Print the master the SparkContext is connected to
print("The master of Spark Context in the PySpark shell is", sc.master)

+ map() function takes a function and a list and applies that function to each item; in Python 3 it returns an iterator, so wrap it in list() to get a new list
+ syntax of map - map(function, list)
+ filter() function takes a function and a list and keeps only the items for which the function evaluates as true; in Python 3 it also returns an iterator
+ syntax of filter - filter(function, list)
+ lambda function syntax - lambda <arguments>: <expression>
+ lambda function with map - map(lambda <argument>: <expression>, list)
+ lambda function with filter - filter(lambda <argument>: <expression>, list)

In [None]:
# Print my_list in the console
# NOTE(review): my_list is not defined in the visible notes - presumably preloaded by the exercise
print("Input list is", my_list)

# Square all numbers in my_list; list() materialises the iterator returned by map()
squared_list_lambda = list(map(lambda x: x**2, my_list))

# Print the result of the map function
print("The squared numbers are", squared_list_lambda)

In [None]:
# Print my_list2 in the console
# NOTE(review): my_list2 is not defined in the visible notes - presumably preloaded by the exercise
print("Input list is:", my_list2)

# Keep only the numbers divisible by 10; list() materialises the iterator returned by filter()
filtered_list = list(filter(lambda x: (x%10 == 0), my_list2))

# Print the numbers divisible by 10
print("Numbers divisible by 10 are:", filtered_list)

In [None]:
# Create an RDD from a Python list and inspect its type
numRDD = sc.parallelize([1,2,3,4])
type(numRDD) # Returns pyspark RDD

In [None]:
# Check the number of partitions in fileRDD
# NOTE(review): fileRDD and file_path are not defined in the visible notes - presumably preloaded
print("Number of partitions in fileRDD is", fileRDD.getNumPartitions())

# Create fileRDD_part from file_path, asking for a minimum of 5 partitions
fileRDD_part = sc.textFile(file_path, minPartitions = 5) # first argument is the file path

# Check the number of partitions in fileRDD_part
print("Number of partitions in fileRDD_part is", fileRDD_part.getNumPartitions())

In [None]:
# Transformations - map, filter, flatMap, union
# flatMap can return multiple values for each element of the original RDD,
# for example the individual words obtained by splitting each string on spaces
RDD = sc.parallelize(["hello world", "How are you"])
RDD_flatmap = RDD.flatMap(lambda x:x.split(" "))

In [None]:
# filter + union example on a log file:
# keep the lines that contain "error" (respectively "warnings") as a whole
# whitespace-separated token, then merge the two selections with union()
inputRDD = sc.textFile("logs.txt")
errorRDD = inputRDD.filter(lambda x:"error" in x.split())
warningsRDD = inputRDD.filter(lambda x:"warnings" in x.split())
combinedRDD = errorRDD.union(warningsRDD)

In [None]:
# Actions - collect, take, first, count
# collect - return all the elements of the dataset to the driver as a list
# NOTE(review): rdd_map is not defined in the visible notes - presumably the result of an earlier map()
rdd_map.collect()

In [None]:
# Filter the fileRDD to select lines that contain the substring 'Spark'
fileRDD_filter = fileRDD.filter(lambda line: 'Spark' in line)

# How many lines of fileRDD contain the keyword?
print("The total number of lines with the keyword Spark is", fileRDD_filter.count())

# Print the first four lines of fileRDD_filter
for line in fileRDD_filter.take(4):
	print(line)

In [None]:
# Pair RDDs in Pyspark
+ Each row is a key and maps to one or more values
+ pair RDD is a special kind of data structure to work with this kind of datasets
+ key is the identifier and value is data

In [None]:
# Pair RDD built directly from a list of (key, value) tuples
my_tuple = [('Sam', 23), ('Mary', 34), ('Peter', 25)]
pairRDD_tuple = sc.parallelize(my_tuple)

# Pair RDD built from a regular RDD of "name age" strings:
# split each string once per field and emit a (name, age) tuple.
# The age stays a string here because it comes straight from split().
my_list = ['sam 23', 'Mary 34', 'Peter 25']
regular_rdd = sc.parallelize(my_list)
pairRDD_RDD = regular_rdd.map(lambda s: (s.split(' ')[0], s.split(' ')[1]))

In [None]:
+ Pass functions to pairrdd which works on tuples rather than on individual elements
+ reduceByKey(func) - combine values with the same key
+ groupByKey - Group values with the same key
+ sortByKey - Sort the pair RDD by its keys
+ join - Join two pair RDDs based on their key

In [None]:
# Build a pair RDD from (key, value) tuples
Rdd = sc.parallelize([(1,2),(3,4),(3,6),(4,5)])

# Add together the values that share the same key
Rdd_Reduced = Rdd.reduceByKey(lambda left, right: left + right)

# Bring the reduced pairs back to the driver and report each one
for key, total in Rdd_Reduced.collect():
    print("Key {} has {} Counts".format(key, total))

In [None]:
# Sort the reduced pair RDD by key in descending order
Rdd_Reduced_Sort = Rdd_Reduced.sortByKey(ascending=False)

In [None]:
# saveAsTextFile('tempFile') # saves RDD into a text file inside a directory with each partition as a separate file

In [None]:
# PairRDD actions - these return their result to the driver, so use them only on small datasets
# countByKey() - count the number of elements for each key (it takes no arguments)
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
for key, val in rdd.countByKey().items():
    print(key, val)

# collectAsMap() - return the key-value pairs in the RDD as a dictionary
sc.parallelize([(1, 2), (3, 4)]).collectAsMap()

In [None]:
Create a base RDD and transform it
The volume of unstructured data (log lines, images, binary files) in existence is growing dramatically, and PySpark is an excellent framework for analyzing this type of data through RDDs. In this 3 part exercise, you will write code that calculates the most common words from Complete Works of William Shakespeare.

Here are the brief steps for writing the word counting program:

Create a base RDD from Complete_Shakespeare.txt file.
Use RDD transformation to create a long list of words from each element of the base RDD.
Remove stop words from your data.
Create pair RDD where each element is a pair tuple of ('w', 1)
Group the elements of the pair RDD by key (word) and add up their values.
Swap the keys (word) and values (counts) so that keys is count and value is the word.
Finally, sort the RDD by descending order and print the 10 most frequent words and their frequencies.
In this first exercise, you'll create a base RDD from Complete_Shakespeare.txt file and transform it to create a long list of words.

Remember, a SparkContext sc is already available in your workspace. A file_path variable (which is the path to the Complete_Shakespeare.txt file) is also loaded for you.

In [None]:
# Create a baseRDD from the file path (one element per line of the file)
baseRDD = sc.textFile(file_path)

# Split the lines of baseRDD on whitespace and flatten them into a single RDD of words
splitRDD = baseRDD.flatMap(lambda x: x.split())

# Count the total number of words
print("Total number of words in splitRDD:", splitRDD.count())

In [None]:
# Drop every word whose lower-cased form appears in stop_words
# (the words that remain keep their original case)
splitRDD_no_stop = splitRDD.filter(lambda x: x.lower() not in stop_words)

# Create a tuple of the word and 1
splitRDD_no_stop_words = splitRDD_no_stop.map(lambda w: (w, 1))

# Sum the 1s per word to get the number of occurrences of each word
resultRDD = splitRDD_no_stop_words.reduceByKey(lambda x, y: x + y)

In [None]:
# Peek at 10 (word, count) pairs before any sorting is applied
for pair in resultRDD.take(10):
    print(pair)

# Turn (word, count) into (count, word) so that the count becomes the key
resultRDD_swap = resultRDD.map(lambda pair: (pair[1], pair[0]))

# Order by count, largest first
resultRDD_swap_sort = resultRDD_swap.sortByKey(ascending=False)

# Report the 10 most frequent words together with their counts
for count, word in resultRDD_swap_sort.take(10):
    print("{} has {} counts".format(word, count))

# DataFrames

In [None]:
# DataFrames can be created
# + from existing RDDs, using SparkSession's createDataFrame() method
# + from various data sources, using SparkSession's read attribute
# The header / inferSchema options belong to the CSV reader; the JSON and
# plain-text readers do not take them, and the text reader is named text().
df_json = spark.read.json('people.json')
df_txt = spark.read.text('people.txt')

In [None]:
# Creating a DataFrame from an RDD
# Create a list of (name, age) tuples
sample_list = [('Mona',20), ('Jennifer', 34), ('John',20), ('Jim',26)]

# Create an RDD from the list
rdd = sc.parallelize(sample_list)

# Create a PySpark DataFrame, naming the two tuple fields 'Name' and 'Age'
names_df = spark.createDataFrame(rdd, schema=['Name', 'Age'])

# Check the type of names_df
print("The type of names_df is", type(names_df))

In [None]:
# Create a DataFrame from the CSV file at file_path;
# header=True and inferSchema=True are passed to the CSV reader
people_df = spark.read.csv(file_path, header=True, inferSchema=True)

# Check the type of people_df
print("The type of people_df is", type(people_df))

In [None]:
# DataFrame operations
+ Transformations - select, filter, groupby, orderby, dropDuplicates, withColumnRenamed()
+ Actions - printSchema, head, show, count, columns and describe

In [None]:
# Print the first 10 rows of people_df
people_df.show(10)

# Count the number of rows
print("There are {} rows in the people_df DataFrame.".format(people_df.count()))

# Count the number of columns and list their names
print("There are {} columns in the people_df DataFrame and their names are {}".format(len(people_df.columns), people_df.columns))

In [None]:
# Select the 'name', 'sex' and 'date of birth' columns
people_df_sub = people_df.select('name', 'sex', 'date of birth')

# Print the first 10 rows of people_df_sub
people_df_sub.show(10)

# Remove duplicate rows from people_df_sub
people_df_sub_nodup = people_df_sub.dropDuplicates()

# Compare the row counts before and after removing duplicates
print("There were {} rows before removing duplicates, and {} rows after removing duplicates".format(people_df_sub.count(), people_df_sub_nodup.count()))

In [None]:
# Rows of people_df whose 'sex' column equals "female"
people_df_female = people_df.filter(people_df["sex"] == "female")

# Rows of people_df whose 'sex' column equals "male"
people_df_male = people_df.filter(people_df["sex"] == "male")

# Report how many rows each subset contains
print(
    "There are {} rows in the people_df_female DataFrame and {} rows in the people_df_male DataFrame".format(
        people_df_female.count(),
        people_df_male.count(),
    )
)

In [None]:
# The sql() method takes a SQL statement as its argument and returns the result as a DataFrame
# Before querying, register the DataFrame as a temporary view named "table1"
# NOTE(review): df is not defined in the visible notes - substitute an existing DataFrame
df.createOrReplaceTempView("table1")

#### Data Visualization in pyspark

In [None]:
+ pyspark_dist_explore - hist, distplot, pandas_histogram
+ toPandas() - To convert into pandas dataframe
+ HandySpark - New library (Explore this option)

In [None]:
# Check the column names of names_df
print("The column names of names_df are", names_df.columns)

# Convert the Spark DataFrame to a pandas DataFrame
df_pandas = names_df.toPandas()

# Create a horizontal bar plot of Age per Name
# NOTE(review): plt is not imported in the visible notes - presumably matplotlib.pyplot, preloaded
df_pandas.plot(kind='barh', x='Name', y='Age', colormap='winter_r')
plt.show()

#### MLlib - Recommendations

In [None]:
# pyspark.mllib is the RDD-based machine learning API
# pyspark.ml is the DataFrame-based machine learning API
# Imports for recommendation, classification and clustering
from pyspark.mllib.recommendation import ALS
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.clustering import KMeans

In [None]:
# Rating is a wrapper around the tuple (user, product, rating)
from pyspark.mllib.recommendation import Rating
r = Rating(user = 1, product = 2, rating = 5.0)
# The three fields can be read by position, like a tuple
(r[0],r[1],r[2])

In [None]:
# Train/test split: randomSplit() takes a list of weights and returns one RDD per weight
data = sc.parallelize([1,2,3,4,5,6,7,8,9,10])
training, test = data.randomSplit([0.6,0.4])
# Inspect which elements ended up in each split
training.collect()
test.collect()

In [None]:
# The ALS algorithm provides collaborative filtering
# Signature: ALS.train(ratings, rank, iterations)
#   ratings    - RDD of Rating objects (or (user, product, rating) tuples)
#   rank       - number of latent features
#   iterations - number of ALS iterations to run

In [None]:
# Read the ratings file into an RDD of text lines
data = sc.textFile(file_path)

# Break each comma-separated line into its fields
ratings = data.map(lambda row: row.split(','))

# Convert the first three fields into Rating(int, int, float) objects
ratings_final = ratings.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2]))
)

# Randomly split the ratings using weights 0.8 (training) and 0.2 (test)
training_data, test_data = ratings_final.randomSplit([0.8, 0.2])

In [None]:
# Train the ALS model on the training data
model = ALS.train(training_data, rank=10, iterations=10)

# Drop the rating: keep only the first two fields of each test record
testdata_no_rating = test_data.map(lambda p: (p[0], p[1]))

# Predict ratings for the pairs in the test data
predictions = model.predictAll(testdata_no_rating)

# Show the first two predictions
predictions.take(2)

In [None]:
# Key the actual ratings by their first two fields: ((r[0], r[1]), r[2])
# (built from ratings_final, i.e. all ratings, not only the test split)
rates = ratings_final.map(lambda r: ((r[0], r[1]), r[2]))

# Key the predictions the same way
preds = predictions.map(lambda r: ((r[0], r[1]), r[2]))

# Join on the key; only keys present in both RDDs survive,
# giving (key, (actual, predicted)) records
rates_and_preds = rates.join(preds)

# Mean of the squared difference between actual and predicted rating
MSE = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error of the model for the test data = {:.2f}".format(MSE))

#### Mllib - Logistic Regression

In [None]:
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

# Two types of vectors - dense and sparse
denseVec = Vectors.dense([1.0, 2.0, 3.0])
# Sparse vector of size 4 with non-zero entries at indices 1 and 3
sparseVec = Vectors.sparse(4, {1: 1.0, 3: 5.5})

# A LabeledPoint wraps a label (first argument) together with its input features
positive = LabeledPoint(1.0, [1.0, 0.0, 3.0])

# HashingTF maps a sequence of terms to a term-frequency vector
# with the given number of features (10000 here)
sentence = "hello hello world"
words = sentence.split()
tf = HashingTF(10000)
tf.transform(words)

In [None]:
# Load the two datasets into RDDs of text lines
spam_rdd = sc.textFile(file_path_spam)
non_spam_rdd = sc.textFile(file_path_non_spam)

# Split every message on single spaces and flatten the result into RDDs of words
spam_words = spam_rdd.flatMap(lambda email: email.split(' '))
non_spam_words = non_spam_rdd.flatMap(lambda email: email.split(' '))

# Print the first element of each word RDD
print("The first element in spam_words is", spam_words.first())
print("The first element in non_spam_words is", non_spam_words.first())

In [None]:
from pyspark.mllib.regression import LabeledPoint

# Create a HashingTF instance with 200 features
tf = HashingTF(numFeatures=200)

# Hash the elements of each RDD into 200-dimensional feature vectors
spam_features = tf.transform(spam_words)
non_spam_features = tf.transform(non_spam_words)

# Label the features: 1 for spam, 0 for non-spam
spam_samples = spam_features.map(lambda features: LabeledPoint(1, features))
non_spam_samples = non_spam_features.map(lambda features: LabeledPoint(0, features))

# Combine the two datasets into one RDD of LabeledPoints.
# union() concatenates the RDDs; join() is only for (key, value) pair RDDs
# and would not produce a training set here.
samples = spam_samples.union(non_spam_samples)

In [None]:
# Randomly split the samples using weights 0.8 (training) and 0.2 (testing)
train_samples,test_samples = samples.randomSplit([0.8, 0.2])

# Train the logistic regression model on the training split
# NOTE(review): needs LogisticRegressionWithLBFGS from pyspark.mllib.classification in scope
model = LogisticRegressionWithLBFGS.train(train_samples)

# Predict a label for the features of every test sample
predictions = model.predict(test_samples.map(lambda x: x.features))

# Pair each original label with its predicted label
labels_and_preds = test_samples.map(lambda x: x.label).zip(predictions)

# Accuracy = share of test samples whose predicted label equals the original label
accuracy = labels_and_preds.filter(lambda x: x[0] == x[1]).count() / float(test_samples.count())
print("Model accuracy : {:.2f}".format(accuracy))

#### Mllib - Clustering

In [None]:
+ K-means, Gaussian Mixture, Power iteration clustering (PIC)
+ Bisecting K-means, Streaming K-means

In [None]:
from pyspark.mllib.clustering import KMeans

# Train a K-means model with 2 clusters, running at most 10 iterations
# NOTE(review): RDD must hold numeric feature vectors at this point
model = KMeans.train(RDD, k=2, maxIterations=10)
# Coordinates of the learned cluster centers
model.clusterCenters

In [None]:
# Evaluate
from math import sqrt


def error(point):
    """Return the Euclidean distance from `point` to its assigned cluster center.

    Reads the module-level `model`: `model.predict(point)` selects the cluster
    index, and `model.centers` supplies that cluster's center.
    """
    nearest_center = model.centers[model.predict(point)]
    offsets = point - nearest_center
    return sqrt(sum(delta ** 2 for delta in offsets))

In [None]:
# Apply error() to every point of RDD and add the results together.
# error() returns the square root of the summed squares, so the total is a sum of
# distances; the printed label is kept as written.
wssse = RDD.map(error).reduce(lambda total, err: total + err)
print("within set sum of squared error = " + str(wssse))

In [None]:
# Load the dataset into an RDD of text lines
clusterRDD = sc.textFile(file_path)

# Split each line on the tab character
rdd_split = clusterRDD.map(lambda x: x.split('\t'))

# Convert the first two fields of every line into a list of two integers
rdd_split_int = rdd_split.map(lambda x: [int(x[0]), int(x[1])])

# Count the number of rows in the RDD
print("There are {} rows in the rdd_split_int dataset".format(rdd_split_int.count()))

In [None]:
# Train one model per cluster count from 13 to 16 and report the summed error() of each.
# error() reads the module-level `model`, so `model` must be rebound before
# each evaluation.
for clst in range(13, 17):
    model = KMeans.train(rdd_split_int, clst, seed=1)
    WSSSE = rdd_split_int.map(lambda point: error(point)).reduce(lambda x, y: x + y)
    print("The cluster {} has Within Set Sum of Squared Error {}".format(clst, WSSSE))

# Train the model again with the chosen k (15)
model = KMeans.train(rdd_split_int, k=15, seed=1)

# Get the cluster centers of the final model
cluster_centers = model.clusterCenters

In [None]:
# Convert the rdd_split_int RDD into a Spark DataFrame with columns "col1" and "col2"
rdd_split_int_df = spark.createDataFrame(rdd_split_int, schema=["col1", "col2"])

# Convert the Spark DataFrame into a pandas DataFrame
rdd_split_int_df_pandas = rdd_split_int_df.toPandas()

# Convert the "cluster_centers" generated earlier into a pandas DataFrame
# NOTE(review): pd and plt are not imported in the visible notes - presumably pandas and matplotlib.pyplot, preloaded
cluster_centers_pandas = pd.DataFrame(cluster_centers, columns=["col1", "col2"])

# Overlaid scatter plot: data points first, then the cluster centers as red crosses
plt.scatter(rdd_split_int_df_pandas["col1"], rdd_split_int_df_pandas["col2"])
plt.scatter(cluster_centers_pandas["col1"], cluster_centers_pandas["col2"], color="red", marker="x")
plt.show()