## Connect to Spark standalone cluster

In [None]:
try:
    spark.stop()
except:
    print("No Spark Session")

In [None]:
import pyspark
import pymssql
import pandas as pd
from pyspark import SparkContext, SparkConf, pandas as ps
from pyspark.sql import SparkSession


In [None]:
!/usr/local/spark/sbin/stop-all.sh

In [None]:
!/usr/local/spark/sbin/start-all.sh

In [None]:
!jps

In [None]:
spark = SparkSession.builder \
        .appName("Southbridge Analytics") \
        .master("spark://sparkc:7077") \
        .getOrCreate()

## Connect to SQL Server

In [None]:
server = 'sql'
user = 'sa'
password = 'P@ssw0rd'
database = 'adworks'

conn = pymssql.connect(server, user, password, database)  

sql1 = """
SELECT c.CustomerID, c.CompanyName,COUNT(soh.SalesOrderID) AS OrderCount 
FROM SalesLT.Customer AS c LEFT OUTER JOIN SalesLT.SalesOrderHeader AS soh 
ON c.CustomerID = soh.CustomerID 
GROUP BY c.CustomerID, c.CompanyName 
ORDER BY OrderCount DESC;
"""

sql2 = """
SELECT c.CompanyName, a.AddressLine1, ISNULL(a.AddressLine2, '') AS AddressLine2,
a.City, a.StateProvince, a.PostalCode, a.CountryRegion, oh.SalesOrderID, oh.TotalDue
FROM SalesLT.Customer AS c
JOIN SalesLT.SalesOrderHeader AS oh
ON oh.CustomerID = c.CustomerID
JOIN SalesLT.CustomerAddress AS ca
ON c.CustomerID = ca.CustomerID AND AddressType = 'Main Office'
JOIN SalesLT.Address AS a
ON ca.AddressID = a.AddressID;
"""



## Query SQL and convert to DataFrame

In [None]:
# convert to DF
df1 = pd.read_sql(sql1, conn)
df2 = pd.read_sql(sql2, conn)

spdf1 = spark.createDataFrame(df1)
spdf2 = spark.createDataFrame(df2)

spdf1.show(5)
spdf2.show(5)

## Write DataFrame to HDFS as Parquet

In [None]:
import os
# create dir in hdfs if not already there
os.system('hdfs dfs -mkdir hdfs://localhost:9000/sql-spoke/')

In [None]:
spdf1.write.parquet("hdfs://localhost:9000/sql-spoke/sql1.parquet")
spdf2.write.parquet("hdfs://localhost:9000/sql-spoke/sql2.parquet")

In [None]:
testdf1 = spark.read.parquet("hdfs://localhost:9000/sql-spoke/sql1.parquet")
testdf2 = spark.read.parquet("hdfs://localhost:9000/sql-spoke/sql2.parquet")

In [None]:
testdf1.show(2)
testdf2.show(2)

In [None]:
spark.stop()

In [1]:
import os
import pyspark
import pymssql
import pandas as pd
from pyspark import SparkContext, SparkConf, pandas as ps
from pyspark.sql import SparkSession

try:
    spark.stop()
except:
    print("No Spark Session")

sparkClassPath =  '/usr/local/spark/jars/sqljdbc42.jar'

spark = SparkSession.builder \
        .config("spark.driver.extraClassPath", sparkClassPath) \
        .config("spark.jars", sparkClassPath) \
        .appName("Southbridge Analytics") \
        .master("local") \
        .getOrCreate()





No Spark Session
22/08/10 19:56:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
server = "sql"
database = "adworks"
table = "SalesLT.Customer"
user = "sa"
password  = "P@ssw0rd"
 
#read table data into a spark dataframe
jdbcDF = spark.read.format("jdbc") \
    .option("url", f"jdbc:sqlserver://{server}:1433;databaseName={database};") \
    .option("dbtable", table) \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .load()


In [4]:
jdbcDF.show()

+----------+---------+-----+-----------+----------+----------+------+--------------------+--------------------+--------------------+-------------------+--------------------+------------+--------------------+-------------------+
|CustomerID|NameStyle|Title|  FirstName|MiddleName|  LastName|Suffix|         CompanyName|         SalesPerson|        EmailAddress|              Phone|        PasswordHash|PasswordSalt|             rowguid|       ModifiedDate|
+----------+---------+-----+-----------+----------+----------+------+--------------------+--------------------+--------------------+-------------------+--------------------+------------+--------------------+-------------------+
|         1|    false|  Mr.|    Orlando|        N.|       Gee|  null|        A Bike Store|adventure-works\p...|orlando0@adventur...|       245-555-0173|L/Rlwxzp4w7RWmEgX...|    1KjXYs4=|3F5AE95E-B87D-4AE...|2005-08-01 00:00:00|
|         2|    false|  Mr.|      Keith|      null|    Harris|  null|  Progressive Sport