# Libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import GeneralizedLinearRegression

#from pyspark import SparkConf, SparkContext
#sc = SparkContext(conf=SparkConf().set("spark.files.overwrite", "true"))

In [2]:
# Obtain the notebook-wide SparkSession: getOrCreate() hands back an already
# running session if there is one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("GeneralizedLinearRegressionExample")
    .getOrCreate()
)

22/09/23 19:29:58 WARN Utils: Your hostname, cesar-GL62M-7RDX resolves to a loopback address: 127.0.1.1; using 10.22.162.210 instead (on interface wlp2s0)
22/09/23 19:29:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/09/23 19:29:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# User-Defined functions

# Data

In [3]:
def spark_from_url(url, num_features = False):
    """Fetch a LIBSVM file through Spark and load it with the libsvm reader.

    The file at ``url`` is registered with ``spark.sparkContext.addFile`` and
    then read back from the local path reported by ``SparkFiles.get``.

    Args:
        url: Location of the LIBSVM file. The text after the last ``/`` is
            used as the file name to look up once the file has been added.
        num_features: Value for the reader's ``numFeatures`` option. Any
            falsy value (the default ``False``, ``None`` or ``0``) leaves the
            option unset.

    Returns:
        The dataset produced by ``spark.read.format("libsvm")...load(...)``.
    """
    # Imported locally, as in the original cell, so the function only relies
    # on the notebook-level `spark` session.
    from pyspark import SparkFiles

    file_name = url[url.rfind('/')+1:]
    spark.sparkContext.addFile(url)

    reader = spark.read.format("libsvm")
    # Truthiness check instead of `!= False`: `None` compares unequal to
    # False, so it used to be forwarded to the reader as a feature count.
    if num_features:
        reader = reader.option("numFeatures", num_features)

    return reader.load("file://"+SparkFiles.get(file_name))

# Test:
# Disabled manual smoke test for spark_from_url: the `if False` guard means
# the body never executes. NOTE(review): `url` is first assigned in a later
# cell, so switching the guard on here would raise NameError on a fresh kernel.
if False:
    dataset = spark_from_url(url, 10)

# Linear regression examples

In [4]:
# Sample regression data hosted in the Apache Spark repository; it is read
# with the libsvm reader by spark_from_url.
url = "https://raw.githubusercontent.com/apache/spark/master/data/mllib/sample_linear_regression_data.txt"

# Download and load the file, declaring 10 features for the reader.
dataset = spark_from_url(url, num_features=10)

In [5]:
# Estimator configuration: gaussian family with the identity link, at most
# 10 iterations and a regularization parameter of 0.3.
glr = GeneralizedLinearRegression(
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3,
)

# Fit the estimator on the loaded dataset.
model = glr.fit(dataset)

# Fitted parameters. `!s` forces str(), matching plain string concatenation.
print(f"Coefficients: {model.coefficients!s}\n")
print(f"Intercept: {model.intercept!s}\n")

# Diagnostics taken from the model's training summary.
summary = model.summary
print(f"Coefficient Standard Errors: {summary.coefficientStandardErrors!s}\n")
print(f"T Values: {summary.tValues!s}\n")
print(f"P Values: {summary.pValues!s}\n")
print(f"Dispersion: {summary.dispersion!s}\n")
print(f"Null Deviance: {summary.nullDeviance!s}\n")
print(f"Residual Degree Of Freedom Null: {summary.residualDegreeOfFreedomNull!s}\n")
print(f"Deviance: {summary.deviance!s}\n")
print(f"Residual Degree Of Freedom: {summary.residualDegreeOfFreedom!s}\n")
print(f"AIC: {summary.aic!s}\n")
print("Deviance Residuals: ")
summary.residuals().show()

# Shut the session down. Anything that needs `spark` afterwards (including a
# re-run of this cell) requires the session to be created again first.
spark.stop()

                                                                                

Coefficients: [0.010541828081257209,0.800325310056095,-0.7845165541420372,2.3679887171421914,0.5010002089857577,1.1222351159753023,-0.2926824398623297,-0.49837174323213046,-0.6035797180675657,0.6725550067187459]

Intercept: 0.14592176145232047

Coefficient Standard Errors: [0.7950428434287478, 0.8049713176546897, 0.7975916824772489, 0.831264924765992, 0.7945436200517938, 0.8118992572197593, 0.7919506385542777, 0.7973378214726764, 0.8300714999626418, 0.7771333489686802, 0.463930109648428]

T Values: [0.013259446542269234, 0.9942283563442595, -0.9836067393599173, 2.8486570846337584, 0.6305509179635714, 1.3822344410293548, -0.36957156874906694, -0.6250446546128239, -0.7271418403049983, 0.8654306337661118, 0.314533931765933]

P Values: [0.989426199114056, 0.32060241580811044, 0.3257943227369877, 0.004575078538306521, 0.5286281628105467, 0.16752945248679119, 0.7118614002322872, 0.5322327097421431, 0.467486325282384, 0.3872259825794293, 0.753249430501097]

Dispersion: 105.60988356821714

Nul



+-------------------+
|  devianceResiduals|
+-------------------+
|-10.974359174246889|
| 0.8872320138420557|
| -4.596541837478908|
|-20.411667435019638|
|-10.270419345342642|
|-6.0156058956799905|
|-10.663939415849267|
| 2.1153960525024713|
|  3.980713237913768|
|-17.225218272069533|
| -4.611647633532146|
|  6.417666940769855|
| 11.407137945300537|
| -20.70176540467664|
| -2.683748540510967|
|-16.755494794232536|
|  8.154668342638725|
|-1.4355057987358848|
|  -0.64350586881857|
|-1.1380258931683198|
+-------------------+
only showing top 20 rows



# Sources
* [Test Data](https://github.com/apache/spark/blob/master/data/mllib/sample_linear_regression_data.txt)
* [Generalized Linear Regression Example](https://github.com/apache/spark/blob/master/examples/src/main/python/ml/generalized_linear_regression_example.py)