### Generalized Linear Regression
##### In contrast to linear regression, where the output is assumed to follow a Gaussian distribution, generalized linear models (GLMs) are specifications of linear models where the response variable Y follows some distribution from the exponential family of distributions. Spark's GeneralizedLinearRegression interface allows for flexible specification of GLMs, which can be used for various types of prediction problems including linear regression, Poisson regression, logistic regression and others. Currently in spark.ml, only a subset of the exponential family distributions is supported. <b>Note:</b> Spark currently supports only up to 4096 features through the GeneralizedLinearRegression interface and will throw an exception if this constraint is exceeded.

<table>
    <tr>
        <th>Family</th>
        <th>Response Type</th>
        <th>Supported Links</th>
    </tr>
    <tr>
        <td>Gaussian</td>
        <td>Continuous</td>
        <td>Identity*, Log, Inverse</td>
    </tr>
    <tr>
        <td>Binomial</td>
        <td>Binary</td>
        <td>Logit*, Probit, CLogLog</td>
    </tr>
    <tr>
        <td>Poisson</td>
        <td>Count</td>
        <td>Log*, Identity, Sqrt</td>
    </tr>
        <tr>
        <td>Gamma</td>
        <td>Continuous</td>
        <td>Inverse*, Identity, Log</td>
    </tr>
        <tr>
        <td>Tweedie</td>
        <td>Zero-inflated continuous</td>
        <td>Power link function</td>
    </tr>
</table>


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import GeneralizedLinearRegression

In [None]:
# Start (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Generalized Linear Regression")
    .getOrCreate()
)

# Load the car sales CSV; the first row is the header and column types are inferred.
data = spark.read.csv("Data/Car_Sales.csv", header=True, inferSchema=True)
data.show()

In [None]:
# Print the schema of the loaded data to see the type of each column.
data.printSchema()

In [None]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType


def OneHotEncoding(df, col_name=""):
    """Append one 0/1 indicator column per distinct value of a column.

    For every distinct value ``v`` found in ``df[col_name]`` a new integer
    column named ``"<col_name>_<v>"`` is added. It holds 1 on rows where the
    source column equals ``v`` and 0 otherwise. The source column is kept.

    Args:
        df: Spark DataFrame to extend.
        col_name: Name of the categorical column to encode.

    Returns:
        The DataFrame with the indicator columns appended.
    """
    # Gather the distinct values on the driver. A plain collect() avoids
    # requiring pandas just to read one column of values.
    distinct_rows = df.select(col_name).distinct().collect()
    distinct_values = [row[0] for row in distinct_rows]

    # Create one indicator column for each gathered value.
    for distinct_value in distinct_values:
        # Bind the current value as a default argument so the UDF can never
        # pick up a later value of the loop variable (late-binding closure).
        function = udf(
            lambda item, target=distinct_value: 1 if item == target else 0,
            IntegerType(),
        )
        # str() keeps the name construction working for non-string values
        # (numbers, nulls), which would otherwise raise TypeError.
        new_column_name = col_name + '_' + str(distinct_value)
        df = df.withColumn(new_column_name, function(col(col_name)))

    return df


In [None]:
# List the column names of the loaded data before any encoding is applied.
data.columns

In [None]:
# Add 0/1 indicator columns for each categorical column.
cols = ['Brand', 'Body', 'Engine Type', 'Registration']

for categorical_col in cols:
    data = OneHotEncoding(data, col_name=categorical_col)

In [None]:
# List the columns again; the indicator columns added by OneHotEncoding should now appear.
data.columns

In [None]:
# Remove the "Model" column; it is neither one-hot encoded nor used as a feature in this notebook.
data = data.drop("Model")

In [None]:
# Cast the Price and EngineV columns to double, then confirm via the schema.
for numeric_col in ("Price", "EngineV"):
    data = data.withColumn(numeric_col, data[numeric_col].cast('double'))
data.printSchema()

In [None]:
# Drop the original categorical columns now that their indicator columns exist.
cols = ['Brand', 'Body', 'Engine Type', 'Registration']
data = data.drop(*cols)

data.columns

In [None]:
# Print the current schema to verify the column types before assembling features.
data.printSchema()

In [None]:
# List the columns currently available for feature assembly.
data.columns

In [None]:
# Assemble the numeric and indicator columns into a single 'features' vector column.
from pyspark.ml.feature import VectorAssembler

# Input columns, in the order they appear in the assembled vector.
feature_columns = [
    'Mileage',
    'EngineV',
    'Year',
    'Brand_Volkswagen',
    'Brand_Mitsubishi',
    'Brand_Audi',
    'Brand_Mercedes-Benz',
    'Brand_Renault',
    'Brand_BMW',
    'Brand_Toyota',
    'Body_van',
    'Body_crossover',
    'Body_other',
    'Body_sedan',
    'Body_hatch',
    'Body_vagon',
    'Engine Type_Diesel',
    'Engine Type_Other',
    'Engine Type_Gas',
    'Engine Type_Petrol',
    'Registration_no',
    'Registration_yes',
]

featureassembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)
finalized_data = featureassembler.transform(data)
finalized_data.columns

In [None]:
# Keep only the assembled feature vector and the label column for training.
selected_columns = ["features", "Price"]
finalized_data = finalized_data.select(*selected_columns)
finalized_data.columns

In [19]:
# Split into 70% training and 30% test data; the fixed seed keeps the split repeatable.
train, test = finalized_data.randomSplit(weights=[0.7, 0.3], seed=102)

In [None]:
# Configure a GLM with Gaussian family and identity link, then fit it on the training split.
glr = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="Price",
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3,
)
model = glr.fit(train)

In [None]:
# Intercept and coefficients of the fitted model
print(f"Coefficients: {model.coefficients!s}")
print(f"Intercept: {model.intercept!s}")

In [None]:
# Summary statistics of the fitted model over the training set.
# The attribute names are case-sensitive: coefficientStandardErrors, tValues
# and pValues (the lower-cased spellings raise AttributeError).
summary = model.summary
print(f"Coefficient Standard Errors: {str(summary.coefficientStandardErrors)}")
print(f"T Values: {str(summary.tValues)}")
print(f"p values: {str(summary.pValues)}")
print(f"Dispersion: {str(summary.dispersion)}")
print(f"Null Deviance: {str(summary.nullDeviance)}")
print(
    f"Residual Degree of Freedom Null: {str(summary.residualDegreeOfFreedomNull)}"
)
print(f"Deviance: {str(summary.deviance)}")
print(f"AIC: {str(summary.aic)}")
print("Deviance Residuals: ")
# residuals() is a method returning a DataFrame, unlike the attributes above.
summary.residuals().show()