In [None]:
# Let findspark locate the Spark installation; init() is called before the
# pyspark import below so that the import can succeed.
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Disabled alternative: build the session with the master pinned to local[*].
#spark = SparkSession.builder.master("local[*]").getOrCreate()
# Reuse an existing session if there is one, otherwise create one named 'Basic'.
spark = SparkSession.builder.appName('Basic').getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
# Bare last expression: shows the session object as the cell's output.
spark

In [None]:
import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import LinearRegressionModel
from pyspark.sql.utils import AnalysisException

In [None]:
# Prompt for the dataset location and its format, then load it into `df`
# through the active Spark session with header parsing and schema inference.
try:
    path = input("File path: ")
    filetype = input("File type: ")
    df = spark.read.load(path, format=filetype, inferSchema=True, header=True)
except (ValueError, FileNotFoundError, AnalysisException):
    # Spark reports a missing or unreadable path as AnalysisException rather
    # than FileNotFoundError, so it is caught here to show the friendly hint.
    print("You did not enter valid inputs.")
    # Re-raise so execution stops at this cell. Swallowing the error would
    # leave `df` undefined (or stale from an earlier run) and make the
    # following cells fail, or silently work on the wrong data.
    raise

In [None]:
# Print the column names and the types Spark inferred while loading the file.
df.printSchema()

In [None]:
# Peek at the data. Note: called without an argument, Spark's DataFrame.head()
# returns only the first Row (not several rows as pandas' head() does).
df.head()

In [None]:
# List the column names available for the feature and output-column prompts.
df.columns

In [None]:
# Ask how many feature columns will be entered. input() returns a string;
# it is converted with int() at the point where it is consumed.
num_features = input("Number of features: ")

In [None]:
# Prompt once per feature for a column name (kept in entry order), then build
# the assembler that packs those columns into a single 'features' vector column.
coef_var = [input("Feature name: ") for _ in range(int(num_features))]
assembler = VectorAssembler(outputCol='features', inputCols=coef_var)

In [None]:
# Configure the assembler with handleInvalid='skip' so rows containing nulls
# are dropped instead of raising an error, then add the 'features' column to df.
assembler.setHandleInvalid('skip')
output = assembler.transform(df)

In [None]:
# Name of the column the regression should predict; it is selected from the
# assembled DataFrame and used as the label column below.
output_col = input("Output column: ")

In [None]:
# Keep only what the regression needs: the assembled feature vector and the label column.
final_df = output.select('features', output_col)

In [None]:
# Split into roughly 70% training and 30% test rows. The fixed seed makes the
# split (and therefore the fitted coefficients and reported metrics)
# repeatable when the notebook is re-run on the same data.
train_data, test_data = final_df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Linear regression estimator using the user-chosen label column; the features
# column is left at the estimator's default.
lm = LinearRegression(labelCol=output_col)

In [None]:
# Fit the estimator on the training split; `model` is the fitted regression model.
model = lm.fit(train_data)

In [None]:
# Show the fitted coefficients as a pandas table indexed by the feature names in coef_var.
pd.DataFrame({"Coefficients":model.coefficients}, index=coef_var)

In [None]:
# Evaluate the fitted model on the held-out split; `res` is the summary object
# that the residuals and metrics cells read from.
res = model.evaluate(test_data)

In [None]:
# Display the residuals from the test-set evaluation.
res.residuals.show()

In [None]:
# Keep only the feature vectors from the test split, i.e. drop the label column.
unlabeled_data = test_data.select('features')

In [None]:
# Run the fitted model over the feature-only rows to produce predictions.
predictions = model.transform(unlabeled_data)

In [None]:
# Display the prediction DataFrame produced by the model.
predictions.show()

In [None]:
# Report the test-set metrics from the evaluation summary, one per line.
# Each attribute is read right before it is printed, in the listed order.
for metric_label, attr_name in (
    ("MAE: ", "meanAbsoluteError"),
    ("MSE: ", "meanSquaredError"),
    ("RMSE: ", "rootMeanSquaredError"),
    ("R2: ", "r2"),
    ("Adj R2: ", "r2adj"),
):
    print(metric_label, getattr(res, attr_name))

In [None]:
# Persist the fitted model under the relative path "model". Going through the
# writer with overwrite() lets the notebook be re-run: a plain save() raises
# when the target path already exists from a previous run.
model.write().overwrite().save("model")