# **MACHINE LEARNING MODEL**

__Machine learning model building experiment__

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('data_processing')
    .getOrCreate()
)

In [5]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [6]:
# Load the advertising CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    "/FileStore/tables/Advertising.csv",
    header=True,
    inferSchema=True,
)

In [7]:
# Count the rows in the raw advertising data (the original author noted 200 rows).
df.count()

In [8]:
# Number of columns in the raw DataFrame (the original author noted 5).
len(df.columns)

In [9]:
# Preview the first 10 rows of the raw data.
df.show(10)

In [10]:
# Keep only TV, Radio, Newspaper and Sales. Per the original note, the column
# left out is just a running row counter (presumably an index — confirm).
df2 = df.select(["TV", "Radio", "Newspaper", "Sales"])

In [11]:
# Preview the first 10 rows after narrowing to the four selected columns.
df2.show(10)

In [12]:
# Print the summary-statistics table for df2.
df2.summary().show()

In [13]:
# Column selection: show only the TV and Radio columns.
df2.select("TV", "Radio").show()

In [14]:
# Average TV value across the rows whose Sales exceed 10.
# The aggregated DataFrame is stored first and displayed afterwards: show()
# only prints and returns None, so chaining it onto the assignment would
# leave the variable holding None instead of the result.
avg_tv_df = df2.filter(df2["Sales"] > 10).agg(F.avg(df2["TV"]))
avg_tv_df.show()

In [15]:
# Rows where Sales exceeds 15 and TV exceeds 50, written as one combined predicate.
df2.filter((df2["Sales"] > 15) & (df2["TV"] > 50)).show()

In [16]:
# Preview a derived column: HighSales is "Yes" when Sales exceeds 15, otherwise "No".
# pyspark.sql.functions is imported under the upper-case alias `F`, so the
# helpers must be called as F.when / F.col. The result is only displayed here
# and not stored, because show() returns None.
df2.withColumn('HighSales', F.when(F.col('Sales') > 15, "Yes").otherwise("No")).show()

In [17]:
# Keep the frame with the derived HighSales flag ("Yes" when Sales > 15,
# otherwise "No") for the group-by cells that follow.
# pyspark.sql.functions is imported under the upper-case alias `F`.
df3 = df2.withColumn('HighSales', F.when(F.col('Sales') > 15, "Yes").otherwise("No"))
df3.show()

In [18]:
# Count how many rows fall into each HighSales bucket.
(
    df3
    .groupBy("HighSales")
    .count()
    .show()
)

In [19]:
# Frequency of each distinct Sales value, most common first.
# The column is addressed directly instead of scanning every column name for
# a match, and the aggregation runs on df3 — the same frame whose columns are
# checked — rather than on the raw df.
target_column = 'Sales'
if target_column in df3.columns:
    print("Aggregation for {}".format(target_column))
    (
        df3.groupBy(target_column)
        .count()
        .orderBy('count', ascending=False)
        .show(truncate=False)
    )

In [20]:
# Mean of TV within each HighSales group.
(
    df3
    .groupBy('HighSales')
    .agg(F.mean("TV"))
    .show()
)

#STATS AND ML DEMO

In [22]:
# Request a session named 'basic_stats' for the statistics / ML part.
# NOTE(review): a session was already created earlier in this notebook, so
# getOrCreate() presumably hands back that same session and the new app name
# may not take effect — confirm.
spark=SparkSession.builder.appName('basic_stats').getOrCreate()

In [23]:
from pyspark.ml.feature import VectorAssembler
# VectorAssembler builds the single vector column that the correlation step needs

In [24]:
# Pack every column of df2 into one vector column named "features";
# the correlation step below is run on that vector column.
assembler = VectorAssembler(
    inputCols=df2.columns,
    outputCol="features",
)

In [25]:
# Run the assembler over df2 to get the vector representation used for the
# correlation step, then display the result.
df_new=assembler.transform(df2)
df_new.show()

In [26]:
# Pearson correlation matrix computed over the assembled 'features' vector.
from pyspark.ml.stat import Correlation
pearson_corr = Correlation.corr(df_new, 'features', method='pearson')
pearson_corr.show(n=2, truncate=False)

In [27]:
# IMPORT FOR LINEAR REGRESSION!
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [28]:
# List the current column names of df2.
df2.columns

In [29]:
# Rename the prediction target from "Sales" to "label".
# This rebinds df2, so earlier cells that reference "Sales" on df2 will no
# longer work if they are re-run after this point.
df2 = df2.withColumnRenamed("Sales", "label")
df2.show()

In [30]:
# Assemble ONLY the predictor columns into the feature vector.
# The target ('label') must stay out of the features; otherwise the model is
# trained on the very value it is supposed to predict (target leakage) and
# the reported fit is meaninglessly perfect.
vec_assmebler = VectorAssembler(
    inputCols=['TV', 'Radio', 'Newspaper'],
    outputCol='features',
)

In [31]:
# Run the assembler over df2; the result carries the assembled vector column
# alongside the original columns.
features_df=vec_assmebler.transform(df2)

In [32]:
# Inspect the schema of the assembled DataFrame.
features_df.printSchema()

In [33]:
# Show just the model inputs and the target side by side.
features_df.select('features', 'label').show()

In [34]:
# Split the data for prediction: roughly 75% train / 25% test.
# A fixed seed makes the split repeatable for the same input data, so the
# metrics reported below can be compared between runs.
train, test = features_df.randomSplit([0.75, 0.25], seed=42)

In [35]:
# Report how many rows ended up in the training split.
print(f"Size of train Dataset : {train.count()}" )

In [36]:
# Report how many rows ended up in the test split.
print(f"Size of test Dataset : {test.count()}" )

In [37]:
# Building Linear Regression
from pyspark.ml.regression import LinearRegression

In [38]:
# Linear regression estimator. The column names are spelled out for clarity;
# they are the same values the estimator uses by default.
lr = LinearRegression(featuresCol='features', labelCol='label')

In [39]:
# Fit the linear regression on the training split.
lr_model = lr.fit(train)

In [40]:
# Apply the fitted model to the held-out test split and display the result.
predictions_df=lr_model.transform(test)
predictions_df.show()

In [41]:
# Evaluate the fitted model on the test split. Despite its name,
# `model_predictions` holds the evaluation result whose metrics are read
# below (r2, meanSquaredError), not the per-row predictions.
model_predictions=lr_model.evaluate(test)
# R-squared on the test split (shown as the cell output).
model_predictions.r2


In [42]:
# Mean squared error of the fitted model on the test split.
# (A stray trailing '|' after the call made this line a syntax error.)
print(model_predictions.meanSquaredError)