In [2]:
# Import necessary libraries and configure the session.
import os
import warnings

import findspark

# Locate the Spark installation before importing pyspark. Prefer the SPARK_HOME
# environment variable so the notebook is not tied to one machine; fall back to
# the original install location when it is unset or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.1.1-bin-hadoop2.7')

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Reuse an existing session if one is running, otherwise start one named 'regression'.
spark = SparkSession.builder.appName('regression').getOrCreate()

# NOTE(review): this silences every Python warning, including deprecation
# notices from Spark/matplotlib - confirm that is intended.
warnings.filterwarnings('ignore')

# Default figure size (inches) and colour map for all plots in this notebook.
plt.rcParams['figure.figsize'] = 8, 5
plt.rcParams['image.cmap'] = 'viridis'

In [None]:
# Load VideoGamesSales.csv into dataFrame_Initial. The first row supplies the
# column names and Spark infers each column's type from the data.
dataFrame_Initial = spark.read.csv(
    path='../Datasets/VideoGamesSales.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print the number of rows in dataFrame_Initial, i.e. before any cleaning.
print("Total data points:", dataFrame_Initial.count())

In [None]:
# Print the column names and the types Spark inferred for dataFrame_Initial.
dataFrame_Initial.printSchema()

In [None]:
# Summary statistics for the 'Year_of_Release' column only.
dataFrame_Initial.describe('Year_of_Release').show()

In [None]:
# Count the rows released strictly after 2000 and strictly before 2017.
# This only reports a count; dataFrame_Initial itself is not modified.
dataFrame_Initial.where(
    (col('Year_of_Release') > 2000) & (col('Year_of_Release') < 2017)
).select('Year_of_Release').count()

In [None]:
# Drop every row that contains a null, then sort the remainder by
# 'Year_of_Release' in ascending order. Note that this sorts rather than
# filters by year: no year range is applied here.
dataFrame_Filtered = dataFrame_Initial.dropna().orderBy('Year_of_Release')

# Preview the first five rows of the cleaned, sorted frame.
dataFrame_Filtered.head(5)

In [None]:
# Print the number of rows left in dataFrame_Filtered after rows with missing values were dropped.
print("Total data points:", dataFrame_Filtered.count())

In [None]:
# Append an 'ID' key column after the existing columns. The generated values
# are unique and increasing but are not guaranteed to be consecutive.
dataFrame_wKey = dataFrame_Filtered.withColumn('ID', monotonically_increasing_id())

In [None]:
# Set column types to accurately reflect data.
# A cast yields null for any value that cannot be parsed as the target type.
# Those nulls appear after the earlier null-drop and would later break the
# feature assembler / regression, so rows that fail a cast are removed here.
# NOTE(review): presumably the source columns contain non-numeric placeholders;
# if they do not, the final drop is a no-op.
dataFrame_wKey = (
    dataFrame_wKey
    .withColumn('User_Score', col('User_Score').cast('float'))
    .withColumn('Year_of_Release', col('Year_of_Release').cast('int'))
    .withColumn('User_Count', col('User_Count').cast('int'))
    .withColumn('Critic_Count', col('Critic_Count').cast('int'))
    .na.drop(subset=['User_Score', 'Year_of_Release', 'User_Count', 'Critic_Count'])
)

In [None]:
# Print the schema of dataFrame_wKey to confirm the casts applied above took effect.
dataFrame_wKey.printSchema()

In [None]:
# Columns carried forward into the analysis, in display order: the row key,
# descriptive fields, the sales figure and the critic/user review fields.
columns_Useful = [
    'ID',
    'Name',
    'Platform',
    'Year_of_Release',
    'Genre',
    'Global_Sales',
    'Critic_Score',
    'Critic_Count',
    'User_Score',
    'User_Count',
]

In [None]:
# Project dataFrame_wKey down to the columns listed in columns_Useful.
dataFrame_Useful = dataFrame_wKey.select(columns_Useful)

In [None]:
# Preview the first five rows of dataFrame_Useful.
dataFrame_Useful.take(5)

In [None]:
# Summary statistics for the useful columns, collected into pandas and
# transposed so that each original column becomes a row.
pd_df_Useful = (
    dataFrame_Useful
    .describe()
    .toPandas()
    .T
)

In [None]:
# Predictor columns for the regression. 'ID' is deliberately left out: it is a
# synthetic row key generated by monotonically_increasing_id(), not a property
# of a game, so it carries no meaningful signal for predicting sales.
input_Columns = ['Year_of_Release', 'Critic_Score', 'User_Score']

# Combine the predictor columns into a single vector column named 'Features'.
vector_Assembler = VectorAssembler(inputCols=input_Columns, outputCol='Features')

# Transform the data: adds the 'Features' column alongside the existing ones.
vector_Output = vector_Assembler.transform(dataFrame_Useful)

# Schema table with Features column added.
vector_Output.printSchema()

In [None]:
# Keep only the assembled feature vector and the regression label.
# Note: vector_output (lower-case 'o') is a separate variable from the
# vector_Output frame it is derived from.
vector_output = vector_Output.select('Features', 'Global_Sales')

# Print the first five rows to confirm only those two columns remain.
print(vector_output.take(5))

In [None]:
# Randomly split the data roughly 70% / 30% into training and test sets.
# A fixed seed is passed so the split can be repeated from run to run
# (assuming the input frame is produced the same way each time).
data_train,data_test = vector_output.randomSplit([0.7, 0.3], seed=42)

# Show data_train
data_train.show()

# Show data_test
data_test.show()


In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression of 'Global_Sales' on the assembled 'Features' vector.
regression = LinearRegression(featuresCol='Features', labelCol='Global_Sales')

# Fit the training data.
regression_Model = regression.fit(data_train)

# Print the coefficients and intercept.
print("Linear Regression Coefficients: " + str(regression_Model.coefficients))
print("Linear Regression Intercept: " + str(regression_Model.intercept))

# Summarise the model on the data it was trained on.
summary = regression_Model.summary

# Print RMSE and R2
print("Linear Regression RMSE on training data: " + str(summary.rootMeanSquaredError))
print("Linear Regression R2 on training data: " + str(summary.r2))

In [22]:
# Visualize the coefficients, sorted in ascending order.
# The model attribute is `coefficients` (plural); convert the Spark vector to
# a NumPy array before sorting.
beta = np.sort(regression_Model.coefficients.toArray())

# Initial plot of data.
plt.plot(beta)

# Label both axes so the figure stands on its own.
plt.xlabel('Coefficient rank (ascending)')
plt.ylabel('Beta Coefficients')
plt.show()

NameError: name 'regression_Model' is not defined

In [None]:
# Evaluate the fitted model on the held-out test split.
# The names must match the earlier definitions exactly: `data_test` from the
# train/test split and `results_Test` assigned on the next line.
results_Test = regression_Model.evaluate(data_test)

# Print RMSE and R2
print("Linear Regression RMSE on test data: " + str(results_Test.rootMeanSquaredError))
print("Linear Regression R2 on test data: " + str(results_Test.r2))