In [111]:
# Import necessary libraries.
# Notebook setup: locate Spark, start a session, silence warnings and set
# plotting defaults. Statement order matters in this cell.
import findspark
# findspark.init is called before `import pyspark` below.
# NOTE(review): the install path is machine-specific — confirm it (or make it
# configurable) before running this notebook on another machine.
findspark.init('/home/user/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
# Reuse an existing session if one is active, otherwise create one named 'basics'.
spark = SparkSession.builder.appName('basics').getOrCreate()

from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import col

import warnings
# Suppress every warning raised from this point on (including any emitted by
# the imports that follow).
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

# Default figure size (width, height in inches) and default colormap for plots.
plt.rcParams['figure.figsize'] = 8, 5
plt.rcParams['image.cmap'] = 'viridis'
import pandas as pd

In [112]:
# Load the raw video game sales CSV into dataFrame_Initial.
# The first line of the file is treated as the header and column types are
# inferred from the data.
path_Initial = '../Datasets/VideoGamesSales.csv'
dataFrame_Initial = spark.read.options(header=True, inferSchema=True).csv(path_Initial)

In [113]:
# Report how many rows the raw frame (dataFrame_Initial) contains.
count_Initial = dataFrame_Initial.count()
print("Total data points:", count_Initial)

Total data points: 16719


In [114]:
# Build dataFrame_Filtered by discarding every row of dataFrame_Initial that
# has a null in any column.
dataFrame_Filtered = dataFrame_Initial.dropna()

In [115]:
# Report how many rows remain in dataFrame_Filtered after the null filter.
count_Filtered = dataFrame_Filtered.count()
print("Total data points:", count_Filtered)

Total data points: 6947


In [116]:
# Info table (column names, types, nullability) for dataFrame_Filtered,
# as inferred from the CSV and before any explicit casts.
dataFrame_Filtered.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year_of_Release: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- NA_Sales: double (nullable = true)
 |-- EU_Sales: double (nullable = true)
 |-- JP_Sales: double (nullable = true)
 |-- Other_Sales: double (nullable = true)
 |-- Global_Sales: double (nullable = true)
 |-- Critic_Score: integer (nullable = true)
 |-- Critic_Count: integer (nullable = true)
 |-- User_Score: string (nullable = true)
 |-- User_Count: integer (nullable = true)
 |-- Developer: string (nullable = true)
 |-- Rating: string (nullable = true)



In [117]:
# Tag every row with a unique, increasing (but not necessarily consecutive) ID.
# The result is stored under its own name: withColumn returns a new DataFrame
# and leaves dataFrame_Filtered untouched, so the bare expression used here
# before only displayed the new frame and then discarded it.
# select('*') was dropped because withColumn already keeps all existing columns.
dataFrame_WithID = dataFrame_Filtered.withColumn('ID', monotonically_increasing_id())
dataFrame_WithID

DataFrame[Name: string, Platform: string, Year_of_Release: string, Genre: string, Publisher: string, NA_Sales: double, EU_Sales: double, JP_Sales: double, Other_Sales: double, Global_Sales: double, Critic_Score: int, Critic_Count: int, User_Score: string, User_Count: int, Developer: string, Rating: string, ID: bigint]

In [118]:
# Set column types to accurately reflect data.
# User_Score and Year_of_Release were inferred as strings when the CSV was
# read, so they are cast to numeric types here. The two count columns are cast
# as well so the resulting schema does not depend on what inferSchema picked.
column_Casts = [
    ('User_Score', 'float'),
    ('Year_of_Release', 'int'),
    ('User_Count', 'int'),
    ('Critic_Count', 'int'),
]
for column_Name, column_Type in column_Casts:
    dataFrame_Filtered = dataFrame_Filtered.withColumn(column_Name, col(column_Name).cast(column_Type))

# A cast turns any string that is not a valid number into null. Such rows were
# not removed by the earlier na.drop() because they held non-null text, so
# drop them now to keep the frame free of missing values after the casts.
dataFrame_Filtered = dataFrame_Filtered.na.drop(subset=[name for name, _type in column_Casts])

In [119]:
# Info table for dataFrame_Filtered after the casts above, to confirm that the
# columns now carry the intended data types.
dataFrame_Filtered.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year_of_Release: integer (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- NA_Sales: double (nullable = true)
 |-- EU_Sales: double (nullable = true)
 |-- JP_Sales: double (nullable = true)
 |-- Other_Sales: double (nullable = true)
 |-- Global_Sales: double (nullable = true)
 |-- Critic_Score: integer (nullable = true)
 |-- Critic_Count: integer (nullable = true)
 |-- User_Score: float (nullable = true)
 |-- User_Count: integer (nullable = true)
 |-- Developer: string (nullable = true)
 |-- Rating: string (nullable = true)



In [120]:
# Names of the columns kept for the analysis, grouped by what they describe.
columns_Useful = [
    # Identification of the game.
    'Name',
    'Platform',
    'Year_of_Release',
    'Genre',
    # Commercial performance.
    'Global_Sales',
    # Critic and user reception.
    'Critic_Score',
    'Critic_Count',
    'User_Score',
    'User_Count',
    # Content rating.
    'Rating',
]

In [121]:
# Project dataFrame_Filtered down to the columns listed in columns_Useful.
dataFrame_Useful = dataFrame_Filtered.select(columns_Useful)

In [122]:
# Preview dataFrame_Useful: fetch its first 10 rows as a list of Row objects.
dataFrame_Useful.take(10)

[Row(Name='Wii Sports', Platform='Wii', Year_of_Release=2006, Genre='Sports', Global_Sales=82.53, Critic_Score=76, Critic_Count=51, User_Score=8.0, User_Count=322, Rating='E'),
 Row(Name='Mario Kart Wii', Platform='Wii', Year_of_Release=2008, Genre='Racing', Global_Sales=35.52, Critic_Score=82, Critic_Count=73, User_Score=8.300000190734863, User_Count=709, Rating='E'),
 Row(Name='Wii Sports Resort', Platform='Wii', Year_of_Release=2009, Genre='Sports', Global_Sales=32.77, Critic_Score=80, Critic_Count=73, User_Score=8.0, User_Count=192, Rating='E'),
 Row(Name='New Super Mario Bros.', Platform='DS', Year_of_Release=2006, Genre='Platform', Global_Sales=29.8, Critic_Score=89, Critic_Count=65, User_Score=8.5, User_Count=431, Rating='E'),
 Row(Name='Wii Play', Platform='Wii', Year_of_Release=2006, Genre='Misc', Global_Sales=28.92, Critic_Score=58, Critic_Count=41, User_Score=6.599999904632568, User_Count=129, Rating='E'),
 Row(Name='New Super Mario Bros. Wii', Platform='Wii', Year_of_Releas

In [123]:
# Compute summary statistics for dataFrame_Useful in Spark, bring the small
# result over to pandas, and flip it so each original column becomes a row.
summary_Useful = dataFrame_Useful.describe()
pd_df_Useful = summary_Useful.toPandas().T