In [None]:
# Smoke test for the Spark installation: setup for a Monte Carlo estimate of pi.
import findspark

# findspark.init is kept ahead of the pyspark import, exactly as in the original ordering.
findspark.init('spark')

import pyspark
import random

# Number of random points to sample (one hundred million).
num_samples = 10 ** 8

# Spark entry point for this throw-away test job.
sc = pyspark.SparkContext(appName="Pi")
def inside(p):
    """Draw one random point in the unit square and test it against the unit circle.

    Args:
        p: The element being filtered; its value is ignored.

    Returns:
        True when the sampled point (x, y) satisfies x*x + y*y < 1, else False.
    """
    # Two successive draws: first the x coordinate, then the y coordinate.
    x_coord = random.random()
    y_coord = random.random()
    squared_distance = x_coord * x_coord + y_coord * y_coord
    return squared_distance < 1
# Run the Monte Carlo job. The try/finally guarantees the SparkContext is
# stopped even when the job raises, so a failed run does not leave a live
# context behind that would block creating another SparkContext later.
try:
    # One random draw per element of the range; count the draws that pass `inside`.
    count = sc.parallelize(range(0, num_samples)).filter(inside).count()
    # The fraction of accepted draws approximates pi / 4.
    pi = 4 * count / num_samples
    print(pi)
finally:
    sc.stop()

In [1]:
# Imports for loading data (for instance a CSV file) from a Google Cloud Storage bucket
from google.cloud import storage
from io import BytesIO
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pandas.io import gbq

In [2]:
# Open a Cloud Storage client and fetch the bucket by name.
bucket_name = 'py_spark_ds'
client = storage.Client()
bucket = client.get_bucket(bucket_name)

In [3]:
# Reference the CSV object through the bucket's blob factory, then download
# its raw contents into memory.
blob = bucket.blob('player_info.csv')
content = blob.download_as_string()

In [4]:
# Wrap the downloaded payload in an in-memory binary buffer and parse it as CSV.
csv_buffer = BytesIO(content)
pandas_df = pd.read_csv(csv_buffer)

In [5]:
# Initiate the SparkContext -- step 1
# The first step in using Spark is connecting to a cluster.
# The master is connected to the rest of the computers in the cluster, which are called workers.
# The master sends the workers data and calculations to run, and they send their results back to the master.
import findspark
findspark.init('spark')
import pyspark
import random
# getOrCreate makes this cell safe to re-run: constructing SparkContext(...) a
# second time raises "Cannot run multiple SparkContexts at once", whereas
# getOrCreate hands back the context that is already running.
# NOTE: when a context already exists it is returned unchanged, so the app
# name below only takes effect when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("player_data"))

In [6]:
# Sanity check: show the SparkContext and the Spark version, one per line.
print(sc, sc.version, sep="\n")

<SparkContext master=local[*] appName=player_data>
2.4.3


In [7]:
# The Resilient Distributed Dataset (RDD) is Spark's core data structure: a
# low-level object that lets Spark split data across multiple nodes in the cluster.
# RDDs are hard to work with directly, so this notebook uses the Spark DataFrame
# abstraction that is built on top of them.
# Working with Spark DataFrames starts with creating a SparkSession object from the
# SparkContext. Think of the SparkContext as the connection to the cluster and of
# the SparkSession as the interface to that connection.

from pyspark.sql import SparkSession

# Fetch the existing session if there is one; otherwise build a new one.
session_builder = SparkSession.builder
my_spark = session_builder.getOrCreate()
print(my_spark)

<pyspark.sql.session.SparkSession object at 0x7fc84f2874a8>


In [8]:
# Convert the pandas DataFrame into a Spark DataFrame.
s_df = my_spark.createDataFrame(data=pandas_df)

In [18]:
# Inspect the type of s_df; as the cell's last expression, the result is displayed.
type(s_df)

pyspark.sql.dataframe.DataFrame

In [21]:
# Register the DataFrame as the temporary view "players" (replacing any existing
# view of that name) so it can be queried with SQL.
s_df.createOrReplaceTempView(name="players")

In [23]:
# List the tables and views currently known to the session's catalog.
catalog_tables = my_spark.catalog.listTables()
print(catalog_tables)

[Table(name='players', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


In [36]:
# Don't change this query
# FROM-first ordering of the clauses; it reads as "SELECT * FROM players LIMIT 2",
# i.e. every column for two rows of the players table.
query = "FROM players SELECT * LIMIT 2"

In [38]:
# Execute the SQL held in `query` through the Spark session
players2 = my_spark.sql(sqlQuery=query)

# Print the resulting rows as a text table
players2.show()

+----------+-------+---------+------------+----------+------+------------+---+------------------+-----------------+----------------------+------------------+------------------+------------------+------------------+------------------+------------+-----------------+------------------+-------------+------------------+-------------------+------------------+--------------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+----------------+------------------+------------+----------+---------+------+-----------+---------------+------+---------------+----------------+---------------+------------+-------------+---------+---------+-------+----+-------+------------+-----------+---------------+---------------+-------------+-----------+----------------+--------------+----------------+------------------------------+---------

In [39]:
# Sum of deaths per privacy setting.
# Don't change this query
query = "SELECT SUM(deaths), privacy FROM players GROUP BY privacy"

# Execute the aggregation through the Spark session
player_counts = my_spark.sql(sqlQuery=query)

# Bring the aggregated result back as a pandas DataFrame
pd_counts = player_counts.toPandas()

# Show the first five rows of pd_counts
print(pd_counts.head(n=5))

   sum(deaths)  privacy
0      4104841  friends
1       732674       me
2      2662050      all


In [43]:
# Print every column label of the pandas DataFrame, one per line.
for col in pandas_df.columns:
    print(col)

Unnamed: 0
privacy
score
avengerKills
bestStreak
deaths
dogtagsTaken
elo
extra.accuracy
extra.assignments
extra.assignmentsTotal
extra.gspm
extra.hkp
extra.kdr
extra.khp
extra.kpm
extra.medals
extra.medalsTotal
extra.medalsUnique
extra.ribbons
extra.ribbonsTotal
extra.ribbonsUnique
extra.ribpr
extra.roundsFinished
extra.sfpm
extra.spm
extra.unknownKills
extra.vehKillsP
extra.vehKpm
extra.vehTimeP
extra.vehicleKills
extra.vehicleTime
extra.weaKillsP
extra.weaKpm
extra.weaTimeP
extra.weaponKills
extra.weaponTime
extra.wlr
flagCaptures
flagDefend
headshots
heals
killAssists
killStreakBonus
kills
longestHeadshot
longestWinStreak
mcomDefendKills
nemesisKills
nemesisStreak
numLosses
numRounds
numWins
rank
repairs
reset.deaths
reset.kills
reset.lastReset
reset.numLosses
reset.numWins
reset.score
reset.shotsFired
reset.shotsHit
reset.timePlayed
reset.timePlayedSinceLastReset
resupplies
revives
saviorKills
shotsFired
shotsHit
skill
streak
suppressionAssists
timePlayed
vehicleDamage
vehiclesDest