## Step 1: Create spark context

In [1]:
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [2]:
spark

In [3]:
sc

## Step 2: Create a pandas dataframe

In [4]:
import pandas as pd
p_df = pd.read_csv('data/users.csv', sep='|')

In [6]:
p_df.head()

Unnamed: 0,1,24,M,technician,85711
0,2,53,F,other,94043
1,3,23,M,writer,32067
2,4,24,M,technician,43537
3,5,33,F,other,15213
4,6,42,M,executive,98101


## Step 3: Create spark dataframe

In [7]:
rdd = sc.textFile("data/users.csv").map(lambda line: line.split("|"))

In [8]:
rdd.take(5)

[['1', '24', 'M', 'technician', '85711'],
 ['2', '53', 'F', 'other', '94043'],
 ['3', '23', 'M', 'writer', '32067'],
 ['4', '24', 'M', 'technician', '43537'],
 ['5', '33', 'F', 'other', '15213']]

In [None]:
df = spark.read.csv("data/users.csv", sep="|")

In [None]:
type(df)

In [None]:
# it's already a DF, but this is the easy way to rename columns
df = (spark.read.csv("data/users.csv", sep="|")
           .toDF("id", "age", "gender", "occupation", "zip"))

In [None]:
(
    df.where("occupation != 'other'")
      .groupby("occupation")
      .count()
      .sort("count", ascending=0)
      .show()
)

In [None]:
df.persist()

In [None]:
df.limit(5).toPandas()
#df.head()
#df.show(5)

In [None]:
def show(df, n=5):
    return df.limit(n).toPandas()

In [None]:
import pyspark.sql.functions as F
df.agg(F.countDistinct('occupation')).show()

In [None]:
query = """
SELECT occupation, COUNT(*) as count
FROM users
GROUP BY occupation
ORDER BY count DESC
"""
df.createOrReplaceTempView('users')
output = spark.sql(query)
output.show(10)