# Joins in `pyspark`

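Joins are performed with `df_left.join(df_right, join_condition, how=type_str)`, where `type_str` names the join type (`'inner'`, `'left'`, `'right'`, or `'outer'`).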

In [1]:
from pyspark.sql import SparkSession
from more_pyspark import to_pandas
# Start a Spark session named 'Ops', or reuse one that is already running.
spark = (SparkSession
         .builder
         .appName('Ops')
         .getOrCreate())

# Read the department table: the first CSV row holds the column names and
# Spark infers each column's type from the data.
deptk = (spark.read
         .options(header=True, inferSchema=True)
         .csv("./data/department.csv"))

# Collect the rows to the driver and convert them to pandas for display.
deptk.collect() >> to_pandas

Unnamed: 0,DeptID,DeptName
0,31,Sales
1,33,Engineering
2,34,Clerical
3,35,Marketing


In [2]:
# Read the employee table with a header row and inferred column types.
emplk = (spark.read
         .options(header=True, inferSchema=True)
         .csv("./data/employee.csv"))

# Collect the rows to the driver and convert them to pandas for display.
emplk.collect() >> to_pandas

Unnamed: 0,DeptID,LastName
0,31.0,Rafferty
1,33.0,Jones
2,33.0,Heisenberg
3,34.0,Robinson
4,34.0,Smith
5,,Williams


#### Inner join

In [3]:
# Inner join: keep only employee/department pairs whose DeptID values match.
(emplk
 .join(deptk, on=emplk['DeptID'] == deptk['DeptID'], how='inner')
 .collect()
) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31,Rafferty,Sales
1,33,Jones,Engineering
2,33,Heisenberg,Engineering
3,34,Robinson,Clerical
4,34,Smith,Clerical


#### Left join

In [4]:
# Left join: keep every employee row, including one with no matching
# department (its department columns come back empty).
(emplk
 .join(deptk, on=emplk['DeptID'] == deptk['DeptID'], how='left')
 .collect()
) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31.0,Rafferty,Sales
1,33.0,Jones,Engineering
2,33.0,Heisenberg,Engineering
3,34.0,Robinson,Clerical
4,34.0,Smith,Clerical
5,,Williams,


#### Right join

In [5]:
# Right join: keep every department row, including one with no matching
# employee (its employee columns come back empty).
(emplk
 .join(deptk, on=emplk['DeptID'] == deptk['DeptID'], how='right')
 .collect()
) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,31,Rafferty,Sales
1,33,Heisenberg,Engineering
2,33,Jones,Engineering
3,34,Smith,Clerical
4,34,Robinson,Clerical
5,35,,Marketing


#### Outer join

In [6]:
# Outer join: keep unmatched rows from both sides, i.e. employees without a
# department as well as departments without employees.
(emplk
 .join(deptk, on=emplk['DeptID'] == deptk['DeptID'], how='outer')
 .collect()
) >> to_pandas

Unnamed: 0,DeptID,LastName,DeptName
0,,Williams,
1,35.0,,Marketing
2,34.0,Robinson,Clerical
3,34.0,Smith,Clerical
4,31.0,Rafferty,Sales
5,33.0,Jones,Engineering
6,33.0,Heisenberg,Engineering


## <font color="red"> Exercise 2 </font>

Determine all the players who have hit more than 50 home runs in a season.  The final table should include each player's proper name as well as the team name.

**Hint:** You will need to join the files listed below.  To get credit for this exercise, use the `pyspark` join methods presented above.

In [7]:
import pandas as pd
from dfply import *
# Paths to the three baseball CSV files used in this exercise.
f1 = "./data/baseball/core/Batting.csv"
f2 = "./data/baseball/core/People.csv"
f3 = "./data/baseball/core/Teams.csv"

# Load each file into pandas exactly as stored on disk.
batting_raw = pd.read_csv(f1)
people_raw = pd.read_csv(f2)
teams_raw = pd.read_csv(f3)

# Add an `id` column holding each frame's row index.
batting = batting_raw >> mutate(id=batting_raw.index)
people = people_raw >> mutate(id=people_raw.index)
teams = teams_raw >> mutate(id=teams_raw.index)


In [8]:
from pyspark.sql import SparkSession
from more_pyspark import get_spark_types, to_pandas

# Start a Spark session named 'Ops', or reuse one that is already running.
spark = (SparkSession
         .builder
         .appName('Ops')
         .getOrCreate())



In [10]:
# `batting` is a pandas DataFrame, so `batting.take(5)` calls pandas'
# positional `take` (which rejects a scalar) rather than Spark's
# "first n rows". Load the Batting file as a Spark DataFrame and preview that.
batting_spark = spark.read.csv(f1, header=True, inferSchema=True)
batting_spark.take(5) >> to_pandas

In [None]:
# `people` is a pandas DataFrame, so Spark's `take(5)` ("first 5 rows") is not
# available on it. Load the People file as a Spark DataFrame and preview that.
people_spark = spark.read.csv(f2, header=True, inferSchema=True)
people_spark.take(5) >> to_pandas

In [None]:
# `teams` is a pandas DataFrame, so Spark's `take(5)` ("first 5 rows") is not
# available on it. Load the Teams file as a Spark DataFrame and preview that.
teams_spark = spark.read.csv(f3, header=True, inferSchema=True)
teams_spark.take(5) >> to_pandas

In [None]:
import pyspark.sql.functions as fn
from pyspark.sql.functions import col, isnan

# `batting` was built with pandas and has no Spark `select`/`where`, so load
# the Batting file as a Spark DataFrame for this query.
batting_spark = spark.read.csv(f1, header=True, inferSchema=True)

# Keep the batting rows that record more than 50 home runs.
# TODO: join with the People and Teams files to add the player's proper name
# and the team name, as the exercise asks.
(batting_spark
 .select([batting_spark.yearID,
          batting_spark.teamID,
          batting_spark.HR,
          batting_spark.playerID])
 # The home-run column is `HR`; reference it by name for the filter.
 .where(fn.col('HR') > 50)
 .collect()
) >> to_pandas >> head

## Up Next

Stuff