# Lab 4 - Pitching Aggregation

## Setup

#### `pandas`

In [1]:
import pandas as pd
from dfply import *
pitching_raw = pd.read_csv('./data/baseball/core/Pitching.csv') 
pitching = (pitching_raw >>
             mutate(id = pitching_raw.index))
pitching.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,id
0,bechtge01,1871,1,PH1,,1,2,3,3,2,...,7,,0,146.0,0,42,,,,0
1,brainas01,1871,1,WS3,,12,15,30,30,30,...,7,,0,1291.0,0,292,,,,1
2,fergubo01,1871,1,NY2,,0,0,1,0,0,...,2,,0,14.0,0,9,,,,2
3,fishech01,1871,1,RC1,,4,16,24,24,22,...,20,,0,1080.0,1,257,,,,3
4,fleetfr01,1871,1,NY2,,0,1,1,1,1,...,0,,0,57.0,0,21,,,,4


#### `sqlalchemy`

In [2]:
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import select as select_sql

pitching_eng = create_engine("sqlite:///databases/baseball_2_5.db") 
Base = automap_base()
Base.prepare(pitching_eng, reflect=True)
Pitching = Base.classes.pitching
Pitching

sqlalchemy.ext.automap.pitching

In [3]:
stmt = select_sql('*').select_from(Pitching).limit(5)
pd.read_sql_query(stmt, con=pitching_eng)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,id
0,bechtge01,1871,1,PH1,,1,2,3,3,2,...,7,,0,146.0,0,42,,,,0
1,brainas01,1871,1,WS3,,12,15,30,30,30,...,7,,0,1291.0,0,292,,,,1
2,fergubo01,1871,1,NY2,,0,0,1,0,0,...,2,,0,14.0,0,9,,,,2
3,fishech01,1871,1,RC1,,4,16,24,24,22,...,20,,0,1080.0,1,257,,,,3
4,fleetfr01,1871,1,NY2,,0,1,1,1,1,...,0,,0,57.0,0,21,,,,4


#### `pyspark`

In [4]:
from pyspark.sql import SparkSession
from more_pyspark import get_spark_types, to_pandas

spark = SparkSession.builder.appName('Ops').getOrCreate()

schema = get_spark_types(pitching, keys=['id'])

pitching_spark = spark.createDataFrame(pitching, schema=schema)
(pitching_spark.
   take(5)) >> to_pandas

Unnamed: 0,playerID,yearID,stint,teamID,lgID,W,L,G,GS,CG,...,WP,HBP,BK,BFP,GF,R,SH,SF,GIDP,id
0,bechtge01,1871,1,PH1,,1,2,3,3,2,...,7,,0,146.0,0,42,,,,0
1,brainas01,1871,1,WS3,,12,15,30,30,30,...,7,,0,1291.0,0,292,,,,1
2,fergubo01,1871,1,NY2,,0,0,1,0,0,...,2,,0,14.0,0,9,,,,2
3,fishech01,1871,1,RC1,,4,16,24,24,22,...,20,,0,1080.0,1,257,,,,3
4,fleetfr01,1871,1,NY2,,0,1,1,1,1,...,0,,0,57.0,0,21,,,,4


## Task 1

Compute and plot the average home runs per game allowed per year for all years in the pitching table.  To accomplish this task, you should

1. Aggregate total `HR` and `IPouts` for each year.
1. Create a `games_pitched` column, which is computed by dividing total inning pitched outs `IPouts` by 27, the number of outs in a game.
2. Create a `hr_per_game` column by dividing the total `HR` for each year by `games_pitched` for each year.
4. Make line plot of the results. You will need to create a new column `HR_per_game` then perform the group and aggregate. You can use [seaborn's lineplot](https://seaborn.pydata.org/generated/seaborn.lineplot.html) to make the graph.

Solve this problem in each framework.

## <font color="red"> Problem 1 </font>

Explain why we might want to total the home runs and outs pitched before dividing.

> *Your thoughts here*

## <font color="red"> Problem 2 </font>

Complete the above tasks using `pandas` and `dfply`

In [7]:
hr_per_year = (pitching
                >> select(X.yearID, X.HR, X.IPouts)
                >> group_by(X.yearID)
                >> summarise(total_hr = X.HR.sum(),
                             total_ipouts = X.IPouts.sum())
                >> mutate(games_pitched = X.total_ipouts/27)
                >> mutate(hr_per_game = X.total_hr/X.games_pitched))
hr_per_year.head()

Unnamed: 0,yearID,total_hr,total_ipouts,games_pitched,hr_per_game
0,1871,47,6750,250.0,0.188
1,1872,37,9858,365.111111,0.101339
2,1873,47,10754,398.296296,0.118003
3,1874,40,12509,463.296296,0.086338
4,1875,40,18571,687.814815,0.058155


In [8]:
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline
sns.lineplot(x = 'yearID', y= 'hr_per_game', data=hr_per_game)

NameError: name 'hr_per_game' is not defined

## <font color="red"> Problem 3 </font>

Complete the above tasks using `sqlalchemy`

In [16]:
from sqlalchemy.sql import select as select_sql
from sqlalchemy import func
from more_sqlalchemy import pprint
totals = (select_sql([Pitching.yearID,
                     func.sum(Pitching.HR).label('total_hr'),
                     func.sum(Pitching.IPouts).label('total_ipouts'),
                     ])
         .group_by(Pitching.yearID)
         .alias('tot')
         )
pprint(totals)

SELECT pitching."yearID",
       sum(pitching."HR") AS total_hr,
       sum(pitching."IPouts") AS total_ipouts
FROM pitching
GROUP BY pitching."yearID"


In [19]:
from sqlalchemy import text
game_pitched = (select_sql([totals.c.yearID,
                           totals.c.total_ipouts,
                           totals.c.total_hr,
                           (totals.c.total_ipouts/27).label('game_pitched'),
                           ])
               .select_from(totals)
               .alias('games_pitched')
               )
pprint(game_pitched)

SELECT tot."yearID",
       tot.total_ipouts,
       tot.total_hr,
       tot.total_ipouts / :total_ipouts_1 AS game_pitched
FROM
  (SELECT pitching."yearID" AS "yearID",
          sum(pitching."HR") AS total_hr,
          sum(pitching."IPouts") AS total_ipouts
   FROM pitching
   GROUP BY pitching."yearID") AS tot


In [None]:
from sqlalchemy import cast, Float
ratios = (select_sql([game_pitched.c.yearID,
                     cast(game_pitched.c.total_hr, Float)/game_pitched.c.games_pitched).label('hr_per_game')
                     ])

In [None]:
import seaborn as sns
import matplotlib.pylab as plt


## <font color="red"> Problem 4 </font>

Complete the above tasks using `pyspark`

In [263]:
# Your code here

## <font color="blue"> Key for Problem 4</font>

## Task 2

For each year, determine the team that had the most home runs (using the `Pitching.csv` file).

## <font color="red"> Problem 5 </font>

Solve **Task 2** with the framework of your choice.

In [263]:
# Your code here