In this exercise, we will look at how to slice and dice data using pandas.

In [68]:
import pandas as pd
import numpy as np

In [2]:
# Load the MovieLens movie list. Fields in movies.dat are separated by '::';
# a separator longer than one character needs the Python parsing engine, so it
# is requested explicitly. Column names are supplied via `names`, and the
# first field (named 'index') becomes the row index.
df = pd.read_csv(
    './data/movielens/movies.dat',
    sep='::',
    engine='python',
    names=['index', 'movie', 'genre'],
    index_col=0,
)
df.head()

Unnamed: 0_level_0,movie,genre
index,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


## iloc : Position based accessor

We will look at the pandas accessor `iloc`, which selects rows and columns by their integer position (starting at 0) rather than by their labels.

In [3]:
# iloc selects by integer position (0-based), independent of the index labels:
# position 0 is the first row, whose index label happens to be 1.
print("1st record in our list is:\n")
df.iloc[0]

1st record in our list is:



movie               Toy Story (1995)
genre    Animation|Children's|Comedy
Name: 1, dtype: object

In [4]:
# iloc also accepts a column position: [row position, column position].
# Row position 3 is the fourth movie; column position 1 is 'genre'
# (position 0 is 'movie' -- the index is not counted as a column).
df.iloc[3,1]

'Comedy|Drama'

In [5]:
# Break each pipe-delimited genre string into a list of genre names and store
# the lists back in the 'genre' column.
# NOTE(review): this overwrites the column in place, so the cell expects the
# original '|'-delimited strings -- reload the data before re-running it.
df['genre'] = df['genre'].str.split("|")
df.head()

Unnamed: 0_level_0,movie,genre
index,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),"[Animation, Children's, Comedy]"
2,Jumanji (1995),"[Adventure, Children's, Fantasy]"
3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,Waiting to Exhale (1995),"[Comedy, Drama]"
5,Father of the Bride Part II (1995),[Comedy]


In [6]:
# Genre information for the movies labelled 2 to 5. Those rows sit at
# positions 1 to 4, and an iloc slice excludes its end position, so the row
# slice is 1:5. Column position 1 is 'genre'.
df.iloc[1:5,1]

index
2    [Adventure, Children's, Fantasy]
3                   [Comedy, Romance]
4                     [Comedy, Drama]
5                            [Comedy]
Name: genre, dtype: object

## loc : Label based accessor
Now we will look at the label-based accessor `loc`, which selects rows and columns by their index and column labels (or by a boolean mask).

### Let's load a dataset for which we can use label based index

In [25]:
# Load the English Premier League match data (source: http://www.football-data.co.uk/).
# The file is named 2017.csv; the rows previewed by head() are fixtures from
# August 2017. No index column is specified, so rows get the default 0-based
# integer index.
epl = pd.read_csv('./data/EPL/2017.csv')
epl.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA,Unnamed: 65,Unnamed: 66,Unnamed: 67
0,E0,11/08/17,Arsenal,Leicester,4.0,3.0,H,2.0,2.0,D,...,1.91,1.85,2.1,2.02,1.49,4.73,7.25,,,
1,E0,12/08/17,Brighton,Man City,0.0,2.0,A,0.0,0.0,D,...,1.95,1.91,2.01,1.96,11.75,6.15,1.29,,,
2,E0,12/08/17,Chelsea,Burnley,2.0,3.0,A,0.0,3.0,A,...,2.03,1.97,1.95,1.9,1.33,5.4,12.25,,,
3,E0,12/08/17,Crystal Palace,Huddersfield,0.0,3.0,A,0.0,2.0,A,...,2.1,2.05,1.86,1.83,1.79,3.56,5.51,,,
4,E0,12/08/17,Everton,Stoke,1.0,0.0,H,1.0,0.0,H,...,1.94,1.9,2.01,1.98,1.82,3.49,5.42,,,


In [57]:
# Keep only the columns needed for the analysis (all rows, selected by label
# with loc) and give the result/goal columns descriptive names:
#   FTR -> Winner, FTHG -> HomeTeamGoals, FTAG -> AwayTeamGoals.
# The default integer index is kept as-is.
df2 = epl.loc[:, ['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'FTHG', 'FTAG']].rename(
    columns={
        'FTR': 'Winner',
        'FTHG': 'HomeTeamGoals',
        'FTAG': 'AwayTeamGoals',
    }
)
df2.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,Winner,HomeTeamGoals,AwayTeamGoals
0,11/08/17,Arsenal,Leicester,H,4.0,3.0
1,12/08/17,Brighton,Man City,A,0.0,2.0
2,12/08/17,Chelsea,Burnley,A,2.0,3.0
3,12/08/17,Crystal Palace,Huddersfield,A,0.0,3.0
4,12/08/17,Everton,Stoke,H,1.0,0.0


In [94]:
def _team_fixtures(matches, team, at_home):
    """Return one team's home (or away) fixtures, seen from that team's side.

    Parameters
    ----------
    matches : pandas.DataFrame
        Match table with the columns 'Date', 'HomeTeam', 'AwayTeam', 'Winner'
        ('H', 'A' or 'D'), 'HomeTeamGoals' and 'AwayTeamGoals'.
    team : str
        Team name to match against 'HomeTeam' (home fixtures) or 'AwayTeam'
        (away fixtures).
    at_home : bool
        True selects the team's home fixtures, False its away fixtures.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Date', with the columns 'Opponent', 'HomeMatch',
        'GoalsScored', 'GoalsConceded' and 'Result' ('win', 'draw' or 'loss').
    """
    own, other = ('Home', 'Away') if at_home else ('Away', 'Home')
    # 'Winner' holds 'H' or 'A'; the team wins when the code matches its side.
    win_code = 'H' if at_home else 'A'

    fixtures = matches.loc[matches[own + 'Team'] == team]

    # .assign builds a new frame rather than writing into the filtered slice,
    # which avoids the SettingWithCopyWarning that `fixtures.loc[:, col] = ...`
    # raises on a boolean-filtered frame.
    return (
        fixtures
        .assign(
            Opponent=fixtures[other + 'Team'],
            HomeMatch=at_home,
            GoalsScored=fixtures[own + 'TeamGoals'],
            GoalsConceded=fixtures[other + 'TeamGoals'],
            Result=np.where(fixtures['Winner'] == 'D', 'draw',
                            np.where(fixtures['Winner'] == win_code, 'win', 'loss')),
        )
        .drop(['HomeTeam', 'AwayTeam', 'HomeTeamGoals', 'AwayTeamGoals', 'Winner'], axis=1)
        .set_index('Date')
    )


# Home game dataset
mancity_home = _team_fixtures(df2, 'Man City', at_home=True)

# Away game dataset
mancity_away = _team_fixtures(df2, 'Man City', at_home=False)

# All Man City fixtures: home rows first, then away rows
mancity = pd.concat([mancity_home, mancity_away])
mancity.head()

Unnamed: 0_level_0,Opponent,HomeMatch,GoalsScored,GoalsConceded,Result
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21/08/17,Everton,True,1.0,1.0,draw
09/09/17,Liverpool,True,5.0,0.0,win
23/09/17,Crystal Palace,True,5.0,0.0,win
14/10/17,Stoke,True,7.0,2.0,win
21/10/17,Burnley,True,3.0,0.0,win


In [97]:
# Some interesting summary statistics over the combined home and away fixtures.
# NOTE: goaldiff is computed here but is not part of the printed summary.
goaldiff = mancity.loc[:, 'GoalsScored'].sum() - mancity.loc[:, 'GoalsConceded'].sum()

# count() tallies the non-null entries of 'Result', i.e. the matches played
matchesplayed = mancity['Result'].count()

# Tally each outcome by summing a boolean mask
wins, draws, losses = (
    (mancity['Result'] == outcome).sum() for outcome in ('win', 'draw', 'loss')
)

print(
    " ** Man City Stats for 2017 ** \n Matches Played:{0} \t Matches Won:{1} \t Draws:{2} \t Losses:{3}".format(
        matchesplayed, wins, draws, losses
    )
)

 ** Man City Stats for 2017 ** 
 Matches Played:23 	 Matches Won:20 	 Draws:2 	 Losses:1
