In [None]:
import time, sys

def countdown(t):
    while t:
        mins, secs = divmod(t, 60)
        timeformat = '{:02d}:{:02d}'.format(mins, secs)
        sys.stdout.write('\r' + str(timeformat))
        sys.stdout.flush()
        time.sleep(1)
        t -= 1
    print("Time's Up! \n")

In [None]:
import pandas as pd
from pandas import Series, DataFrame
pd.set_option('max_rows', 15)
pd.set_option('max_columns', 10)
pd.set_option('notebook_repr_html', True)

import numpy as np
%pylab inline

In [None]:
print pd.__version__

### 1 - Import Data with `read_csv()`

In [None]:
pd.read_csv?

In [None]:
# Import the Data
df = pd.read_csv('https://raw.githubusercontent.com/rasbt/python_reference/master/Data/some_soccer_data.csv')
df.head()

In [None]:
type(df)

#### Check the dimensions of the imported data with `.shape`

In [7]:
df.shape

(10, 8)

** Check the data types for each column **

In [8]:
df.dtypes

PLAYER     object
SALARY     object
GP        float64
G           int64
A         float64
SOT         int64
PPG       float64
P         float64
dtype: object

In [10]:
df.get_dtype_counts()

float64    4
int64      2
object     2
dtype: int64

---
### 2 - Edit column names with `.rename()`

In [12]:
# Edit column names
df.columns = [x.lower() for x in df.columns]
df.head()

Unnamed: 0,player,salary,gp,g,a,sot,ppg,p
0,Sergio Agüero\n Forward — Manchester City,$19.2m,16.0,14,3.0,34,13.12,209.98
1,Eden Hazard\n Midfield — Chelsea,$18.9m,21.0,8,4.0,17,13.05,274.04
2,Alexis Sánchez\n Forward — Arsenal,$17.6m,,12,7.0,29,11.19,223.86
3,Yaya Touré\n Midfield — Manchester City,$16.6m,18.0,7,1.0,19,10.99,197.91
4,Ángel Di María\n Midfield — Manchester United,$15.0m,13.0,3,,13,10.17,132.23


In [13]:
# Rename specific columns
df = df.rename(columns={'gp': 'gamesPlayed',
               'g': 'goals',
               'a': 'assists',
               'sot': 'shotsOnTarget',
               'ppg': 'pointsPerGame',
               'p': 'points'})
df.columns

Index([u'player', u'salary', u'gamesPlayed', u'goals', u'assists',
       u'shotsOnTarget', u'pointsPerGame', u'points'],
      dtype='object')

---
### 3 - Change values in a column with `.apply()`

In [14]:
df.salary.head()

0    $19.2m
1    $18.9m
2    $17.6m
3    $16.6m
4    $15.0m
Name: salary, dtype: object

In [16]:
df.salary.map(lambda x: x.strip('$m')).head()

0    19.2
1    18.9
2    17.6
3    16.6
4    15.0
Name: salary, dtype: object

In [17]:
# Change entries in a column by row
df.loc[:, 'salary'] = (df.salary
                       .map(lambda x: float(x.strip('$m'))))

> Note:

`.apply()` on a single column has the same effect as using `map()`

---
### 4 - Adding Columns to a DF

In [20]:
print df.player[1]

Eden Hazard
 Midfield — Chelsea


In [None]:
# Derive new columns from the 'player column'

In [29]:
def processPlayerCol(text):
    """
    This function extracts name, position and club from the player column
    """
    name, rest = text.split('\n')
    position, club = [x.strip() for x in rest.split('—')]
    return Series([name, position, club], index=['player', 'position', 'club'])

In [33]:
processPlayerCol(df.player[5])

player      Santiago Cazorla
position            Midfield
club                 Arsenal
dtype: object

In [36]:
df.join(DataFrame(df['player'].map(lambda x: processPlayerCol(x)).tolist()), lsuffix='_l').drop('player_l', axis=1)

Unnamed: 0,salary,gamesPlayed,goals,assists,shotsOnTarget,pointsPerGame,points,player,position,club
0,19.2,16.0,14,3.0,34,13.12,209.98,Sergio Agüero,Forward,Manchester City
1,18.9,21.0,8,4.0,17,13.05,274.04,Eden Hazard,Midfield,Chelsea
2,17.6,,12,7.0,29,11.19,223.86,Alexis Sánchez,Forward,Arsenal
3,16.6,18.0,7,1.0,19,10.99,197.91,Yaya Touré,Midfield,Manchester City
4,15.0,13.0,3,,13,10.17,132.23,Ángel Di María,Midfield,Manchester United
5,14.8,20.0,4,,20,9.97,,Santiago Cazorla,Midfield,Arsenal
6,14.3,15.0,6,2.0,11,10.35,155.26,David Silva,Midfield,Manchester City
7,14.0,20.0,2,14.0,10,10.47,209.49,Cesc Fàbregas,Midfield,Chelsea
8,13.8,21.0,9,0.0,20,7.02,147.43,Saido Berahino,Forward,West Brom
9,13.8,20.0,5,1.0,11,7.5,150.01,Steven Gerrard,Midfield,Liverpool


### Time Comparison for Type Conversion (single values, series) functions

In [None]:
int('12345')

In [None]:
float('12345')

In [None]:
%timeit Series(['123', '456', '789']).astype(int)

In [None]:
%timeit Series(['123', '456', '789']).map(int)

In [None]:
[int(x) for x in ['123', '456', '789']]

In [None]:
%timeit Series([int(x) for x in ['123', '456', '789']])

> Using List Comprehensions is FASTER for element-wise transformations

---
### 5 - Dropping columns from a DF with `.drop(labels, axis)`

In [None]:
df

In [38]:
df['player2'] = df.player.map(lambda x: x.split('\n')[0])
df['club'] = df.player.map(lambda x: x.split('—')[1])
df['position'] = df.player.map(lambda x: x.split('—')[0].split('\n')[1])

In [None]:
df.drop('player', axis=1, inplace=True)

In [None]:
# df = df.drop(labels='player', axis=1)
df.head()    

In [None]:
df.rename(columns={'player2':'player'}, inplace=True)

In [None]:
df.head()

---
### 6 - Applying a function to multiple columns with `applymap()`

In [42]:
# Change the created columns to lowercase
cols = ['player2', 'position', 'club']
df[cols].head()

Unnamed: 0,player2,position,club
0,Sergio Agüero,Forward,Manchester City
1,Eden Hazard,Midfield,Chelsea
2,Alexis Sánchez,Forward,Arsenal
3,Yaya Touré,Midfield,Manchester City
4,Ángel Di María,Midfield,Manchester United


In [43]:
df.loc[:, cols] = df[cols].applymap(lambda x: x.lower())
df[cols].head()

Unnamed: 0,player2,position,club
0,sergio agüero,forward,manchester city
1,eden hazard,midfield,chelsea
2,alexis sánchez,forward,arsenal
3,yaya touré,midfield,manchester city
4,Ángel di maría,midfield,manchester united


In [48]:
num_cols = df.describe().columns.tolist()

In [None]:
print df[num_cols].astype(float)
print df[num_cols].applymap(float)
print df[num_cols].applymap(lambda x: float(x))

---
### 7 - Deal with missing data using `isnull()`, `dropna()` and `fillna()`

#### Remove rows with missing data using `.dropna()`

In [49]:
df

Unnamed: 0,player,salary,gamesPlayed,goals,assists,...,pointsPerGame,points,player2,club,position
0,Sergio Agüero\n Forward — Manchester City,19.2,16.0,14,3.0,...,13.12,209.98,sergio agüero,manchester city,forward
1,Eden Hazard\n Midfield — Chelsea,18.9,21.0,8,4.0,...,13.05,274.04,eden hazard,chelsea,midfield
2,Alexis Sánchez\n Forward — Arsenal,17.6,,12,7.0,...,11.19,223.86,alexis sánchez,arsenal,forward
3,Yaya Touré\n Midfield — Manchester City,16.6,18.0,7,1.0,...,10.99,197.91,yaya touré,manchester city,midfield
4,Ángel Di María\n Midfield — Manchester United,15.0,13.0,3,,...,10.17,132.23,Ángel di maría,manchester united,midfield
5,Santiago Cazorla\n Midfield — Arsenal,14.8,20.0,4,,...,9.97,,santiago cazorla,arsenal,midfield
6,David Silva\n Midfield — Manchester City,14.3,15.0,6,2.0,...,10.35,155.26,david silva,manchester city,midfield
7,Cesc Fàbregas\n Midfield — Chelsea,14.0,20.0,2,14.0,...,10.47,209.49,cesc fàbregas,chelsea,midfield
8,Saido Berahino\n Forward — West Brom,13.8,21.0,9,0.0,...,7.02,147.43,saido berahino,west brom,forward
9,Steven Gerrard\n Midfield — Liverpool,13.8,20.0,5,1.0,...,7.5,150.01,steven gerrard,liverpool,midfield


In [50]:
df.dropna()

Unnamed: 0,player,salary,gamesPlayed,goals,assists,...,pointsPerGame,points,player2,club,position
0,Sergio Agüero\n Forward — Manchester City,19.2,16.0,14,3.0,...,13.12,209.98,sergio agüero,manchester city,forward
1,Eden Hazard\n Midfield — Chelsea,18.9,21.0,8,4.0,...,13.05,274.04,eden hazard,chelsea,midfield
3,Yaya Touré\n Midfield — Manchester City,16.6,18.0,7,1.0,...,10.99,197.91,yaya touré,manchester city,midfield
6,David Silva\n Midfield — Manchester City,14.3,15.0,6,2.0,...,10.35,155.26,david silva,manchester city,midfield
7,Cesc Fàbregas\n Midfield — Chelsea,14.0,20.0,2,14.0,...,10.47,209.49,cesc fàbregas,chelsea,midfield
8,Saido Berahino\n Forward — West Brom,13.8,21.0,9,0.0,...,7.02,147.43,saido berahino,west brom,forward
9,Steven Gerrard\n Midfield — Liverpool,13.8,20.0,5,1.0,...,7.5,150.01,steven gerrard,liverpool,midfield


#### Test % of rows with missing data

In [52]:
print('%d rows have missing values' % (df.shape[0] - df.dropna().shape[0]))

3 rows have missing values


#### Select rows with missing/non-missing data in a specific column 

In [53]:
# On which players do we not have data for assists?
df[df.assists.isnull()]
# also see -- .notnull()

Unnamed: 0,player,salary,gamesPlayed,goals,assists,...,pointsPerGame,points,player2,club,position
4,Ángel Di María\n Midfield — Manchester United,15.0,13.0,3,,...,10.17,132.23,Ángel di maría,manchester united,midfield
5,Santiago Cazorla\n Midfield — Arsenal,14.8,20.0,4,,...,9.97,,santiago cazorla,arsenal,midfield


In [54]:
df[df.assists.notnull()]

Unnamed: 0,player,salary,gamesPlayed,goals,assists,...,pointsPerGame,points,player2,club,position
0,Sergio Agüero\n Forward — Manchester City,19.2,16.0,14,3.0,...,13.12,209.98,sergio agüero,manchester city,forward
1,Eden Hazard\n Midfield — Chelsea,18.9,21.0,8,4.0,...,13.05,274.04,eden hazard,chelsea,midfield
2,Alexis Sánchez\n Forward — Arsenal,17.6,,12,7.0,...,11.19,223.86,alexis sánchez,arsenal,forward
3,Yaya Touré\n Midfield — Manchester City,16.6,18.0,7,1.0,...,10.99,197.91,yaya touré,manchester city,midfield
6,David Silva\n Midfield — Manchester City,14.3,15.0,6,2.0,...,10.35,155.26,david silva,manchester city,midfield
7,Cesc Fàbregas\n Midfield — Chelsea,14.0,20.0,2,14.0,...,10.47,209.49,cesc fàbregas,chelsea,midfield
8,Saido Berahino\n Forward — West Brom,13.8,21.0,9,0.0,...,7.02,147.43,saido berahino,west brom,forward
9,Steven Gerrard\n Midfield — Liverpool,13.8,20.0,5,1.0,...,7.5,150.01,steven gerrard,liverpool,midfield


---
### Impute missing data `.fillna()`

In [55]:
# Impute with Median
df['assists'].fillna(int(df.assists.median()), inplace=True)

# Impute with Mean
df['gamesPlayed'].fillna(int(df.gamesPlayed.mean()), inplace=True)

df[['player', 'assists', 'gamesPlayed']]

Unnamed: 0,player,assists,gamesPlayed
0,Sergio Agüero\n Forward — Manchester City,3.0,16.0
1,Eden Hazard\n Midfield — Chelsea,4.0,21.0
2,Alexis Sánchez\n Forward — Arsenal,7.0,18.0
3,Yaya Touré\n Midfield — Manchester City,1.0,18.0
4,Ángel Di María\n Midfield — Manchester United,2.0,13.0
5,Santiago Cazorla\n Midfield — Arsenal,2.0,20.0
6,David Silva\n Midfield — Manchester City,2.0,15.0
7,Cesc Fàbregas\n Midfield — Chelsea,14.0,20.0
8,Saido Berahino\n Forward — West Brom,0.0,21.0
9,Steven Gerrard\n Midfield — Liverpool,1.0,20.0


---
### 8 - Sorting using `.sort()` or `sort_index()`

In [56]:
df.columns.values

array(['player', 'salary', 'gamesPlayed', 'goals', 'assists',
       'shotsOnTarget', 'pointsPerGame', 'points', 'player2', 'club',
       'position'], dtype=object)

In [57]:
# Find the top 3 goalscorers
df.sort_values(by=['goals'], ascending=False).loc[:, ['player2', 'goals']].head(3)

Unnamed: 0,player2,goals
0,sergio agüero,14
2,alexis sánchez,12
8,saido berahino,9


In [None]:
# Drop old index, create new
df.reset_index().drop('index', axis=1)

In [None]:
(df
 .reset_index()
 .drop('index', axis=1)
 .set_index('player'))

### 9 - Modifying the index `set_index()` and `reindex()`

In [None]:
df.set_index('player', inplace=True)

In [None]:
df[:3]

In [None]:
df.reset_index(inplace=True)

In [None]:
df[:3]

---
### 10 - Subsetting data using Booleans

In [58]:
df['club'] = df.club.map(lambda x: x.strip())
df['position'] = df.position.map(lambda x: x.strip())

In [60]:
df.loc[(df['club'] == 'arsenal') | (df['club'] == 'chelsea'), ['player2', 'club']]

Unnamed: 0,player2,club
1,eden hazard,chelsea
2,alexis sánchez,arsenal
5,santiago cazorla,arsenal
7,cesc fàbregas,chelsea


In [61]:
# Alternate way of selecting only Arsenal and Chelsea players
df[df.club.isin(['arsenal', 'chelsea'])].loc[:, ['player2', 'club']]

Unnamed: 0,player2,club
1,eden hazard,chelsea
2,alexis sánchez,arsenal
5,santiago cazorla,arsenal
7,cesc fàbregas,chelsea


In [62]:
df[[x in ['arsenal', 'chelsea'] for x in df.club]].loc[:, ['player2', 'club']]

Unnamed: 0,player2,club
1,eden hazard,chelsea
2,alexis sánchez,arsenal
5,santiago cazorla,arsenal
7,cesc fàbregas,chelsea


In [63]:
df.query("club in ('arsenal', 'chelsea')").loc[:, ['player2', 'club']]

Unnamed: 0,player2,club
1,eden hazard,chelsea
2,alexis sánchez,arsenal
5,santiago cazorla,arsenal
7,cesc fàbregas,chelsea


In [68]:
# Selecting forwards from Arsenal only
df[ (df['club'] == 'arsenal') & (df['position'] == 'forward')].loc[:, ['player2', 'position']]

Unnamed: 0,player2,position
2,alexis sánchez,forward


### 11 - GroupBy operations

- `agg, apply`

In [69]:
# For each position, find mean and sum of goals and assists

print (df
 .groupby('position')[['goals', 'assists']]
 .agg(['sum', 'mean'])
 .stack()
)

                   goals    assists
position                           
forward  sum   35.000000  10.000000
         mean  11.666667   3.333333
midfield sum   35.000000  26.000000
         mean   5.000000   3.714286
