In [None]:
import time, sys

def countdown(t):
    """Show a mm:ss countdown on a single console line, then announce the end.

    Parameters
    ----------
    t : int
        Number of seconds to count down from. Falsy values (0, None) and
        negative values finish immediately; other values are converted with
        ``int()``, so fractions are truncated.

    Returns
    -------
    None
        The function blocks for one ``time.sleep(1)`` per remaining second.
    """
    # Clamp to zero: the earlier ``while t`` loop never terminated for
    # negative input because decrementing moved the counter away from zero.
    remaining = max(int(t), 0) if t else 0
    for seconds_left in range(remaining, 0, -1):
        mins, secs = divmod(seconds_left, 60)
        # '\r' moves back to the start of the line so each tick overwrites
        # the previous one instead of scrolling.
        sys.stdout.write('\r' + '{:02d}:{:02d}'.format(mins, secs))
        sys.stdout.flush()
        time.sleep(1)
    print("Time's Up! \n")

In [None]:
import pandas as pd
from pandas import Series, DataFrame
# Use fully-qualified option names: short patterns such as 'max_rows' are
# resolved by pattern matching and fail once more than one option matches.
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 10)
pd.set_option('display.notebook_repr_html', True)

import numpy as np
%pylab inline

In [None]:
# print() as a function call works on both Python 2 and Python 3 and matches
# the other print(...) calls in this notebook.
print(pd.__version__)

### 1 - Import Data with `read_csv()`

In [None]:
pd.read_csv?

In [None]:
# Import the Data
df = pd.read_csv('https://raw.githubusercontent.com/rasbt/python_reference/master/Data/some_soccer_data.csv')
df.head()

In [None]:
type(df)

#### Check the dimensions of the imported data with `.shape`

In [None]:
df.shape

**Check the data types for each column**

In [None]:
df.dtypes

In [None]:
df.columns.to_series().groupby(df.dtypes).groups

---
### 2 - Edit column names with `.rename()`

In [None]:
# Normalise every column label to lowercase, then show the resulting labels.
df.columns = df.columns.str.lower()
df.columns.values

In [None]:
# Map the abbreviated column labels to descriptive names; labels that are not
# listed here are left unchanged by .rename().
descriptive_names = {
    'gp': 'gamesPlayed',
    'g': 'goals',
    'a': 'assists',
    'sot': 'shotsOnTarget',
    'ppg': 'pointsPerGame',
    'p': 'points',
}
df = df.rename(columns=descriptive_names)
df.columns

---
### 3 - Change values in a column with `.map()` / `.apply()`

In [None]:
df.salary.head()

In [None]:
df.salary.map(lambda x: x.strip('$m')).head()

In [None]:
# Strip leading/trailing '$' and 'm' characters from each salary string and
# store the column as floats. The vectorized `.str.strip` passes missing
# values through as NaN, whereas a plain `x.strip(...)` lambda raises
# AttributeError on them.
df['salary'] = (df['salary']
                .str.strip('$m')
                .astype(float))

In [None]:
df

> **Note:** calling `.apply()` with a function on a single column (a Series) has the same effect as using `.map()`.

---
### 4 - Adding Columns to a DF

In [None]:
df.player[1]

In [None]:
df.player[1].split('\n')

In [None]:
# Derive new columns from the 'player' column

In [None]:
def processPlayerCol(text):
    """
    Split one raw 'player' entry into name, position and club.

    The entry is parsed as ``<name>``, a newline, then ``<position>`` and
    ``<club>`` separated by an em dash.

    Parameters
    ----------
    text : str
        Raw value taken from the 'player' column.

    Returns
    -------
    Series
        Indexed by 'player', 'position' and 'club'. Position and club have
        surrounding whitespace stripped; the name is returned as-is.

    Raises
    ------
    ValueError
        If the entry contains no newline or no em dash.
    """
    # maxsplit=1 keeps the two-value unpacking valid when the remainder holds
    # another newline or the club name itself contains an em dash; the extra
    # separator simply stays in the last field.
    name, rest = text.split('\n', 1)
    position, club = [part.strip() for part in rest.split('—', 1)]
    return Series([name, position, club], index=['player', 'position', 'club'])

In [None]:
processPlayerCol(df.player[9])

In [None]:
# Parse every 'player' entry into its own row of name / position / club, then
# join those columns onto df. The raw column collides with the parsed 'player'
# column, so it receives the '_l' suffix and is dropped from the result.
parsed_players = DataFrame(df['player'].map(processPlayerCol).tolist())
df.join(parsed_players, lsuffix='_l').drop('player_l', axis=1)

### Time Comparison for Type Conversion (single values, series) functions

In [None]:
int('12345')

In [None]:
float('12345')

In [None]:
%timeit Series(['123', '456', '789']).astype(int)

In [None]:
%timeit Series(['123', '456', '789']).map(int)

In [None]:
[int(x) for x in ['123', '456', '789']]

In [None]:
%timeit Series([int(x) for x in ['123', '456', '789']])

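> For a tiny Series like this three-element example, using a list comprehension is FASTER than `.astype()` or `.map()` for element-wise transformations. Compare the `%timeit` results above, and re-time on realistic data sizes before generalizing.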

---
### 5 - Dropping columns from a DF with `.drop(labels, axis)`

In [None]:
df

In [None]:
# Pull the three parts out of each raw 'player' entry:
#   player2  -> text before the first newline
#   club     -> text after the first em dash
#   position -> text between the newline and the em dash
raw_player = df['player']
df['player2'] = raw_player.map(lambda entry: entry.split('\n')[0])
df['club'] = raw_player.map(lambda entry: entry.split('—')[1])
df['position'] = raw_player.map(lambda entry: entry.split('—')[0].split('\n')[1])

In [None]:
df[:3]

In [None]:
df.drop('player', axis=1, inplace=True)

In [None]:
# df = df.drop(labels='player', axis=1)
df.head()    

In [None]:
df.rename(columns={'player2':'player'}, inplace=True)

In [None]:
df.head()

---
### 6 - Applying a function to multiple columns with `applymap()`

In [None]:
# Change the created columns to lowercase
cols = ['player', 'position', 'club']
df[cols].head()

In [None]:
df[cols] = df[cols].applymap(lambda x: x.lower())
df[cols].head()

In [None]:
num_cols = df.describe().columns.tolist()

In [None]:
# Three ways to cast the numeric columns to float. print() is called as a
# function so the cell runs on both Python 2 and Python 3.
print(df[num_cols].astype(float))
print(df[num_cols].applymap(float))
print(df[num_cols].applymap(lambda x: float(x)))

---
### 7 - Deal with missing data using `isnull()`, `dropna()` and `fillna()`

#### Remove rows with missing data using `.dropna()`

In [None]:
df

In [None]:
df.dropna()

#### Count the rows with missing data

In [None]:
print('%d rows have missing values' % (df.shape[0] - df.dropna().shape[0]))

#### Select rows with missing/non-missing data in a specific column 

In [None]:
# On which players do we not have data for assists?
df[df.assists.isnull()]
# also see -- .notnull()

In [None]:
df[df.assists.notnull()]

---
### Impute missing data `.fillna()`

In [None]:
# Impute with Median (truncated to an int by int()).
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['assists']: that column selection can be a temporary copy, in which case
# the in-place fill never reaches df.
df['assists'] = df['assists'].fillna(int(df['assists'].median()))

# Impute with Mean (truncated to an int by int()).
df['gamesPlayed'] = df['gamesPlayed'].fillna(int(df['gamesPlayed'].mean()))

df[['player', 'assists', 'gamesPlayed']]

---
### 8 - Sorting using `.sort_values()` or `.sort_index()`

In [None]:
df.columns.values

In [None]:
# Find the top 3 goalscorers
df.sort_values(by=['goals'], ascending=False, inplace=True)
df[['player', 'goals']].head(3)

In [None]:
df

In [None]:
# Drop old index, create new.
# drop=True discards the old index outright, so no temporary 'index' column
# is created and then removed (which would fail if the index had a name).
df.reset_index(drop=True)

In [None]:
# Discard the old index (drop=True avoids a temporary 'index' column), then
# use the 'player' column as the new index.
(df
 .reset_index(drop=True)
 .set_index('player'))

### 9 - Modifying the index `set_index()` and `reindex()`

In [None]:
df.set_index('player', inplace=True)

In [None]:
df[:3]

In [None]:
df.reset_index(inplace=True)

In [None]:
df[:3]

---
### 10 - Subsetting data using Booleans

In [None]:
# Remove surrounding whitespace from the club and position labels so the
# equality / membership filters below match exactly.
for label_col in ('club', 'position'):
    df[label_col] = df[label_col].str.strip()

In [None]:
%timeit df[(df['club'] == 'arsenal') | (df['club'] == 'chelsea') ]

In [None]:
# Alternate way of selecting only Arsenal and Chelsea players
%timeit df[df.club.isin(['arsenal', 'chelsea'])]

In [None]:
%timeit df[[x in ['arsenal', 'chelsea'] for x in df.club]]

In [None]:
%timeit df.query("club in ('arsenal', 'chelsea')")

In [None]:
# Selecting forwards from Manchester City only: both conditions must hold,
# so the two boolean masks are combined with `&`.
df[ (df['club'] == 'manchester city') & (df['position'] == 'forward') ]

### 11 - GroupBy operations

- `agg, apply`

In [None]:
# For each position, compute the sum and mean of goals and assists.
# .stack() then moves the statistic level of the columns into the row index,
# giving one row per (position, statistic) pair.
position_stats = df.groupby('position')[['goals', 'assists']].agg(['sum', 'mean'])
print(position_stats.stack())