# Chapter 3

In [10]:
import numpy as np
import pandas as pd
from urllib.request import urlretrieve
from io import StringIO
import time

## .iterrows()
Don't do this! Think vectorized!

In [14]:
# Recent pandas releases let read_csv take a URL as well as a local file path,
# which is handy: the separate urlretrieve download step can be skipped.
poker_hands_url = 'http://assets.datacamp.com/production/repositories/3832/datasets/c715cfae17d00d26693da8e612cb5fbd64e69589/poker_hand.csv'
poker_hands = pd.read_csv(poker_hands_url)

In [16]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage
poker_hands.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25010 entries, 0 to 25009
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   S1      25010 non-null  int64
 1   R1      25010 non-null  int64
 2   S2      25010 non-null  int64
 3   R2      25010 non-null  int64
 4   S3      25010 non-null  int64
 5   R3      25010 non-null  int64
 6   S4      25010 non-null  int64
 7   R4      25010 non-null  int64
 8   S5      25010 non-null  int64
 9   R5      25010 non-null  int64
 10  Class   25010 non-null  int64
dtypes: int64(11)
memory usage: 2.1 MB


In [17]:
# iterrows() gives a lazy iterator of (index, row-as-Series) pairs
generator = poker_hands.iterrows()

# Pull the first two rows off the iterator, one next() call per row
first_element, second_element = next(generator), next(generator)
print(first_element, second_element)

(0, S1        1
R1       10
S2        1
R2       11
S3        1
R3       13
S4        1
R4       12
S5        1
R5        1
Class     9
Name: 0, dtype: int64) (1, S1        2
R1       11
S2        2
R2       13
S3        2
R3       10
S4        2
R4       12
S5        2
R5        1
Class     9
Name: 1, dtype: int64)


In [22]:
# Rank columns of the five cards. Selecting them by label avoids the
# deprecated positional fallback that integer keys (values[1], values[3], ...)
# trigger on a Series whose index holds string labels.
rank_columns = ['R1', 'R2', 'R3', 'R4', 'R5']

data_generator = poker_hands.iterrows()

for index, values in data_generator:
    # Only rows with an odd index label are processed
    if index % 2 != 0:
        # Sum the ranks of all five cards in this hand.
        # NOTE: hand_sum is overwritten on every pass; the loop only
        # demonstrates the cost of row-by-row iteration.
        hand_sum = values[rank_columns].sum()

## Using .apply()
Much better... although not when applied row-wise! Column-wise is the path to speed.

In [25]:
# Define the transformation
def get_square(x):
    """Return x raised to the power of two."""
    return x**2

# Apply the transformation (no axis given, so apply uses its default)
data_sum = poker_hands.apply(get_square)
print(data_sum.head())

   S1   R1  S2   R2  S3   R3  S4   R4  S5   R5  Class
0   1  100   1  121   1  169   1  144   1    1     81
1   4  121   4  169   4  100   4  144   4    1     81
2   9  144   9  121   9  169   9  100   9    1     81
3  16  100  16  121  16    1  16  169  16  144     81
4  16    1  16  169  16  144  16  121  16  100     81


In [35]:
# Define the transformation
def get_variance(x):
    """Return np.var(x) for the row or column handed over by apply."""
    return np.var(x)

# Keep only the five rank columns, then apply row by row (axis=1)
rank_frame = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']]
data_tr = rank_frame.apply(get_variance, axis=1)
print(data_tr.head())

0    18.64
1    18.64
2    18.64
3    18.64
4    18.64
5     2.00
6     2.00
7     2.00
8     2.00
9     2.00
dtype: float64


In [36]:
# Define the transformation
def get_variance(x):
    """Return np.var(x) for the row or column handed over by apply."""
    return np.var(x)

# Keep only the five rank columns, then apply with the default axis,
# which yields one variance per rank column
rank_frame = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']]
data_tr = rank_frame.apply(get_variance)
print(data_tr.head())

R1    14.060473
R2    14.189523
R3    14.024270
R4    14.040552
R5    13.998851
dtype: float64


In [37]:
# Calculate the mean rank in each hand (axis=1: one value per row).
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short durations like these.
row_start_time = time.perf_counter()
mean_r = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].mean(axis=1)
print("Time using pandas vectorization for rows: {} sec".format(time.perf_counter() - row_start_time))
print(mean_r.head())

# Calculate the mean rank of each of the 5 cards across all hands
# (axis=0: one value per column)
col_start_time = time.perf_counter()
mean_c = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].mean(axis=0)
print("Time using pandas vectorization for columns: {} sec".format(time.perf_counter() - col_start_time))
print(mean_c.head())

Time using pandas vectorization for rows: 0.001825571060180664 sec
0    9.4
1    9.4
2    9.4
3    9.4
4    9.4
dtype: float64
Time using pandas vectorization for columns: 0.0016279220581054688 sec
R1    6.995242
R2    7.014194
R3    7.014154
R4    6.942463
R5    6.962735
dtype: float64


In [39]:
# Calculate the variance in each hand with pandas (axis=1: one value per row).
# DataFrame.var defaults to ddof=1 (sample variance), unlike np.var, whose
# default is ddof=0.
# time.perf_counter() replaces time.time(): it is the high-resolution clock
# meant for timing short operations.
start_time = time.perf_counter()
poker_var = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].var(axis=1)
print("Time using pandas vectorization: {} sec".format(time.perf_counter() - start_time))
print(poker_var.head())

Time using pandas vectorization: 0.0022211074829101562 sec
0    23.3
1    23.3
2    23.3
3    23.3
4    23.3
dtype: float64


In [40]:
# Calculate the variance in each hand with NumPy on the underlying array.
# ddof=1 is passed explicitly because ndarray.var defaults to ddof=0.
# to_numpy() is the recommended accessor for the underlying array (over
# .values), and time.perf_counter() is the high-resolution clock meant for
# timing short operations (over time.time()).
start_time = time.perf_counter()
poker_var = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']].to_numpy().var(axis=1, ddof=1)
print("Time using NumPy vectorization: {} sec".format(time.perf_counter() - start_time))
print(poker_var[0:5])

Time using NumPy vectorization: 0.002164125442504883 sec
[23.3 23.3 23.3 23.3 23.3]
