In [1]:
import sys
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame({'a': [x for x in range(0,10000000)], 'b': [y for y in range(10000000,20000000)], 'c': [z for z in range(20000000,30000000)]})

In [3]:
df.head()

Unnamed: 0,a,b,c
0,0,10000000,20000000
1,1,10000001,20000001
2,2,10000002,20000002
3,3,10000003,20000003
4,4,10000004,20000004


## memory tests

In [8]:
sys.getsizeof(df) / 1000000

240.000144

In [9]:
np_df = df.to_numpy()
sys.getsizeof(np_df) / 1000000

0.000128

In [10]:
dict_df = df.to_dict('records')
sys.getsizeof(dict_df) / 1000000

89.09516

In [4]:
from datetime import datetime

# part1 - multiplication and division

## to_numpy - get column by index only

In [5]:
time_1 = datetime.now()
for idx, row in enumerate(df.to_numpy()):
    df.at[idx, 'f_new'] = (row[0] * row[1] / row[2])
time_2 = datetime.now()
print(f"Iterating with to_numpy took {time_2 - time_1} seconds")

Iterating with to_numpy took 0:00:57.187218 seconds


## to_numpy - use column index dictionary to get columns by their names

In [8]:
cols_index_dict = {col_name: df.columns.get_loc(col_name) for col_name in df.columns}
cols_index_dict

{'a': 0, 'b': 1, 'c': 2, 'f_new': 3}

In [9]:
df.head()

Unnamed: 0,a,b,c,f_new
0,0,10000000,20000000,0.0
1,1,10000001,20000001,0.5
2,2,10000002,20000002,1.0
3,3,10000003,20000003,1.5
4,4,10000004,20000004,2.0


In [10]:
time_1 = datetime.now()
for idx, row in enumerate(df.to_numpy()):
    df.at[idx, 'f_new_1'] = (row[cols_index_dict['a']] * row[cols_index_dict['b']] / row[cols_index_dict['c']])
time_2 = datetime.now()
print(f"Iterating with to_numpy took {time_2 - time_1} seconds")

Iterating with to_numpy took 0:00:58.324287 seconds


In [11]:
df.head()

Unnamed: 0,a,b,c,f_new,f_new_1
0,0,10000000,20000000,0.0,0.0
1,1,10000001,20000001,0.5,0.5
2,2,10000002,20000002,1.0,1.0
3,3,10000003,20000003,1.5,1.5
4,4,10000004,20000004,2.0,2.0


## itertuples - get column by index only

In [8]:
time_1 = datetime.now()
for row in df.itertuples(index=True):
    df.at[row[0], 'h_new'] = (row[1] * row[2] / row[3])
time_2 = datetime.now()
print(f"Iterating with itertuples took {time_2 - time_1} seconds")

Iterating with itertuples took 0:01:03.553333 seconds


## to_dict

In [5]:
time_1 = datetime.now()
for idx, row in enumerate(df.to_dict('records',)):
    df.at[idx, 'd_new'] = (row['a'] * row['b'] / row['c'])
time_2 = datetime.now()
print(f"Iterating with to_dict took {time_2 - time_1} seconds")

Iterating with to_dict took 0:01:15.927130 seconds


## to_records - get column by index only

In [6]:
time_1 = datetime.now()
for row in df.to_records():
    df.at[row[0], 'e_new'] = (row[1] * row[2] / row[3])
time_2 = datetime.now()
print(f"Iterating with to_records took {time_2 - time_1} seconds")

Iterating with to_records took 0:01:38.527920 seconds


## apply

In [22]:
time_1 = datetime.now()
df['g_new'] = df.apply(lambda row: row['a'] * row['b'] / row['c'], axis=1)
time_2 = datetime.now()
print(f"Iterating with apply took {time_2 - time_1} seconds")

Iterating with apply took 0:01:46.783344 seconds


## iterrows

In [9]:
# time_1 = datetime.now()
# for idx, row in df.iterrows():
#     df.at[idx, 'i_new'] = (row['a'] * row['b'] / row['c'])
# time_2 = datetime.now()
# print(f"Iterating with iterrows took {time_2 - time_1} seconds")

## Vectorized

In [10]:
time_1 = datetime.now()
df['j_new'] = df['a'] * df['b'] / df['c']
time_2 = datetime.now()
print(f"Vectorized operation took {time_2 - time_1} seconds")

Vectorized operation took 0:00:00.086232 seconds


# String operations

In [13]:
string_df = pd.DataFrame({'a': ['a' for x in range(0,10000000)], 'b': ['b' for y in range(0,10000000)], 'c': ['c' for z in range(0,10000000)]})

In [15]:
time_1 = datetime.now()
for idx, row in enumerate(string_df.to_numpy()):
    string_df.at[idx, 'f_new'] = (row[0] + row[1] + row[2])
time_2 = datetime.now()
print(f"Iterating with to_numpy took {time_2 - time_1} seconds")

Iterating with to_numpy took 0:00:48.804319 seconds


In [20]:
time_1 = datetime.now()
for row in string_df.itertuples(index=True):
    string_df.at[row[0], 'h_new'] = (row[1] + row[2] + row[3])
time_2 = datetime.now()
print(f"Iterating with itertuples took {time_2 - time_1} seconds")

Iterating with itertuples took 0:00:58.220289 seconds


In [16]:
time_1 = datetime.now()
for idx, row in enumerate(string_df.to_dict('records')):
    string_df.at[idx, 'd_new'] = (row['a'] + row['b'] + row['c'])
time_2 = datetime.now()
print(f"Iterating with to_dict took {time_2 - time_1} seconds")

Iterating with to_dict took 0:01:11.325483 seconds


In [18]:
time_1 = datetime.now()
for row in string_df.to_records():
    string_df.at[row[0], 'e_new'] = (row[1] + row[2] + row[3])
time_2 = datetime.now()
print(f"Iterating with to_records took {time_2 - time_1} seconds")

Iterating with to_records took 0:01:26.212519 seconds


In [19]:
time_1 = datetime.now()
string_df['g_new'] = string_df.apply(lambda row: row['a'] + row['b'] + row['c'], axis=1)
time_2 = datetime.now()
print(f"Iterating with apply took {time_2 - time_1} seconds")

Iterating with apply took 0:01:40.220461 seconds


# Groupby tests

In [17]:
import numpy as np
import pandas as pd
from datetime import datetime

In [28]:
arr = np.array(['a', 'b', 'c'] * 10000000)

In [29]:
len(arr)

30000000

In [39]:
group_df = pd.DataFrame({'str_col': arr, 'a': [x for x in range(30000000)], 'b': [0.5*x for x in range(30000000)],
                        'c': [2*x for x in range(30000000)], 'd': [1.5*x for x in range(30000000)],
                        'e': [3*x for x in range(30000000)], 'f': [0.8*x for x in range(30000000)],
                        'g': [0.5*x for x in range(30000000)], 'h': [2.1*x for x in range(30000000)]})

In [41]:
time_1 = datetime.now()
group_df.groupby('str_col')['b'].sum()
time_2 = datetime.now()
print(f"Grouping on one column took {time_2 - time_1} seconds")

Grouping on one column took 0:00:01.093989 seconds


In [40]:
time_1 = datetime.now()
group_df.groupby('str_col').sum()['b']
time_2 = datetime.now()
print(f"Grouping on all columns took {time_2 - time_1} seconds")

Grouping on all columns took 0:00:02.215790 seconds


# MultiProcessing tests

In [15]:
import psutil

In [16]:
psutil.cpu_count(logical=False)

6

In [17]:
psutil.cpu_count(logical=True)

12