## Подготовка

In [4]:
%load_ext memory_profiler

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


### Вводные данные:

* замеры производятся на файле размером 1,2 Гб
* замеряются библиотеки Pandas, Modin, Samwise

# Анализ потребляемой памяти

# Создание dataframe, обращение по индексу

### Pandas

In [5]:
%%file memory.py
import pandas as pd

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['column1']
    col2 = df['duration_ms']
    
test()

Overwriting memory.py


In [6]:
!python3 -m memory_profiler memory.py

Filename: memory.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     3   79.391 MiB   79.391 MiB           1   @profile
     4                                         def test():
     5 1017.125 MiB  937.734 MiB           1       df = pd.read_csv('newdata.csv')
     6 1017.250 MiB    0.125 MiB           1       col1 = df['popularity']
     7                                             col2 = df['duration_ms']


Traceback (most recent call last):
  File "/Users/ancharts/Library/Python/3.8/lib/python/site-packages/pandas/core/indexes/base.py", line 3361, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 76, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5206, in pand

### Modin

In [None]:
%%file memory.py
import modin.pandas as pd
from distributed import Client
client = Client()

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### Samwise

In [None]:
%%file memory.py
import samwise as pd

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    
test()

In [None]:
!python3 -m memory_profiler memory.py

## Корреляция методом Пирсона 

### Pandas

In [None]:
%%file memory.py
import pandas as pd

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.corr(col2, method = 'pearson')
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### Modin

In [None]:
%%file memory.py
import modin.pandas as pd

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.corr(col2, method = 'pearson')
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### Samwise 

In [None]:
%%file memory.py
import samwise as pd

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.corr(col2, method = 'pearson')
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### SciPy with Pandas

In [None]:
%%file memory.py
import pandas as pd
from scipy.stats import pearsonr
import numpy as np

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    pearsonr(col1, col2)
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### SciPy with Modin

In [None]:
%%file memory.py
import modin.pandas as pd
from scipy.stats import pearsonr
from distributed import Client
client = Client()

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    pearsonr(col1, col2)
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### SciPy with Samwise

In [None]:
%%file memory.py
import samwise as pd
from scipy.stats import pearsonr


@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    pearsonr(col1, col2)
    
    
test()

In [None]:
!python3 -m memory_profiler memory.py

## Корреляция методом Спирмана

### Pandas

In [None]:
%%file memory.py
import pandas as pd

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.corr(col2, method = 'spearman')
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### Samwise 

In [None]:
%%file memory.py
import samwise as pd

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.corr(col2, method = 'spearman')
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### SciPy with Pandas

In [None]:
%%file memory.py
import pandas as pd
from scipy.stats import spearmanr

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    spearmanr(col1, col2)
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### SciPy with Samwise

In [None]:
%%file memory.py
import samwise as pd
from scipy.stats import spearmanr

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    spearmanr(col1, col2)
    
    
test()

In [None]:
!python3 -m memory_profiler memory.py

## Ковариация

### NumPy with Pandas

In [None]:
%%file memory.py
import pandas as pd
from numpy import cov

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    cov(col1, col2)
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### NumPy with Samwise

In [None]:
%%file memory.py
import samwise as pd
from numpy import cov


@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    cov(col1, col2)
    
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### NumPy with Modin

In [None]:
%%file memory.py
import modin.pandas as pd
from numpy import cov


@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    cov(col1, col2)
    
    
test()

In [None]:
!python3 -m memory_profiler memory.py

## Минимум, максимум, среднее, сумма на столбце

### Pandas

In [None]:
%%file memory.py
import pandas as pd

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.min()
    col1.max()
    col1.mean()
    col1.sum()
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### Modin

In [None]:
%%file memory.py
import modin.pandas as pd

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.min()
    col1.max()
    col1.mean()
    col1.sum()
    
test()

In [None]:
!python3 -m memory_profiler memory.py

### Samwise

In [None]:
%%file memory.py
import samwise as pd

@profile
def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.min()
    col1.max()
    col1.mean()
    col1.sum()
    
    
test()

In [None]:
!python3 -m memory_profiler memory.py

## Линейная регрессия

### Pandas with NumPy

In [None]:
%%file memory.py
import pandas as pd
from sklearn.linear_model import LinearRegression

@profile
def test():
    df = pd.read_csv('newdata.csv')
    X = df[['popularity', 'duration_ms']]
    y = df['tempo']
    reg = LinearRegression().fit(X, y)
    

test()

In [None]:
!python3 -m memory_profiler memory.py

### Modin with NumPy

In [None]:
%%file memory.py
import modin.pandas as pd
from sklearn.linear_model import LinearRegression

@profile
def test():
    df = pd.read_csv('newdata.csv')
    X = df[['popularity', 'duration_ms']]
    y = df['tempo']
    reg = LinearRegression().fit(X, y)

test()

In [None]:
!python3 -m memory_profiler memory.py

### Samwise with NumPy

In [None]:
%%file memory.py
import samwise as pd
from sklearn.linear_model import LinearRegression

@profile
def test():
    df = pd.read_csv('newdata.csv')
    X = df[['popularity', 'duration_ms']]
    y = df['tempo']
    reg = LinearRegression().fit(X, y)

test()

In [None]:
!python3 -m memory_profiler memory.py

# Анализ времени работы

# Создание dataframe, обращение по индексу

### Pandas

In [None]:
import time
import pandas as pd

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    
start_time = time.time()
test()
pd_time = time.time() - start_time
print("%s seconds" % pd_time)

### Modin

In [None]:
import modin.pandas as pd
from distributed import Client
client = Client()

def test():
    df = pd.read_csv('newdata.csv')
    co1 = df['popularity']
    
start_time = time.time()
test()
md_time = time.time() - start_time
print("%s seconds" % md_time)

### Samwise

In [None]:
import samwise as pd

def test():
    df = pd.read_csv('newdata.csv')
    co1 = df['popularity']
    
start_time = time.time()
test()
sw_time = time.time() - start_time
print("%s seconds" % sw_time)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

x = [pd_time, md_time, sw_time]
z1 = ['pd', 'md', 'sw']

# bar()
fig = plt.figure()
plt.bar(z1, x)
plt.title('DataFrame and __getitem__ time')
plt.grid(True)
plt.show()

## Корреляция методом Пирсона 

### Pandas

In [None]:
import pandas as pd
import time

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.corr(col2, method = 'pearson')
    
start_time = time.time()
test()
pd_time = time.time() - start_time
print("%s seconds" % pd_time)

### Modin

In [None]:
import modin.pandas as pd
import time
from distributed import Client
client = Client()

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.corr(col2, method = 'pearson')
    
start_time = time.time()
test()
md_time = time.time() - start_time
print("%s seconds" % md_time)

### Samwise 

In [None]:
import samwise as pd

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.corr(col2, method = 'pearson')
    
start_time = time.time()
test()
sw_time = time.time() - start_time
print("%s seconds" % sw_time)

### SciPy with Pandas

In [None]:
import pandas as pd
from scipy.stats import pearsonr

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    pearsonr(col1, col2)
    
start_time = time.time()
test()
pd_scipy_time = time.time() - start_time
print("%s seconds" % pd_scipy_time)

### SciPy with Modin

In [None]:
import modin.pandas as pd
from scipy.stats import pearsonr

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    pearsonr(col1, col2)
    
start_time = time.time()
test()
md_scipy_time = time.time() - start_time
print("%s seconds" % md_scipy_time)

### SciPy with Samwise

In [None]:
import samwise as pd
from scipy.stats import pearsonr

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    pearsonr(col1, col2)
    
start_time = time.time()
test()
sw_scipy_time = time.time() - start_time
print("%s seconds" % sw_scipy_time)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

x = [pd_time, md_time, sw_time, pd_scipy_time, md_scipy_time, sw_scipy_time]
z1 = ['pd', 'md', 'sw', 'pd scipy', 'md scipy', 'sw scipy']

# bar()
fig = plt.figure(figsize=(8, 6), dpi=80)
plt.bar(z1, x)
plt.title('Pearson correlation time')
plt.grid(True)
plt.show()

## Корреляция методом Спирмана

### Pandas

In [None]:
import pandas as pd

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    col1.corr(col2, method = 'spearman')
    
start_time = time.time()
test()
pd_time = time.time() - start_time
print("%s seconds" % pd_time)

### Samwise 

In [None]:
import samwise as pd

def test():
    df = pd.read_csv('newdata.csv')
    df.spearman('popularity', 'duration_ms')
    
start_time = time.time()
test()
sw_time = time.time() - start_time
print("%s seconds" % sw_time)

### SciPy with Pandas

In [None]:
import pandas as pd
from scipy.stats import spearmanr

def test():   
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    spearmanr(col1, col2)
    
start_time = time.time()
test()
pd_scipy_time = time.time() - start_time
print("%s seconds" % pd_scipy_time)

### SciPy with Samwise

In [None]:
import samwise as pd
from scipy.stats import spearmanr

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    spearmanr(col1, col2)

start_time = time.time()
test()
sw_scipy_time = time.time() - start_time
print("%s seconds" % sw_scipy_time)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

x = [pd_time, sw_time, pd_scipy_time, sw_scipy_time]
z1 = ['pd', 'sw', 'pd scipy', 'sw scipy']

# bar()
fig = plt.figure(figsize=(8, 6), dpi=80)
plt.bar(z1, x)
plt.title('Spearman correlation time')
plt.grid(True)
plt.show()

## Ковариация

### NumPy with Pandas

In [None]:
import pandas as pd
from numpy import cov

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    cov(col1, col2)
    
start_time = time.time()
test()
pd_time = time.time() - start_time
print("%s seconds" % pd_time)

### NumPy with Modin

In [None]:
import modin.pandas as pd
from numpy import cov

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    cov(col1, col2)
    
start_time = time.time()
test()
md_time = time.time() - start_time
print("%s seconds" % md_time)

### NumPy with Samwise

In [None]:
import samwise as pd
from numpy import cov

def test():
    df = pd.read_csv('newdata.csv')
    col1 = df['popularity']
    col2 = df['duration_ms']
    cov(col1, col2) 
    
start_time = time.time()
test()
sw_time = time.time() - start_time
print("%s seconds" % sw_time)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

x = [pd_time, md_time, sw_time]
z1 = ['pd', 'md', 'sw']

# bar()
fig = plt.figure(figsize=(5, 6), dpi=80)
plt.bar(z1, x)
plt.title('DataFrame and __getitem__ time')
plt.grid(True)
plt.show()

## Минимум, максимум, среднее, сумма на столбце

### Pandas

In [None]:
import pandas as pd

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.min()
pd_min = time.time() - start_time

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.max()
pd_max = time.time() - start_time

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.sum()
pd_sum = time.time() - start_time

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.mean()
pd_mean = time.time() - start_time

### Modin

In [None]:
import modin.pandas as pd

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.min()
md_min = time.time() - start_time

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.max()
md_max = time.time() - start_time

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.sum()
md_sum = time.time() - start_time

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.mean()
md_mean = time.time() - start_time

### Samwise

In [None]:
import samwise as pd

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.min()
sw_min = time.time() - start_time

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.max()
sw_max = time.time() - start_time

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.sum()
sw_sum = time.time() - start_time

start_time = time.time()
df = pd.read_csv('newdata.csv')
col1 = df['popularity']
col1.mean()
sw_mean = time.time() - start_time

In [None]:
import matplotlib.pyplot as plt
import numpy as np

x = [pd_min, md_min, sw_min, pd_max, md_max, sw_max, pd_sum, md_sum, sw_sum, pd_mean, md_mean, sw_mean]
z1 = ['pd min', 'md min', 'sw min', 'pd max', 'md max', 'sw max', 'pd sum', 'md sum', 'sw sum', 'pd mean', 'md mean', 'sw mean']

print(pd_min, md_min, sw_min)

# bar()
fig = plt.figure(figsize=(12, 6), dpi=80)
plt.bar(z1, x)
plt.title('Min, max, sum, mean time')
plt.grid(True)
plt.show()

## Линейная регрессия

### Pandas with NumPy

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

def test():
    df = pd.read_csv('newdata.csv')
    X = df[['popularity', 'duration_ms']]
    y = df['tempo']
    reg = LinearRegression().fit(X, y)
    

start_time = time.time()
test()
pd_time = time.time() - start_time
print("%s seconds" % pd_time)

### Modin with NumPy

In [None]:
import modin.pandas as pd
from sklearn.linear_model import LinearRegression

def test():
    df = pd.read_csv('newdata.csv')
    X = df[['popularity', 'duration_ms']]
    y = df['tempo']
    reg = LinearRegression().fit(X, y)
    

start_time = time.time()
test()
md_time = time.time() - start_time
print("%s seconds" % md_time)

### Samwise with NumPy

In [None]:
import samwise as pd
from sklearn.linear_model import LinearRegression

def test():
    df = pd.read_csv('newdata.csv')
    X = df[['popularity', 'duration_ms']]
    y = df['tempo']
    reg = LinearRegression().fit(X, y)

start_time = time.time()
test()
sw_time = time.time() - start_time
print("%s seconds" % sw_time)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

x = [pd_time, md_time, sw_time]
z1 = ['pd', 'md', 'sw']

# bar()
fig = plt.figure()
plt.bar(z1, x)
plt.title('Linear regression time')
plt.grid(True)
plt.show()