In [None]:
pip install guppy3

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

### 1. Configuring settings and importing libraries

* When starting a Jupyter notebook, it's considered good practice to configure settings such as plot sizes and display formats at the beginning. 
  * This only makes sense if settings are used throughout the notebook, not to modify a single plot

* This practice helps maintain consistency throughout the notebook and ensures that all visualizations and outputs are standardized across notebooks.



In [None]:
example_df = pd.DataFrame({"A": np.arange(100), "B": np.linspace(0, 55, 100)})

example_df.head()


In [None]:
# Round column B to two decimal places
example_df = pd.DataFrame({"A": np.arange(100), "B": np.linspace(0, 55, 100)})
example_df['B'] = example_df['B'].round(2)
example_df.head()

In [None]:
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
example_df = pd.DataFrame({"A": np.arange(100), "B": np.linspace(0, 55, 100)})
example_df.head()

In [None]:
plt.plot(example_df['B'])

In [None]:
plt.figure(figsize=(14,4))
plt.plot(example_df['B'])

In [None]:
plt.rcParams

In [None]:
plt.rcParams["figure.figsize"] = (14,4)

In [None]:
plt.plot(example_df['B'])

In [None]:
pd.options.display.float_format = None

In [None]:
df = pd.DataFrame({
    'Large Numbers': [100000000000000000000000000.0]
})
df

In [None]:
pd.options.display.max_rows = 5

In [None]:
example_df

In [None]:
# You can also set the display format to suppress scientific notation for floating-point numbers
pd.options.display.float_format = '{:.2f}'.format

In [None]:
df


### 2. Avoiding Unintended References

* A view in pandas is a subset of the original object ( DataFrame or Series ) linked to the original source. 
 * When you create a view, you are not creating a new copy of the data, but rather a reference to the original data. 
 * Changes you make to the view will be reflected in the original data, and vice versa.

* When you work directly with views on the data, instead of simple column assignment, you may end up with a reference to the original data rather than a copy. 
* Also, many of you have experienced the `SettingWithCopyWarning`.
  * It is triggered when you assign to a DataFrame that might be either a view on, or a copy of, another DataFrame.
    * pandas cannot always tell which it is, so it warns that the change may not reach the original.

* If you want to ensure that your modifications do not affect the original DataFrame when working with slices or subsets, you should explicitly create a copy using the .copy() method.

In [None]:
example_df['C'] = example_df['A'] * 2
example_df

In [None]:
example_df.loc[0, 'A'] =  10
example_df

In [None]:
example_df['D'] = example_df['A']
example_df

In [None]:
example_df.loc[0, 'A'] =  20
example_df

In [None]:

slice_df = example_df.loc[0:3]  
slice_df


In [None]:
slice_df.iloc[0] = (0, 0, 0, 0)
slice_df

In [None]:
example_df

In [None]:
slice_df = example_df.loc[0:3].copy()  # .copy() makes an independent DataFrame, so edits to slice_df do not affect example_df
slice_df


In [None]:
slice_df.iloc[0] = (1, 1, 1, 1)
slice_df

In [None]:
example_df

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Boolean-mask selection: the result may be a copy of df, so assigning to it later can trigger SettingWithCopyWarning
example_2_df = df[df['A'] > 1]
example_2_df

In [None]:
example_2_df['B'] = [50, 60] 

In [None]:
example_2_df

In [None]:
df

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Same boolean-mask selection as before; the next cell updates df directly with .loc instead of assigning to this subset
example_2_df = df[df['A'] > 1]
example_2_df

In [None]:
df.loc[df['A'] > 1, 'B'] = [50, 60]
df

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

subset_df = df.loc[df['A'] > 1]
subset_df['C'] = subset_df['B'] + 10  

In [None]:
subset_df

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

df.loc[df['A'] > 1, 'C'] = df.loc[df['A'] > 1, 'B'] + 10

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df[df['A']> 1]['B']

In [None]:
x = df[df['A']> 1]['B']
x._is_view

In [None]:
x = example_df.loc[example_df['A']> 1, 'B']
x._is_view

##### example
```python
import pandas, numpy
num_states = 50
def PROCESSDATA(file_name, int):
    global df
    df = pd.read_csv(file_name)
    df = df.fillna(df.median())
    if int > 0: df = df.sample(int)
```


### Improving Readability and Maintainability by Adhering to PEP8
* The PEP8 is Python's official style guide,
  * Python Enhancement Proposal: a design document providing information to the Python community, or describing a new feature for Python or its processes or environment. 
  * PEPs are intended as the primary mechanisms for proposing major new features, collecting community input on an issue, and documenting the design decisions that have gone into Python.
 * PEP8:
   * Naming variables and importing packages
   * Avoiding compound statements
   * Avoiding builtin functions or keywords as variable names
   * Why to avoid global variables

### example 
* Avoid compound imports and non-standard aliases

```python
import pandas as ada, numpy as ny, matplotlib.pyplot as mtb
```
* One import per line, use standard aliases

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
```



### Important PEP8  Recommendations

* Avoid compound statements
```python
if x > 0: df1 = df.sample(10); df2 = df.sample(20); x +=1;
```    

```python
if income > 0:
    df = df.sample(nb)
```    
* Write variable names appropriately
  * Use snake_case for variables and functions, UPPER_CASE for constants, and lowercase for general variables.
* Functions should be in lowercase and use snake_case to improve readability.
* Classes should be CamelCase and start with an uppercase letter to distinguish them from functions and variables.



### Important PEP8  Recommendations - Cont'd

* Never use a builtin function or keyword as a variable name
  * I see it often with `str`, `list`, `id`, `sum`, `min`, `max`, `input` etc. 
  * Make sure you know what variables or function your package is importing so you don't overwrite them

* Avoid global variables
 * considered bad practice from a styling perspective 
   * Changes to global variables can be made from anywhere in the program, making it difficult to understand how and when their values are modified.
   *  Functions that rely on global variables are closely tied to those variables
     * Reducing modularity and making the code harder to maintain or reuse.
  * Testing functions that depend on global variables can be challenging 


### Important PEP8  Recommendations - Cont'd

* Use Spaces Appropriately:
  * Around arithmetic operators (+, -, *, /)
  * Comparison operators (==, !=, <, >, <=, >=)
  * Assignments (=) for better readability. 
   `x = 1 + 2`
   
* After commas
  `my_function(arg1, arg2, arg3)` not `my_function(arg1,arg2,arg3)`

* Separate functions and classes with two blank lines

* Single blank line to separate logical sections inside functions

* Avoid excessive blank lines

#### Use Functions and Modularize Your Code
* Define functions to encapsulate reusable code.
  
* Functions in modules help break down complex code into smaller, manageable parts making the notebook more organized and easier to read.


* Functions can be individually tested to ensure they perform as expected. 
  * This is much harder with code that is not broken down into modules or functions.
  
* Use defensive programming when your code will be used by others 

In [None]:

* In a separate python file (e.g., `modulename.py`)
```python

import random


def magic_addition(a, b):
    return a + b + random.randint(a, b)
```

* In your notebook 
```
import modulename as mn
mn.magic_addition(10, 20)
```


### Document and Test your Functions Appropriately

* Type hints can be simple classes like float or str or be more complex. 
  * The typing module provides a vocabulary of more advanced type hints.
  * Use pydantic to create more advanced type hints and handle validation
* Increasingly critical to document complex functions
  * Makes it easy to know what the function expects as input, returns and what it does.
  * Makes it easier for generative AI tools to interpret your code and match natural-language requests to it
  

In [None]:
### Specifying Parameter Data Types in a Function
import random
def magic_addition(a: int, b: int) -> int:
    """Return ``a + b`` plus a random integer drawn with ``random.randint(a, b)``.

    The annotations are hints only: nothing in this function checks the
    types of ``a`` and ``b`` at runtime.
    """
    base_sum = a + b
    return base_sum + random.randint(a, b)

In [None]:
#magic_addition()

In [None]:
magic_addition(1, 10.0)

In [None]:
magic_addition("one", "test")

In [None]:
import random
def magic_addition(a: int, b: int) -> int:
    """Return ``a + b`` plus a random integer drawn with ``random.randint(a, b)``.

    Parameters:
    a (int): Lower bound of the random draw and first addend.
    b (int): Upper bound of the random draw and second addend.

    Returns:
    int: ``a + b + r`` where ``r`` comes from ``random.randint(a, b)``.

    Raises:
    TypeError: If ``a`` or ``b`` is not an integer (``bool`` is rejected too).
    """
    for value in (a, b):
        # isinstance() is the PEP 8 way to check types and also accepts int
        # subclasses; bool is excluded explicitly because it subclasses int.
        if isinstance(value, bool) or not isinstance(value, int):
            # !r keeps 1 and "1" distinguishable in the message.
            raise TypeError(
                f"the inputs a and b must be ints; you provided a: {a!r} and b: {b!r}"
            )
    return a + b + random.randint(a, b)


In [None]:
magic_addition(1, "Test")

* Using more complex type hints

```python
type Vector = list[float]

def scale(scalar: float, vector: Vector) -> Vector:
    return [scalar * num for num in vector]
```

```python
from collections.abc import Sequence

type ConnectionOptions = dict[str, str]
type Address = tuple[str, int]
type Server = tuple[Address, ConnectionOptions]

def broadcast_message(message: str, servers: Sequence[Server]) -> None:
    pass
```

In [None]:
from pydantic import BaseModel


class Foo(BaseModel):
    """Example pydantic model; used below as the parameter type of ``some_function``."""
    a: str
    b: int

class Bar(BaseModel):
    """Example pydantic model; used below as the declared return type of ``some_function``."""
    c: str
    d: int

def some_function(x: Foo) -> Bar:
    """Print the received ``Foo`` and return a ``Bar`` built from its fields.

    Parameters:
    x (Foo): The model instance to report on.

    Returns:
    Bar: ``c`` is taken from ``x.a`` and ``d`` from ``x.b``.
    """
    print(f"passed {x}")
    # The signature promises a Bar, so build one rather than returning None.
    # NOTE(review): the a -> c, b -> d mapping is inferred from the matching
    # field types (str, int) — confirm it is the intended conversion.
    return Bar(c=x.a, d=x.b)
    

f =Foo(a="A", b=1)
some_function(f)


In [None]:
f = Foo(a=1, b=1)


In [None]:
f.model_dump()

In [None]:
f.model_dump_json()

### Documenting your Modules and Functions

```python
"""
This module provides utility functions for mathematical operations.

The functions include simple arithmetic operations like addition and multiplication.

Dependencies:
math (standard library)

Usage:
from my_module import add
result = add(5, 3)
"""

def add(a, b):
    """
    Add two numbers and return the result.

    Parameters:
    a (int or float): The first number.
    b (int or float): The second number.

    Returns:
    int or float: The sum of the two numbers.
    """
    return a + b
```

### Working with Classes

* Benefits of object oriented programming are similar to those listed for functions and modules above.
  * Create modular, reusable code.
  * Reduce redundancy and increases efficiency.
  * Improve maintainability


In [None]:
from typing import List
import numpy as np
class Data:
    """Hold a sequence of numbers as a NumPy array.

    Parameters:
    data (list or numpy.ndarray): Values to store. Anything that is not
        already an ``ndarray`` is converted with ``np.array``; an ``ndarray``
        is stored as-is (no copy is made).
    """

    def __init__(self, data: List):
        # np.array is a factory function, not a class, so comparing
        # type(data) against it can never match; np.ndarray is the array type.
        if not isinstance(data, np.ndarray):
            data = np.array(data)
        self.data = data

    def start_at_zero(self):
        """Return the data shifted so that its first element becomes zero.

        Returns:
        numpy.ndarray: ``self.data - self.data[0]``.

        Raises:
        IndexError: If the stored array is empty.
        """
        return self.data - self.data[0]
        
        

In [None]:
d = Data([3, 7, 2, 3])
d.data

In [None]:
d.start_at_zero()

In [None]:
class DataCleaner:
    """Apply simple cleaning steps to a DataFrame.

    The cleaner keeps a reference to the DataFrame it is given (no copy is
    made), so the cleaning steps also modify the caller's frame.

    Parameters:
    df (pandas.DataFrame): The frame to clean.
    """

    def __init__(self, df):
        self.df = df

    def _remove_missing_values(self):
        """Remove rows from the DataFrame that contain missing values."""
        self.df.dropna(inplace=True)
        return self.df

    def _replace_values(self, column, old_value, new_value):
        """Replace specific values in a DataFrame column."""
        self.df[column] = self.df[column].replace(old_value, new_value)
        return self.df

    def remove_missing_values(self):
        """Public entry point: drop rows with missing values and return the frame."""
        return self._remove_missing_values()

    def replace_values(self, column, old_value, new_value):
        """Public entry point: replace ``old_value`` with ``new_value`` in ``column``.

        Returns the updated DataFrame.
        """
        return self._replace_values(column, old_value, new_value)

    def to_custom_json(self):
        """Return the frame as a dict of column name -> list of values.

        The row labels are added under the extra key ``"index"``; a column
        that is itself named ``"index"`` would be overwritten by it.
        """
        # Read from self.df, not from a notebook-level global, so the result
        # always describes the frame this cleaner owns.
        json_repr = {col: self.df[col].tolist() for col in self.df.columns}
        json_repr["index"] = self.df.index.tolist()

        return json_repr


In [None]:
data = pd.DataFrame({
    'Age': [25, 30, 35, 40, None],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Male'],
    'Income': [50000, 60000, 75000, 65000, 62000]
})

{x: data[x].tolist() for x in data.columns}

In [None]:

# Instantiate the DataCleaner with the DataFrame
df_cleaner = DataCleaner(data)


In [None]:
df_cleaner.to_custom_json()

In [None]:
df_cleaner.remove_missing_values()

In [None]:
df_cleaner.replace_values('Gender', 'Male', 'M')

In [None]:
class IncomeGenderDataCleaner(DataCleaner):
    """DataCleaner with a ready-made cleaning recipe for frames that have a 'Gender' column."""

    def clean_data(self):
        """Abbreviate the gender labels, then drop rows with missing values.

        Returns the cleaned DataFrame, which is also stored on ``self.df``.
        """
        # DataCleaner defines these helpers with a leading underscore, so
        # call them by that name.
        self.df = self._replace_values('Gender', 'Male', 'M')
        self.df = self._replace_values('Gender', 'Female', 'F')
        self.df = self._remove_missing_values()
        return self.df
        
        
        


In [None]:
income_clearner = IncomeGenderDataCleaner(data)
income_clearner.df

In [None]:
class IncomeGenderDataCleanerDefault(IncomeGenderDataCleaner):
    """IncomeGenderDataCleaner that cleans its DataFrame as soon as it is constructed."""

    def __init__(self, df):
        super().__init__(df)
        # Run this class's own recipe so construction always leaves self.df cleaned.
        self._clean_data()

    def _clean_data(self):
        """Abbreviate the gender labels, then drop rows with missing values.

        Returns the cleaned DataFrame, which is also stored on ``self.df``.
        """
        # DataCleaner defines these helpers with a leading underscore, so
        # call them by that name.
        self.df = self._replace_values('Gender', 'Male', 'M')
        self.df = self._replace_values('Gender', 'Female', 'F')
        self.df = self._remove_missing_values()
        return self.df
        
        
        


In [None]:
custom_income_clearner = IncomeGenderDataCleanerDefault(data)
custom_income_clearner.df

# Memory Profiling and Management

* Profiling is important for identifying memory consumption by different parts of your code.
 * Memory profiling helps optimize resource usage and improve performance.
 
 * Various packages available for detailed memory usage analysis.
   * Perform tasks such as line-by-line profiling or aggregate data type memory usage.
   * `guppy` is a popular package for profiling

* Profiling is useful for 
  * Quick identification of potential memory overuse.
  * Helps in optimizing data handling and storage efficiency.


In [1]:
from guppy import hpy 
data_type_summary = hpy()
data_type_summary.heap()

Partition of a set of 317194 objects. Total size = 38867667 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0  98793  31 10334874  27  10334874  27 str
     1  73612  23  5269480  14  15604354  40 tuple
     2  21065   7  3742929  10  19347283  50 types.CodeType
     3  40813  13  3131247   8  22478530  58 bytes
     4   8485   3  2947376   8  25425906  65 dict (no owner)
     5   2795   1  2791824   7  28217730  73 type
     6  19046   6  2590256   7  30807986  79 function
     7   2795   1  1358392   3  32166378  83 dict of type
     8    978   0  1345728   3  33512106  86 dict of module
     9    837   0   558904   1  34071010  88 set
<919 more rows. Type e.g. '_.more' to view.>

In [2]:
34068387/1024/1024

32.490145683288574

In [3]:
import pandas as pd
data_type_summary.heap()

Partition of a set of 464920 objects. Total size = 60982931 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0 137751  30 18254544  30  18254544  30 str
     1 108952  23  7923720  13  26178264  43 tuple
     2  31372   7  5575376   9  31753640  52 types.CodeType
     3  15958   3  5033832   8  36787472  60 dict (no owner)
     4  59612  13  4694850   8  41482322  68 bytes
     5  28617   6  3891912   6  45374234  74 function
     6   3856   1  3625584   6  48999818  80 type
     7   1421   0  2201928   4  51201746  84 dict of module
     8   3411   1  1736040   3  52937786  87 dict of type
     9   1088   0   781824   1  53719610  88 set
<1207 more rows. Type e.g. '_.more' to view.>

In [4]:
# Convert the byte count to MiB (1 MiB = 1024 * 1024 bytes)
53722744/1024/1024

51.23400115966797

In [6]:
col_names = chars = [chr(i) for i in range(ord('A'), ord('M') + 1)]
col_names

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']

In [7]:
data_type_summary.heap()

Partition of a set of 510858 objects. Total size = 64884092 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0 144915  28 19102840  29  19102840  29 str
     1 109232  21  7968672  12  27071512  42 tuple
     2  31371   6  5579096   9  32650608  50 types.CodeType
     3  15969   3  5047928   8  37698536  58 dict (no owner)
     4  59619  12  4695871   7  42394407  65 bytes
     5  28474   6  3872464   6  46266871  71 function
     6   3856   1  3627984   6  49894855  77 type
     7   1422   0  2202568   3  52097423  80 dict of module
     8   3411   1  1736040   3  53833463  83 dict of type
     9  10631   2  1060448   2  54893911  85 list
<1340 more rows. Type e.g. '_.more' to view.>

In [8]:
data = pd.DataFrame({x:range(1000000) for x in col_names})

In [9]:
data_type_summary.heap()

Partition of a set of 510880 objects. Total size = 272889682 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0     78   0 104009340  38 104009340  38 numpy.ndarray
     1      1   0 104000144  38 208009484  76 pandas.core.frame.DataFrame
     2 144922  28 19104213   7 227113697  83 str
     3 109231  21  7968608   3 235082305  86 tuple
     4  31371   6  5579305   2 240661610  88 types.CodeType
     5  15973   3  5049352   2 245710962  90 dict (no owner)
     6  59621  12  4696065   2 250407027  92 bytes
     7  28471   6  3872056   1 254279083  93 function
     8   3856   1  3628648   1 257907731  95 type
     9   1422   0  2202568   1 260110299  95 dict of module
<1352 more rows. Type e.g. '_.more' to view.>

In [20]:
104000144/1024/1024

99.18226623535156

In [10]:
_.more

 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
    10   3411   1  1736040   1 261846339  96 dict of type
    11  10633   2  1060864   0 262907203  96 list
    12   1189   0   807224   0 263714427  97 set
    13  25799   5   744152   0 264458579  97 int
    14   5381   1   387432   0 264846011  97 builtins.weakref
    15    365   0   383056   0 265229067  97 abc.ABCMeta
    16   1964   0   362368   0 265591435  97 dict of function
    17   2440   0   351360   0 265942795  97 dict of ast.Name
    18   3837   1   245568   0 266188363  98 types.GetSetDescriptorType
    19    365   0   221736   0 266410099  98 dict of abc.ABCMeta
<1342 more rows. Type e.g. '_.more' to view.>

In [16]:
data.dtypes

A    int64
B    int64
C    int64
D    int64
E    int64
F    int64
G    int64
H    int64
I    int64
J    int64
K    int64
L    int64
M    int64
dtype: object

In [33]:
len(col_names)

13

In [17]:
data.shape

(1000000, 13)

In [32]:
data.shape[0] * data.shape[1] * 8 

104000000

In [31]:
data.index.dtype

dtype('int64')

In [25]:
detailed_memory_usage = data.memory_usage(deep=True, )
detailed_memory_usage

Index        128
A        8000000
B        8000000
C        8000000
D        8000000
E        8000000
F        8000000
G        8000000
H        8000000
I        8000000
J        8000000
K        8000000
L        8000000
M        8000000
dtype: int64

In [38]:
8000000/1024/1024 * len(col_names)

99.18212890625

In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   A       1000000 non-null  int64
 1   B       1000000 non-null  int64
 2   C       1000000 non-null  int64
 3   D       1000000 non-null  int64
 4   E       1000000 non-null  int64
 5   F       1000000 non-null  int64
 6   G       1000000 non-null  int64
 7   H       1000000 non-null  int64
 8   I       1000000 non-null  int64
 9   J       1000000 non-null  int64
 10  K       1000000 non-null  int64
 11  L       1000000 non-null  int64
 12  M       1000000 non-null  int64
dtypes: int64(13)
memory usage: 99.2 MB


### Using  The Correct DataType
```
    int8: -128 to 127
    int16: -32,768 to 32,767
    int32: -2,147,483,648 to 2,147,483,647
    int64: -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
```

In [47]:
data_int_32 = pd.DataFrame({x:range(1000000) for x in col_names}, dtype='int32')
data_int_32.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   A       1000000 non-null  int32
 1   B       1000000 non-null  int32
 2   C       1000000 non-null  int32
 3   D       1000000 non-null  int32
 4   E       1000000 non-null  int32
 5   F       1000000 non-null  int32
 6   G       1000000 non-null  int32
 7   H       1000000 non-null  int32
 8   I       1000000 non-null  int32
 9   J       1000000 non-null  int32
 10  K       1000000 non-null  int32
 11  L       1000000 non-null  int32
 12  M       1000000 non-null  int32
dtypes: int32(13)
memory usage: 49.6 MB


In [55]:
import random
data_int_8 = pd.DataFrame({x:[random.randint(0,10) for i in range(1000000)] for x in col_names}, dtype='int8')
data_int_8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   A       1000000 non-null  int8 
 1   B       1000000 non-null  int8 
 2   C       1000000 non-null  int8 
 3   D       1000000 non-null  int8 
 4   E       1000000 non-null  int8 
 5   F       1000000 non-null  int8 
 6   G       1000000 non-null  int8 
 7   H       1000000 non-null  int8 
 8   I       1000000 non-null  int8 
 9   J       1000000 non-null  int8 
 10  K       1000000 non-null  int8 
 11  L       1000000 non-null  int8 
 12  M       1000000 non-null  int8 
dtypes: int8(13)
memory usage: 12.4 MB


### Using Progress Bars

* Feedback on long operations is considered a good practice.
  * When running scripts that take time, having a progress bar improves the user experience 
  * Reassurance that process is ongoing and providing a sense of how much longer it will take.
  
* Very easy to add progress bars across scenarios

In [63]:
from tqdm import tqdm
import time
for i in tqdm(range(100), desc="Doing my magic"):
    time.sleep(0.1)
for i in tqdm(range(100), desc="Doing something else"):
    time.sleep(0.1)    

Doing my magic: 100%|██████████████████████████████████████████████████████████████████████| 100/100 [00:10<00:00,  9.30it/s]
Doing something else: 100%|████████████████████████████████████████████████████████████████| 100/100 [00:10<00:00,  9.33it/s]


In [57]:

# Call tqdm.pandas() before using .progress_apply below
# (presumably this is what makes the method available on pandas objects — confirm in the tqdm docs).
tqdm.pandas()

# NOTE(review): illustrative only — `raw_column` and `some_processing_function` are not defined
# anywhere in the visible notebook, so this line needs a real column and function before it can run.
df['processed_column'] = df['raw_column'].progress_apply(some_processing_function)