In [None]:
# pip install guppy3

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

### 1. Configuring setting and importing libraries

* When starting a Jupyter notebook, it's considered good practice to configure settings such as plot sizes and display formats at the beginning. 
  * This only makes sense if settings are used throughout the notebook, not to modify a single plot

* This practice helps maintain consistency throughout the notebook and ensures that all visualizations and outputs are standardizes across notebooks



In [None]:
example_df = pd.DataFrame({"A": np.arange(100), "B": np.linspace(0, 55, 100)})

example_df.head()


In [None]:
# Modify the column B so it only contains two decimal points
example_df = pd.DataFrame({"A": np.arange(100), "B": np.linspace(0, 55, 100)})
example_df['B'] = example_df['B'].round(2)
example_df.head()

In [None]:
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
example_df = pd.DataFrame({"A": np.arange(100), "B": np.linspace(0, 55, 100)})
example_df.head()

In [None]:
plt.plot(example_df['B'])

In [None]:
plt.figure(figsize=(14,4))
plt.plot(example_df['B'])

In [None]:
plt.rcParams

In [None]:
plt.rcParams["figure.figsize"] = (14,4)

In [None]:
plt.plot(example_df['B'])

In [None]:
pd.options.display.float_format = None

In [None]:
df = pd.DataFrame({
    'Large Numbers': [100000000000000000000000000.0]
})
df

In [None]:
pd.options.display.max_rows = 5

In [None]:
example_df

In [None]:
# You can also set the display format to suppress scientific notation for floating-point numbers
pd.options.display.float_format = '{:.2f}'.format

In [None]:
df


### 2. Avoiding Unintended References

* A view in pandas is a subset of the original object ( DataFrame or Series ) linked to the original source. 
 * When you create a view, you are not creating a new copy of the data, but rather a reference to the original data. 
 * Changes you make to the view will be reflected in the original data, and vice versa.

* When you work directly with views on the data, instead of simple column assignment, you may end up with a reference to the original data rather than a copy. 
* Also, many of you have experienced the "SettingWithCopyWarning." 
  *  triggered when you try to modify a DataFrame that might be a view on another DataFrame
   * Pandas cannot definitively tell which. 

* If you want to ensure that your modifications do not affect the original DataFrame when working with slices or subsets, you should explicitly create a copy using the .copy() method.

In [None]:
example_df['C'] = example_df['A'] * 2
example_df

In [None]:
example_df.loc[0, 'A'] =  10
example_df

In [None]:
example_df['D'] = example_df['A']
example_df

In [None]:
example_df.loc[0, 'A'] =  20
example_df

In [None]:

slice_df = example_df.loc[0:3]  
slice_df


In [None]:
slice_df.iloc[0] = (0, 0, 0, 0)
slice_df

In [None]:
example_df

In [None]:
slice_df = example_df.loc[0:3].copy()  # This creates a view on the original DataFrame
slice_df


In [None]:
slice_df.iloc[0] = (1, 1, 1, 1)
slice_df

In [None]:
example_df

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Chained indexing, potentially leading to SettingWithCopyWarning
example_2_df = df[df['A'] > 1]
example_2_df

In [None]:
example_2_df['B'] = [50, 60] 

In [None]:
example_2_df

In [None]:
df

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# Chained indexing, potentially leading to SettingWithCopyWarning
example_2_df = df[df['A'] > 1]
example_2_df

In [None]:
df.loc[df['A'] > 1, 'B'] = [50, 60]
df

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

subset_df = df.loc[df['A'] > 1]
subset_df['C'] = subset_df['B'] + 10  

In [None]:
subset_df

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

df.loc[df['A'] > 1, 'C'] = df.loc[df['A'] > 1, 'B'] + 10

In [None]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
df[df['A']> 1]['B']

In [None]:
x = df[df['A']> 1]['B']
x._is_view

In [None]:
x = example_df.loc[example_df['A']> 1, 'B']
x._is_view

##### example
```python
import pandas, numpy
num_states = 50
def PROCESSDATA(file_name, int):
    global df
    df = pd.read_csv(file_name)
    df = df.fillna(df.median())
    if int > 0: df = df.sample(int)
```


### Improving Readability and Maintainability by Adhering to PEP8
* The PEP8 is Python's official style guide,
  * Python Enhancement Proposal: a design document providing information to the Python community, or describing a new feature for Python or its processes or environment. 
  * PEPs are intended as the primary mechanisms for proposing major new features, collecting community input on an issue, and documenting the design decisions that have gone into Python. T
 * PEP8:
   * Naming vairables and importing packages
   * Avoiding compound statements
   * Avoiding builtin functions or keywords as variable names
   * Avoiding global variables

### example 
* Avoid compound aliases and non-standard aliases

```python
import padas as ada, numpy as ny, matplotlib.pyplot as mtb
```
* One import per line, use standard aliases

```python
import padas as pd, numpy as np, matplotlib.pyplot as plt
import padas as ada, numpy as ny, matplotlib.pyplot as mtb
```



### Important PEP8  Recommendations

* Avoid compound statement
```python
if x > 0: df1 = df.sample(10); df2 = df.sample(20); x +=1;
```    

```python
if income > 0:
    df = df.sample(nb)
```    
* Write variable names appropriately
  * Use snake_case for variables and functions, UPPER_CASE for constants, and lowercase for general variables.
* Functions should be in lowercase and use snake_case to improve readability.
* Classes should be CamelCase and start with an uppercase letter to distinguish them from functions and variables.



In [None]:
for i in range(10):
    sum += i


In [None]:
a = input()
print(a)

In [None]:
import pandas as pd

In [None]:
a = pd.DataFrame({'col_1': [1,2,3]})

def f(some_df):
    some_df['col_1'] = [4,5,6]
    return some_df
f(a)


In [None]:
print(a)

### Important PEP8  Recommendations - Cont'd

* Never use a builtin function or keyword as a variable name
  * I see it often with `str`, `list`, `id`, `sum`, `min`, `max`, `input` etc. 
  * Make sure you know what variables or function your package is importing so you don't overwrite them

* Avoid global variables
 * considered bad practice from a styling perspective 
   * Changes to global variables can be made from anywhere in the program, making it difficult to understand how and when their values are modified.
   *  Functions that rely on global variables are closely tied to those variables
     * Reducing modularity and making the code harder to maintain or reuse.
  * Testing functions that depend on global variables can be challenging 


In [None]:
a=1+2
a = 1 + 2

In [None]:
class X:
    pass


class Y:
    pass






puc

def z:
    pass



### Important PEP8  Recommendations - Cont'd

* Use Spaces Appropriately:
  * Around arithmetic operators (+, -, *, /)
  * Comparison operators (==, !=, <, >, <=, >=)
  * Assignments (=) for better readability. 
   ```x = 1 + 2``
   
* After commas
  `my_function(arg1, arg2, arg3)` not `my_function(arg1,arg2,arg3)`

* Separate functions and classes with two blank lines

* Single blank line to separate logical sections inside functions

* Avoid excessive blank lines

#### Use Function and Modularize Your Code
* Define functions to encapsulate reusable code.
  
* Functions in modules help break down complex code into smaller, manageable parts making the notebook more organized and easier to read.


* Functions can be individually tested to ensure they perform as expected. 
  * This is much harder with code that is not broken down into modules or functions.
  
* Use defensive programming when your code will be used by others 

In [None]:

* In a separate python file (e.g., `modulename.py`)
```python

import random
def magic_addition(a, b)
    retun a + b + random.randint(a, b)
```

* In your notebook 
```
import modulename as mn
mn.magic_addition(10, 20)
```


In [None]:
import random
ran

### Document and Test your Functions Appropriately

* Type hints can be simple classes like float or str or be more complex. 
  * The typing module provides a vocabulary of more advanced type hints.
  * Using pydantic to crete more advanced type hints and handle validation
* Increasingly critical to document complex functions
  * Makes it easy to know what the function expects as input, returns and what it does.
  * Makes it easier for generative code to know to interpret and match natural language to your code
  

In [None]:
### Specifying Parameter Data Types in a Function
import random
def magic_addition(a:int, b:int) -> int:
    return a + b + random.randint(a, b)

magic_addition("A", "B")

In [None]:
#magic_addition()

In [None]:
magic_addition(1, 10.0)

In [None]:
magic_addition("one", "test")

In [None]:
import os
os.path.isfile("/Users/mahdi/Documents/GitHub/phaage-g2p/data/ENA.40.clstr/")


In [None]:
import random
def magic_addition(a:int, b:int) -> int:
    if type(a) is not int or type(b) is not int:
        raise TypeError(f"the inputs a and b must be ints> you provided a: {a} and b: {b} ")
    return a + b + random.randint(a, b)


In [None]:
magic_addition(1, "Test")

* Using more complex type hints

```python
type Vector = list[float]

def scale(scalar: float, vector: Vector) -> Vector:
    return [scalar * num for num in vector]
```

```python
from collections.abc import Sequence

type ConnectionOptions = dict[str, str]
type Address = tuple[str, int]
type Server = tuple[Address, ConnectionOptions]

def broadcast_message(message: str, servers: Sequence[Server]) -> None:
    pass
```

In [None]:
from pydantic import BaseModel


class Foo(BaseModel):
    a: str
    b: int


In [None]:
f = Foo(a = "A", b=2)
f.model_dump_json()

In [None]:
from pydantic import BaseModel


class Foo(BaseModel):
    a: str
    b: int

class Bar(BaseModel):
    c: str
    d: int

def some_function(x: Foo) -> Bar:
    print(f"passed {x}")
    

f =Foo(a="A", b=1)
a = 1
some_function(a)


In [None]:
f = Foo(a=1, b=1)


In [None]:
f.model_dump()

In [None]:
f.model_dump_json()

In [None]:
import random
help(random.randint)

### Documenting your Modules and Functions

```python
"""
This module provides utility functions for mathematical operations.

The functions include simple arithmetic operations like addition and multiplication.

Dependencies:
math (standard library)

Usage:
from my_module import add
result = add(5, 3)
"""

def add(a, b):
    """
    Add two numbers and return the result.

    Parameters:
    a (int or float): The first number.
    b (int or float): The second number.

    Returns:
    int or float: The sum of the two numbers.
    """
    return a + b
```

### Working with Classes

* Benefits of object oriented programming are similar to those listed for functions and modules above.
  * create modular resuable code . 
  * Reduce redundancy and increases efficiency.
  * Improve maintainability


In [None]:
from typing import List
import numpy as np
class Data:    
    def __init__(self, data: List):
        if type(data) is not np.array:
            data = np.array(data)
        self.data = data    
        
    def start_at_zero(self):
        """doc goes here"""
        return self.data - self.data[0]
        
        

In [None]:
d = Data([3, 7, 2, 3])
d.data

In [None]:
d.start_at_zero()

In [None]:
class DataCleaner:
    def __init__(self, df):
        self.df = df

    def _remove_missing_values(self):
        """Remove rows from the DataFrame that contain missing values."""
        self.df.dropna(inplace=True)
        return self.df

    def _replace_values(self, column, old_value, new_value):
        """Replace specific values in a DataFrame column."""
        self.df[column] = self.df[column].replace(old_value, new_value)
        return self.df
    
    def to_custom_json(self):
        json_repr = {x: data[x].tolist() for x in data.columns}
        json_repr["index"]  = self.df.index.tolist()
        
        return json_repr


In [None]:
data = pd.DataFrame({
    'Age': [25, 30, 35, 40, None],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Male'],
    'Income': [50000, 60000, 75000, 65000, 62000]
})

#{x: data[x].tolist() for x in data.columns}

In [None]:

# Instantiate the DataCleaner with the DataFrame
df_cleaner = DataCleaner(data)


In [None]:
df_cleaner.to_custom_json()

In [None]:
data

In [None]:
df_cleaner._remove_missing_values()

In [None]:
df_cleaner._replace_values('Gender', 'Male', 'M')

In [None]:
class IncomeGenderDataCleaner(DataCleaner):

    
    
    def clean_data(self):
        self.df = self._replace_values('Gender', 'Male', 'M')
        self.df = self._replace_values('Gender', 'Female', 'F')
        self.df = self._remove_missing_values()
        
        
        


In [None]:
income_clearner = IncomeGenderDataCleaner(data)
income_clearner.df

In [None]:
json.dumps({1:[1,2,4]})

In [None]:
import json 

class IncomeGenderDataCleanerDefault(IncomeGenderDataCleaner):

    def __init__(self, df):
        super().__init__(df)
        self._clean_data()

        
    
    def _clean_data(self):
        self.df = self._replace_values('Gender', 'Male', 'M')
        self.df = self._replace_values('Gender', 'Female', 'F')
        self.df = self._remove_missing_values()
        
    def __repr__(self):
        return json.dumps(self.to_custom_json())
        
        


In [None]:
custom_income_clearner = IncomeGenderDataCleanerDefault(data)
custom_income_clearner 

# Memory Profiling and Managment

* Priofiling is importatn for identifying memory consumption by different parts of your code.
 * Memory profiling helps optimize resource usage and improve performance.
 
 * Various packages available for detailed memory usage analysis.
   * Perform tasks such as line-by-line profiling or aggregate data type memory usage.
   * `guppy` is a popular package for profiling

* Profile is useful for 
  * Quickly identification of potential memory overuse.
  * Helps in optimizing data handling and storage efficiency.


In [6]:
from guppy import hpy 
data_type_summary = hpy()
data_type_summary.heap()

Partition of a set of 363704 objects. Total size = 42837782 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0 105960  29 11183078  26  11183078  26 str
     1  74038  20  5324928  12  16508006  39 tuple
     2  21066   6  3747087   9  20255093  47 types.CodeType
     3  40824  11  3132524   7  23387617  55 bytes
     4   8594   2  2984616   7  26372233  62 dict (no owner)
     5   2795   1  2794200   7  29166433  68 type
     6  19048   5  2590528   6  31756961  74 function
     7   2795   1  1358392   3  33115353  77 dict of type
     8    979   0  1350984   3  34466337  80 dict of module
     9   9851   3   948144   2  35414481  83 list
<1074 more rows. Type e.g. '_.more' to view.>

In [12]:
import pandas as pd
data_type_summary.heap()

Partition of a set of 511284 objects. Total size = 64924173 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0 144931  28 19106687  29  19106687  29 str
     1 109374  21  7978944  12  27085631  42 tuple
     2  31373   6  5579572   9  32665203  50 types.CodeType
     3  15967   3  5048608   8  37713811  58 dict (no owner)
     4  59621  12  4695669   7  42409480  65 bytes
     5  28617   6  3891912   6  46301392  71 function
     6   3856   1  3627576   6  49928968  77 type
     7   1422   0  2203664   3  52132632  80 dict of module
     8   3411   1  1736040   3  53868672  83 dict of type
     9  10631   2  1063720   2  54932392  85 list
<1340 more rows. Type e.g. '_.more' to view.>

In [13]:
64924173/1204/1024

52.659894265209715

In [14]:
col_names = chars = [chr(i) for i in range(ord('A'), ord('M') + 1)]
col_names

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']

In [15]:
data = pd.DataFrame({x:range(1000000) for x in col_names})
data

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M
0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,2,2,2,2,2,2,2,2,2,2,2,2,2
3,3,3,3,3,3,3,3,3,3,3,3,3,3
4,4,4,4,4,4,4,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999995,999995,999995,999995,999995,999995,999995,999995,999995,999995,999995,999995,999995
999996,999996,999996,999996,999996,999996,999996,999996,999996,999996,999996,999996,999996,999996
999997,999997,999997,999997,999997,999997,999997,999997,999997,999997,999997,999997,999997,999997
999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998,999998


In [16]:
data_type_summary.heap()

Partition of a set of 511554 objects. Total size = 272976926 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0     80   0 104009772  38 104009772  38 numpy.ndarray
     1      1   0 104000144  38 208009916  76 pandas.core.frame.DataFrame
     2 145150  28 19126019   7 227135935  83 str
     3 109392  21  7981624   3 235117559  86 tuple
     4  31425   6  5589384   2 240706943  88 types.CodeType
     5  16016   3  5061128   2 245768071  90 dict (no owner)
     6  59729  12  4706940   2 250475011  92 bytes
     7  28515   6  3878040   1 254353051  93 function
     8   3859   1  3633192   1 257986243  95 type
     9   1424   0  2205480   1 260191723  95 dict of module
<1355 more rows. Type e.g. '_.more' to view.>

In [17]:
104000144/1024/1024

99.18226623535156

In [18]:
data.dtypes

A    int64
B    int64
C    int64
D    int64
E    int64
F    int64
G    int64
H    int64
I    int64
J    int64
K    int64
L    int64
M    int64
dtype: object

In [None]:
len(col_names)

In [19]:
data.shape

(1000000, 13)

In [20]:
data.shape[0] * data.shape[1] * 8 

104000000

In [21]:
104000000 / 1024 / 1024

99.18212890625

In [None]:
data.index.dtype

In [22]:
detailed_memory_usage = data.memory_usage(deep=True, )
detailed_memory_usage

Index        128
A        8000000
B        8000000
C        8000000
D        8000000
E        8000000
F        8000000
G        8000000
H        8000000
I        8000000
J        8000000
K        8000000
L        8000000
M        8000000
dtype: int64

In [None]:
8000000/1024/1024 * len(col_names)

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   A       1000000 non-null  int64
 1   B       1000000 non-null  int64
 2   C       1000000 non-null  int64
 3   D       1000000 non-null  int64
 4   E       1000000 non-null  int64
 5   F       1000000 non-null  int64
 6   G       1000000 non-null  int64
 7   H       1000000 non-null  int64
 8   I       1000000 non-null  int64
 9   J       1000000 non-null  int64
 10  K       1000000 non-null  int64
 11  L       1000000 non-null  int64
 12  M       1000000 non-null  int64
dtypes: int64(13)
memory usage: 99.2 MB


### Using  The Correct DataType
```
    int8: -128 to 127
    int16: -32,768 to 32,767
    int32: -2,147,483,648 to 2,147,483,647
    int64: -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807
```

In [24]:
data_int_32 = pd.DataFrame({x:range(1000000) for x in col_names}, dtype='int32')
data_int_32.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   A       1000000 non-null  int32
 1   B       1000000 non-null  int32
 2   C       1000000 non-null  int32
 3   D       1000000 non-null  int32
 4   E       1000000 non-null  int32
 5   F       1000000 non-null  int32
 6   G       1000000 non-null  int32
 7   H       1000000 non-null  int32
 8   I       1000000 non-null  int32
 9   J       1000000 non-null  int32
 10  K       1000000 non-null  int32
 11  L       1000000 non-null  int32
 12  M       1000000 non-null  int32
dtypes: int32(13)
memory usage: 49.6 MB


In [25]:
import random
data_int_8 = pd.DataFrame({x:[random.randint(0,10) for i in range(1000000)] for x in col_names}, dtype='int8')
data_int_8.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   A       1000000 non-null  int8 
 1   B       1000000 non-null  int8 
 2   C       1000000 non-null  int8 
 3   D       1000000 non-null  int8 
 4   E       1000000 non-null  int8 
 5   F       1000000 non-null  int8 
 6   G       1000000 non-null  int8 
 7   H       1000000 non-null  int8 
 8   I       1000000 non-null  int8 
 9   J       1000000 non-null  int8 
 10  K       1000000 non-null  int8 
 11  L       1000000 non-null  int8 
 12  M       1000000 non-null  int8 
dtypes: int8(13)
memory usage: 12.4 MB


### Using Progress Bars

* Feedback on long operations is considered a good practive.
  * When running scripts that take time, having a progress bar improves the user experience 
  * Reassurance that process is ongoing and providing a sense of how much longer it will take.
  
* Very easy to add progress bars across scenarios

In [None]:
from tqdm import 

import time
for i in tqdm(range(100), desc="Doing my magic"):
    time.sleep(1)


In [None]:
for i in tqdm(range(100), desc="Doing my magic"):
    time.sleep(0.1)

for i in tqdm(range(100), desc="Doing something else"):
    time.sleep(0.1)    

In [None]:

tqdm.pandas()

df['processed_column'] = df['raw_column'].progress_apply(some_processing_function)