# Welcome to the tutorial about mypackage!

In [1]:
pip list

Package                   Version      Editable project location
------------------------- ------------ ----------------------------------------------
anyio                     4.8.0
appnope                   0.1.3
argon2-cffi               21.3.0
argon2-cffi-bindings      21.2.0
asttokens                 2.0.5
async-lru                 2.0.4
attrs                     24.3.0
babel                     2.16.0
beautifulsoup4            4.12.3
bleach                    6.2.0
Bottleneck                1.4.2
Brotli                    1.0.9
certifi                   2025.1.31
cffi                      1.17.1
charset-normalizer        3.3.2
click                     8.1.8
comm                      0.2.1
contourpy                 1.3.1
cycler                    0.11.0
debugpy                   1.8.11
decorator                 5.1.1
defusedxml                0.7.1
distlib                   0.3.9
executing                 0.8.3
fastjsonschema            2.20.0
filelock                  3.17.0
fon

In [2]:
from pygroupf.data_processing import DataProcessor

In [3]:
# Define column types
categorical_cols = [
    "Sex",
    "Housing",
    "Saving accounts",
    "Checking account",
    "Purpose",
]
numerical_cols = ["Age", "Job", "Credit amount", "Duration"]

"""
Encode specific categorical columns with numerical values as per requirements:
- sex: male=1, female=0
- housing: own=2, free=1, rent=0
- saving_accounts: unknown=0, little=1, moderate=2, quite rich=3, rich=4
- checking_account: unknown=0, little=1, moderate=2, rich=3
"""

# Define mapping of categorical values to numerical values
mapping = {
    "Sex": {"male": 1, "female": 0, "unknown": -1},
    "Housing": {"own": 2, "free": 1, "rent": 0, "unknown": -1},
    "Saving accounts": {
        "unknown": 0,
        "little": 1,
        "moderate": 2,
        "quite rich": 3,
        "rich": 4,
    },
    "Checking account": {"unknown": 0, "little": 1, "moderate": 2, "rich": 3},
}

In [4]:
# data processing
processor = DataProcessor("data/german_credit_data.csv")
processor.load_data()
processor.clean_data(categorical_cols, numerical_cols)
processor.encode_categorical_values(mapping)
processed_data = processor.get_processed_data()

Data processing completed successfully!


In [5]:
# Display the processed data
processed_data.head()

Unnamed: 0,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose
0,67,1,2,2,0,1,1169,6,radio/TV
1,22,0,2,2,1,2,5951,48,radio/TV
2,49,1,1,2,1,0,2096,12,education
3,45,1,2,1,1,1,7882,42,furniture/equipment
4,53,1,2,1,1,1,4870,24,car


In [6]:
from pygroupf.analysis import DataAnalyzer

In [7]:
# Define scoring rules for different fields
scoring_rules = {
    'Age': [
        {'condition': lambda x: x < 20 or x > 70, 'score': 15},
        {'condition': lambda x: 20 <= x < 25 or 60 < x <= 70, 'score': 10},
        {'condition': lambda x: 25 <= x < 30 or 50 < x <= 60, 'score': 5}
    ],
    'Sex': {'male': 2, 'female': 0, 'unknown': 1},
    'Job': {0: 15, 1: 10, 2: 5, 3: 1},
    'Housing': {0: 15, 1: 10, 2: 5},
    'Saving accounts': {
        'default': lambda x: (4 - x) * 3 if x > 0 else 10,
        'specific': {0: 10}
    },
    'Checking account': {
        'default': lambda x: (3 - x) * 4 if x > 0 else 10,
        'specific': {0: 10}
    },
    'Credit amount': [
        {'threshold': 8000, 'score': 15},
        {'threshold': 5000, 'score': 10},
        {'threshold': 2000, 'score': 5}
    ],
    'Duration': [
        {'threshold': 36, 'score': 15},
        {'threshold': 24, 'score': 10},
        {'threshold': 12, 'score': 5}
    ],
    'Purpose': {
        'business': 10,
        'education': 10,
        'unknown': 8,
        'car': 5,
        'furniture/equipment': 5,
        'radio/TV': 3,
        'domestic appliances': 3,
        'repairs': 3,
        'vacation/others': 3
    }
}

# Define risk levels based on total score
risk_levels = [
    (70, "High risk"),
    (50, "Medium-high risk"),
    (30, "Medium-low risk"),
    (0, "Low risk")
]

In [8]:
# data analysis and save risk report
# processed_data = pd.read_csv("data/processed_credit_data.csv")
analyzer = DataAnalyzer(processed_data, scoring_rules, risk_levels)
risk_report = analyzer.generate_risk_report() 
analyzer.save_risk_report("data/risk_report.csv")

Risk report saved to data/risk_report.csv


In [9]:
# Display the risk report
risk_report.head()

Unnamed: 0,customer_id,age,sex,job,housing,saving_accounts,checking_account,credit_amount,duration,purpose,risk_score,risk_level
0,1,67,1,2,2,0,1,1169,6,radio/TV,41,Medium-low risk
1,2,22,0,2,2,1,2,5951,48,radio/TV,61,Medium-high risk
2,3,49,1,1,2,1,0,2096,12,education,49,Medium-low risk
3,4,45,1,2,1,1,1,7882,42,furniture/equipment,62,Medium-high risk
4,5,53,1,2,1,1,1,4870,24,car,52,Medium-high risk


In [10]:
from pygroupf.visualization import DataVisualizer

In [11]:
visualizer = DataVisualizer('data/risk_report.csv')
visualizer.visualize_all()

Heatmap saved to image/heatmap.png
Risk level distribution plot saved to image/risk_distribution.png
All visualizations have been generated and saved to the 'image' folder.
