In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, KMeansSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from Neural_Network_model.MLP_network import MLP_network
from Softmax_model.Softmax import SoftmaxClassifier
from sklearn.linear_model import LogisticRegression
from Decision_Trees_model.decision_trees import RandomForest_classification
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from tabulate import tabulate
from sdv.evaluation.single_table import evaluate_quality
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight

In [None]:
df = pd.read_csv("C:/Users/zheny/PycharmProjects/Machine_learning_human_activity/Datasets/pirvision_office_dataset1.csv")

label_map = {
    0: 'Not presence', 1: 'Stationary presence', 3: 'Moving presence'
}

y = df["Label"] = df["Label"].map(label_map)
print(np.unique(y, return_counts=True))

x = df.drop(["Label", "Date", "Time"], axis=1)
feature_cols = [c for c in df.columns if c not in ["Label", "Date", "Time"]]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

x_train_pir = x_train.reset_index(drop=True)
x_test_pir = x_test.reset_index(drop=True)
y_train_pir = y_train.reset_index(drop=True)
y_test_pir = y_test.reset_index(drop=True)

print(f" har_train = {np.unique(y_train_pir, return_counts=True)}")
print(f" test = {np.unique(y_test_pir, return_counts=True)}")

rus = RandomUnderSampler(random_state=42, sampling_strategy={'Not presence': 2300})
x_balanced, y_balanced = rus.fit_resample(x_train_pir, y_train_pir)
df_rus = pd.concat([x_balanced, y_balanced.rename("Label")], axis=1)
print(f" balanced = {np.unique(y_balanced, return_counts=True)}")

synthetic_parts = []
for cls, target_n in [('Stationary presence', 2300), ('Moving presence', 2300)]:
    df_cls = df_rus[df_rus["Label"] == cls].drop(columns="Label")
    n_to_gen = target_n - len(df_cls)
    if n_to_gen <= 0:
        continue

    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(df_cls)

    for col in feature_cols:
        metadata.update_column(column_name=col, sdtype="numerical")

    synth_model = CTGANSynthesizer(
        metadata=metadata,
        generator_dim=(512, 512),
        discriminator_dim=(512, 512),
        embedding_dim=128,
        batch_size=2000,
        epochs=1000,
        pac=5,
        cuda=True
    )

    synth_model.fit(df_cls)
    new_rows = synth_model.sample(num_rows=n_to_gen)

    new_rows["Label"] = cls
    synthetic_parts.append(new_rows)

df_balanced = pd.concat([df_rus] + synthetic_parts, ignore_index=True)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
print(f" final balanced = {np.unique(df_balanced['Label'], return_counts=True)}")

x_train_pir = df_balanced.drop(['Label'], axis=1)
y_train_pir = df_balanced['Label']

se = StandardScaler()
x_train_scaled = se.fit_transform(x_train_pir)
x_test_scaled = se.fit_transform(x_test)

model = MLP_network()
model.fit(x_train_scaled, y_train_pir)
y_pred_test = model.predict(x_test_scaled)
y_pred_train = model.predict(x_train_scaled)

"""model.fit(x_train, y_train)
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)
"""
print(f"Accuracy on training data: {accuracy_score(y_train_pir, y_pred_train)}")
print(f"Accuracy: {accuracy_score(y_test_pir, y_pred_test)}")
print(f"Confusion matrix:\n{confusion_matrix(y_test_pir, y_pred_test)}")
print(f"Classification report:\n{classification_report(y_test_pir, y_pred_test)}")

In [None]:
report = evaluate_quality(x_train, x_train_pir, metadata=metadata)
#print(report.get_details())
#print(report.get_score())

In [2]:
credit_train_df = pd.read_csv("Datasets/credit_cleaned_train.csv", low_memory=False)
credit_train_df = credit_train_df.drop(columns=["ID", 'Customer_ID', 'Month', 'Name'], axis=1)

print(credit_train_df.info())
print(credit_train_df.head())
print("Credit Score distribution:", np.unique(credit_train_df['Credit_Score'], return_counts=True))

type_of_loan_position = credit_train_df.columns.get_loc('Type_of_Loan')

loan_map = {'No Data': 'No Data loan', 'not specified': 'not specified loan'}
credit_train_df['Type_of_Loan'] = credit_train_df['Type_of_Loan'].map(loan_map)

main_loan_types = [
    'No Data loan',
    'not specified loan',
    'credit-builder loan',
    'personal loan',
    'debt consolidation loan',
    'student loan',
    'payday loan',
    'mortgage loan',
    'auto loan',
    'home equity loan'
]

def process_loans(loan_string):
    if pd.isna(loan_string):
        result = np.zeros(len(main_loan_types))
        result[main_loan_types.index('No Data loan')] = 1
        return result

    loan_str = str(loan_string)
    loan_lower = loan_str.lower()

    result = np.zeros(len(main_loan_types))

    if 'no data loan' in loan_lower:
        result[main_loan_types.index('No Data loan')] = 1
        return result

    loans = [loan.strip() for loan in loan_str.split(',')]

    for loan in loans:
        loan_lower = loan.lower()
        found = False

        for i, loan_type in enumerate(main_loan_types[2:]):
            if loan_type in loan_lower:
                result[i+2] += 1
                found = True
                break


    return result

loan_vectors = credit_train_df['Type_of_Loan'].apply(process_loans)

for i, loan_type in enumerate(main_loan_types):
    col_name = loan_type.replace(" ", "_").replace("-", "_")
    credit_train_df[f'has_{col_name}'] = loan_vectors.apply(lambda x: 1 if x[i] > 0 else 0)

credit_train_df.drop('Type_of_Loan', axis=1, inplace=True)

all_columns = credit_train_df.columns.tolist()

new_columns = [col for col in all_columns if col.startswith('has_')]

for col in new_columns:
    all_columns.remove(col)

all_columns = all_columns[:type_of_loan_position] + new_columns + all_columns[type_of_loan_position:]

credit_train_df = credit_train_df[all_columns]

print(credit_train_df.head())
print(credit_train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 24 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Age                       100000 non-null  float64
 1   SSN                       100000 non-null  float64
 2   Occupation                100000 non-null  object 
 3   Annual_Income             100000 non-null  float64
 4   Monthly_Inhand_Salary     100000 non-null  float64
 5   Num_Bank_Accounts         100000 non-null  float64
 6   Num_Credit_Card           100000 non-null  float64
 7   Interest_Rate             100000 non-null  float64
 8   Num_of_Loan               100000 non-null  float64
 9   Type_of_Loan              100000 non-null  object 
 10  Delay_from_due_date       100000 non-null  float64
 11  Num_of_Delayed_Payment    100000 non-null  float64
 12  Changed_Credit_Limit      100000 non-null  float64
 13  Num_Credit_Inquiries      100000 non-null  fl

In [3]:
categorical_columns = ['Occupation', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']

column_positions = {}
for col in categorical_columns:
    column_positions[col] = credit_train_df.columns.get_loc(col)

credit_train_df = pd.get_dummies(credit_train_df, columns=categorical_columns, prefix=categorical_columns)

bool_columns = credit_train_df.select_dtypes(include=['bool']).columns
credit_train_df[bool_columns] = credit_train_df[bool_columns].astype(int)

all_columns = credit_train_df.columns.tolist()

for col in categorical_columns:
    one_hot_cols = [c for c in all_columns if c.startswith(col + '_')]

    for one_hot_col in one_hot_cols:
        all_columns.remove(one_hot_col)

    position = column_positions[col]
    all_columns = all_columns[:position] + one_hot_cols + all_columns[position:]

credit_train_df = credit_train_df[all_columns]

print(credit_train_df.info())
print(credit_train_df.head())

x_credit = credit_train_df.drop('Credit_Score', axis=1).to_numpy()
y_credit = credit_train_df['Credit_Score'].to_numpy()

x_train_credit, x_test_credit, y_train_credit, y_test_credit = train_test_split(x_credit, y_credit, test_size=0.3, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 56 columns):
 #   Column                                              Non-Null Count   Dtype  
---  ------                                              --------------   -----  
 0   Age                                                 100000 non-null  float64
 1   SSN                                                 100000 non-null  float64
 2   Occupation_Accountant                               100000 non-null  int64  
 3   Occupation_Architect                                100000 non-null  int64  
 4   Occupation_Developer                                100000 non-null  int64  
 5   Occupation_Doctor                                   100000 non-null  int64  
 6   Occupation_Engineer                                 100000 non-null  int64  
 7   Occupation_Entrepreneur                             100000 non-null  int64  
 8   Occupation_Journalist                               100000 non-nu

In [4]:
model = SoftmaxClassifier(
        learning_rate=0.1,
        max_iter=100000,
        eps=1e-6,
        lambda_reg=1,
        use_pca=False
    )

model.fit(x_train_credit, y_train_credit)
y_pred = model.predict(x_test_credit)

le = LabelEncoder()
y_train_enc = le.fit(y_train_credit)
y_pred_enc = le.inverse_transform(y_pred)

print(f"Confusion matrix: \n{confusion_matrix(y_test_credit, y_pred_enc)}")
print(f"Classification report: \n{classification_report(y_test_credit, y_pred_enc)}")

Обучение модели:


  1%|          | 547/100000 [00:10<32:32, 50.93it/s]


Confusion matrix: 
[[   66    80  5176]
 [    5  3680  5120]
 [   26  2373 13474]]
Classification report: 
              precision    recall  f1-score   support

        Good       0.68      0.01      0.02      5322
        Poor       0.60      0.42      0.49      8805
    Standard       0.57      0.85      0.68     15873

    accuracy                           0.57     30000
   macro avg       0.62      0.43      0.40     30000
weighted avg       0.60      0.57      0.51     30000



In [5]:
le = LabelEncoder()
y_train_credit_enc = le.fit_transform(y_train_credit)
y_test_credit_enc = le.transform(y_test_credit)

sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=y_train_credit_enc
)

model = XGBClassifier(n_estimators=100, random_state=42)
model.fit(x_train_credit, y_train_credit_enc, sample_weight=sample_weights)
y_pred = model.predict(x_test_credit)
y_pred_labels = le.inverse_transform(y_pred)

print(f"Accuracy score: {accuracy_score(y_test_credit, y_pred_labels)}")
print(f"Confusion matrix:\n{confusion_matrix(y_test_credit, y_pred_labels)}")
print(f"Classification report:\n{classification_report(y_test_credit, y_pred_labels)}")

Accuracy score: 0.7603666666666666
Confusion matrix:
[[ 4610    75   637]
 [  489  7331   985]
 [ 2466  2537 10870]]
Classification report:
              precision    recall  f1-score   support

        Good       0.61      0.87      0.72      5322
        Poor       0.74      0.83      0.78      8805
    Standard       0.87      0.68      0.77     15873

    accuracy                           0.76     30000
   macro avg       0.74      0.79      0.75     30000
weighted avg       0.78      0.76      0.76     30000

