In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import os
import sys
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
# Make the sibling ``scripts`` directory (one level above the working directory) importable.
scripts_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'scripts'))
# Guard the append so that re-running this cell does not stack duplicate entries on sys.path.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

In [6]:
from feature_enginering import create_aggregate_features,extract_datetime_features,encode_categorical,handle_missing_values,scale_features

In [7]:
# Load data
# The dataset location can be overridden through the CREDIT_DATA_PATH environment
# variable, so the notebook can run on another machine without editing this cell;
# when the variable is unset the original local path is used unchanged.
filepath = os.environ.get(
    "CREDIT_DATA_PATH",
    r"C:\Users\HP\week 6\Credit-Scoring-Model\data\data.csv",
)
df = pd.read_csv(filepath)


In [5]:
# Display the column labels of df (left as the last expression so the cell renders it)
df.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value',
       'TransactionStartTime', 'PricingStrategy', 'FraudResult'],
      dtype='object')

In [6]:
# Convert 'TransactionStartTime' to datetime (pd.to_datetime called with its default settings)
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

In [7]:
# Apply aggregate features
# create_aggregate_features is imported from the project's feature_enginering module;
# the columns it produces are not visible in this notebook.
# NOTE(review): presumably this adds the 'Total_Transaction_Amount' and
# 'Transaction_Count' columns that calculate_rfsm_score reads — confirm against the helper.
# df is rebound to the helper's return value, so re-running this cell feeds the
# already-processed frame back into the helper.
df = create_aggregate_features(df)

In [8]:
# Apply datetime feature extraction
# extract_datetime_features is a project helper (feature_enginering module); df is
# rebound to whatever it returns.
# NOTE(review): presumably relies on 'TransactionStartTime' already being a datetime
# column (converted in an earlier cell) — confirm against the helper.
df = extract_datetime_features(df)

In [None]:
# Apply One-Hot Encoding on categorical features
# NOTE(review): the result is bound to df_encoded, but the later cells shown in this
# notebook keep working on df and never reference df_encoded. calculate_rfsm_score
# reads 'ProductCategory_financial_services' and 'ProductCategory_transport' from df;
# unless encode_categorical also modifies df in place, those columns would be missing
# there — confirm which frame is intended downstream.
df_encoded = encode_categorical(df)

In [9]:
# Handle missing values in the dataset
# handle_missing_values is a project helper; the strategy it applies is not visible here.
# NOTE(review): this runs on df, not on df_encoded from the encoding cell — confirm
# that is intended.
df = handle_missing_values(df)

In [None]:
# Scale features with the project's scale_features helper; df is rebound to its return value.
# NOTE(review): calculate_rfsm_score later compares 'Total_Transaction_Amount' against
# 1000/5000 and 'Transaction_Count' against 20/50 on this frame; if scale_features
# rescales those columns the thresholds would no longer be in the original units — confirm.
df=scale_features(df)

In [20]:
# Rule-based RFSM risk scoring for a single record (e.g. one row of df)
def calculate_rfsm_score(row):
    """Return the additive RFSM risk score for one record.

    ``row`` is indexed by column name and must provide
    'Total_Transaction_Amount', 'Transaction_Count',
    'ProductCategory_financial_services' and 'ProductCategory_transport'.

    Points awarded:
      * Total_Transaction_Amount: 3 above 5000, 2 above 1000, otherwise 1.
      * Transaction_Count: 3 above 50, 2 above 20, otherwise 1.
      * +3 when ProductCategory_financial_services equals 1.
      * +2 when ProductCategory_transport equals 1.
    """

    def tier_points(value, high_cutoff, medium_cutoff):
        # Three-level banding shared by the amount rule and the count rule.
        if value > high_cutoff:
            return 3
        if value > medium_cutoff:
            return 2
        return 1

    # Volume-based components (amount first, then count)
    total = tier_points(row['Total_Transaction_Amount'], 5000, 1000)
    total += tier_points(row['Transaction_Count'], 50, 20)

    # Category-based components: financial services weighs more than transport
    total += 3 if row['ProductCategory_financial_services'] == 1 else 0
    total += 2 if row['ProductCategory_transport'] == 1 else 0

    return total

In [21]:
# Score each row with calculate_rfsm_score and store the result in a new 'RFSM_Score' column
df['RFSM_Score'] = df.apply(calculate_rfsm_score, axis='columns')

In [22]:
# Global cut-offs for the RFSM score: its 33rd and 66th percentiles
low_risk_threshold, medium_risk_threshold = (
    df['RFSM_Score'].quantile(q) for q in (0.33, 0.66)
)

In [23]:
# Map a score onto a risk label using two global cut-offs
def classify_credit_score_global(score, low_thresh, medium_thresh):
    """Return the risk label for ``score``.

    'High Risk' when the score is strictly above ``medium_thresh``; otherwise
    'Medium Risk' when it is strictly above ``low_thresh``; otherwise 'Low Risk'.
    """
    # Cut-offs are checked from the most severe label downwards; first match wins.
    for cutoff, label in ((medium_thresh, 'High Risk'), (low_thresh, 'Medium Risk')):
        if score > cutoff:
            return label
    return 'Low Risk'

In [24]:
# Label every row by comparing its RFSM score with the two global quantile cut-offs
df['Credit_Score_Category'] = df['RFSM_Score'].apply(
    classify_credit_score_global,
    args=(low_risk_threshold, medium_risk_threshold),
)

In [None]:
# Show a few rows to verify
# Left as the cell's last expression (instead of print) so the notebook renders the
# frame with its rich table display.
df[['RFSM_Score', 'Credit_Score_Category']].head()

In [None]:
# Visualize the distribution of risk categories using a count plot
# Drawn on an explicit Axes, with the bars in a fixed Low -> Medium -> High order
# rather than the order in which the labels happen to appear in the data.
fig, ax = plt.subplots()
sns.countplot(
    data=df,
    x='Credit_Score_Category',
    order=['Low Risk', 'Medium Risk', 'High Risk'],
    ax=ax,
)
ax.set_title('Distribution of Credit Score Categories (Global Criterion)')
ax.set_xlabel('Credit Score Category')
ax.set_ylabel('Count')
plt.show()

In [41]:
# Save processed data
#df.to_csv("C:\\Users\\HP\\week 6\\Credit-Scoring-Model\\data\\processed_data1.csv", index=False)

In [None]:
# Display the first few rows of the processed dataset
#df.head()