In [22]:
import math
import pickle

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [23]:
# --- Parameters of the investment-risk score ---
# Principal amount handed to calculate_investment_risk.
PRINCIPAL: int = 1_000_000
# Multiplies the principal in the numerator of the score.
Z_FACTOR: int = 5
# Slope of the emission factor (k * emissions + 1).
K_VALUE: float = 0.8
# Fraction of the principal used as the risk investment.
EMISSION_RISK_INDEX: float = 0.05
# Flat fee subtracted from every score.
ENVIRONMENTAL_FEE_INDEX: int = 0
# Cut-off on the min-max-scaled score above which a row is labelled Invest = 1.
THRESHOLD: float = 0.75
# Location of the input CSV, relative to the notebook.
PATH = "../datasets/companies_final.csv"

# Integer code of each industry; a name's position in the tuple is its code.
INDUSTRY_CODES = {
    industry_name: industry_code
    for industry_code, industry_name in enumerate(
        (
            "information technology and services",
            "military",
            "accounting",
            "retail",
            "computer software",
            "telecommunications",
            "defense & space",
            "financial services",
            "management consulting",
            "banking",
        )
    )
}

In [24]:
def preprocess_data(file_path, max_rows=2000) -> pd.DataFrame:
    """Load the companies CSV and keep only its leading rows.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.
        max_rows: Number of rows to keep from the top of the file. Defaults to
            2000, the limit that used to be hard-coded; pass ``None`` to keep
            every row.

    Returns:
        A DataFrame with the first ``max_rows`` rows of the file (fewer if the
        file is shorter).
    """
    return pd.read_csv(file_path).iloc[:max_rows]

In [25]:
def calculate_investment_risk(row: pd.Series, principal: int, k_value: float, data: pd.DataFrame) -> float:
    """Compute the raw (unscaled) investment-risk score for a single company row.

    Args:
        row: One row of ``data``; its "emissions", "importance", "industry" and
            "disaster_risk" entries are read.
        principal: Principal amount of the investment.
        k_value: Multiplier applied to the row's emissions when building the
            emission factor ``k_value * emissions + 1``.
        data: Full data set, used to count the rows that share ``row``'s industry.

    Returns:
        The score as a float. Each disaster-risk level above "Low" lowers it
        by 1000.

    Notes:
        Also reads the module-level constants ``Z_FACTOR``,
        ``EMISSION_RISK_INDEX`` and ``ENVIRONMENTAL_FEE_INDEX``.
    """
    beta: float = 1.0
    emission_factor: float = k_value * row["emissions"] + 1
    risk_investment: float = EMISSION_RISK_INDEX * principal
    industry_count: int = len(data[data["industry"] == row["industry"]])
    # Labels other than "Low"/"Medium"/"High" (including missing ones) score like "Low".
    disaster_risk_score: int = {"Low": 0, "Medium": 1, "High": 2}.get(row["disaster_risk"], 0)

    # A zero importance would make the product below zero, where the logarithm is undefined.
    importance_factor: float = row["importance"] if row["importance"] != 0 else 0.001
    weighted_emission: float = emission_factor * importance_factor
    if weighted_emission == 0:
        # emission_factor can still be 0 (emissions == -1 / k_value) and
        # math.log(0) raises ValueError. x * ln|x| tends to 0 as x -> 0, so the
        # floor of 1 used below is the value to fall back on.
        denominator = 1
    else:
        denominator = max(weighted_emission * math.log(abs(weighted_emission)), 1)

    numerator: float = principal * Z_FACTOR
    # Only non-zero when the industry has more rows than the principal amount.
    investment_vs_capital: float = math.log(industry_count / principal) if industry_count > principal else 0

    investment_risk: float = (
        beta / principal * ((numerator / denominator) - risk_investment * emission_factor)
        - ENVIRONMENTAL_FEE_INDEX
        + investment_vs_capital
        - (disaster_risk_score * 1000)
    )
    return investment_risk

In [26]:
# Load the (truncated) company data set
data = preprocess_data(PATH)

# Compute the raw investment-risk score of every row, then min-max scale it into [0, 1]
data["X"] = data.apply(calculate_investment_risk, axis=1, args=(PRINCIPAL, K_VALUE, data))
scaler = MinMaxScaler(feature_range=(0, 1))
data["X"] = scaler.fit_transform(data[["X"]])

# Remove outliers with the 1.5 * IQR rule. The quartiles are fixed at 25 % / 75 %
# instead of being derived from THRESHOLD (the label cut-off), so tuning the
# cut-off cannot silently change which rows count as outliers.
q1 = data["X"].quantile(0.25)
q3 = data["X"].quantile(0.75)
iqr = q3 - q1
# .copy() makes the filtered frame independent of the unfiltered one, so later
# column assignments on it do not raise SettingWithCopyWarning.
data = data[(data["X"] >= q1 - 1.5 * iqr) & (data["X"] <= q3 + 1.5 * iqr)].copy()

In [27]:
# Work on an explicit copy: `data` comes from a boolean filter, and assigning
# columns on such a slice raises SettingWithCopyWarning.
data = data.copy()

# Encode the categorical columns as integers. Industries missing from
# INDUSTRY_CODES become NaN. Re-running this cell without re-running the cell
# that builds `data` maps the already-encoded values again and yields NaN.
data["industry"] = data["industry"].map(INDUSTRY_CODES)
data["disaster_risk"] = data["disaster_risk"].map({"Low": 1, "Medium": 2, "High": 3})

# Binary target: 1 when the scaled score exceeds THRESHOLD.
# NOTE(review): the target is a deterministic function of X, which is computed
# from emissions, importance, industry and disaster_risk -- columns that stay
# in the feature matrix below. The recorded run also reports a single class (1)
# in the test split, so check data["Invest"].value_counts() before trusting
# the reported accuracy.
data["Invest"] = (data["X"] > THRESHOLD).astype(int)
X = data.drop(columns=["Invest", "name", "Unnamed: 0", "locality", "country", "X"])
Y = data["Invest"]


In [28]:
# Hold out 30 % of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Seed the tree as well: scikit-learn permutes the features at every split, so
# an unseeded tree can differ between runs even on identical training data.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

In [29]:
# Score the fitted tree on the held-out split and print accuracy plus per-class metrics
y_pred = dt_classifier.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", test_accuracy)
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

Accuracy:  1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       521

    accuracy                           1.00       521
   macro avg       1.00      1.00      1.00       521
weighted avg       1.00      1.00      1.00       521



In [30]:
# Persist the fitted classifier so it can be reused without retraining.
# `pickle` is imported with the other imports at the top of the notebook.
# NOTE: only unpickle this file from a trusted location -- loading a pickle
# can execute arbitrary code.
with open("model.pkl", "wb") as model_file:
    pickle.dump(dt_classifier, model_file)