# Classification

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.feature_selection import *


# Load the training table for the classification section. "ID" and
# "retained_earnings" (the target of the Regression section) are not used here.
df = pd.read_csv('company_train.csv')
df = df.drop(columns=["ID", "retained_earnings"])

df.info()

In [2]:
# Replace the two string-valued columns with integer codes. Values that are
# not keys of a mapping become NaN.
df = df.assign(
    net_profit_margin_category=df['net_profit_margin_category'].map({
        'Low Profit Margin': 0,
        'Healthy Profit Margin': 1,
        'High Profit Margin': 2,
    }),
    status_label=df['status_label'].map({'alive': 1, 'failed': 0}),
)

In [3]:
# Source columns for the per-company moving averages. anchor3 names the same
# column as anchor1, so it produces the same output column a second time.
anchor1 = "gross_profit"
anchor2 = "market_value"
anchor3 = "gross_profit"
anchor4 = "total_long_term_debt"

# Number of rows in each rolling window.
rolling_n = 3

# Add one '<column>_moving_avg' feature per anchor: the mean over the last
# `rolling_n` rows of the same company (min_periods=1, so shorter windows at
# the start of a company still yield a value), then discard company_name.
# NOTE(review): windows follow the current row order - presumably rows are
# sorted by year within each company; confirm.
df = df.assign(**{
    f'{anchor}_moving_avg': df.groupby('company_name')[anchor].transform(
        lambda x: x.rolling(window=rolling_n, min_periods=1).mean()
    )
    for anchor in (anchor1, anchor2, anchor3, anchor4)
}).drop(columns='company_name')

In [None]:
# Number of rows in each encoded profit-margin class (0, 1, 2), one per line.
print(
    *((df["net_profit_margin_category"] == code).sum() for code in (0, 1, 2)),
    sep='\n',
)


In [None]:
# Scatter every remaining feature against `anchor1`, one figure per feature,
# with points coloured by profit-margin class.
columns_checked = df.columns.difference(["net_profit_margin_category", anchor1, "status_label", "current_ratio"])

# Encoded class -> legend label. The codes must match the integer encoding of
# net_profit_margin_category (0 = Low, 1 = Healthy, 2 = High); filtering on
# 1/2/3 would leave the "High" series empty and mislabel the other two.
# Listed in drawing order (High first, Low last, so Low is drawn on top).
class_labels = {
    2: 'High Profit Margin',
    1: 'Healthy Profit Margin',
    0: 'Low Profit Margin',
}

for c in columns_checked:
    a = c
    b = anchor1

    data = df[[a, b, "net_profit_margin_category"]]
    display(data)

    plt.figure(figsize=(8, 6))
    for code, label in class_labels.items():
        subset = data[data["net_profit_margin_category"] == code]
        plt.scatter(subset[a], subset[b], label=label, alpha=0.5)
    plt.xlabel(a)
    plt.ylabel(b)
    plt.legend()
    plt.show()

In [None]:
# Pairwise correlation heatmap over every column except "year" and the class
# label; both axes are labelled with the column names.
no_year_all_numerics = df.drop(columns=["year", "net_profit_margin_category"])

corr = no_year_all_numerics.corr()
sns.heatmap(data=corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)

In [8]:
# Target: the encoded profit-margin class. Features: every other column.
y = df["net_profit_margin_category"]
X = df.drop(columns=["net_profit_margin_category"])

In [9]:
# Keep the 15 features with the highest ANOVA F-score against the class label.
# NOTE(review): the selector is fitted on all training rows before the
# cross-validation further down, so the CV folds are not independent of the
# feature selection - confirm this is acceptable.
kbest = SelectKBest(score_func=f_classif, k=15)
kbest.fit(X, y)

# Restrict X to the selected columns, addressed by name.
X = X.loc[:, kbest.get_feature_names_out()]

In [10]:
# `lgb` is used below but is not imported anywhere earlier in the notebook, so
# on a fresh kernel this cell failed with NameError. Import it here.
import lightgbm as lgb

# Settings for the 3-class LightGBM classifier.
paramslg = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    'learning_rate': 0.1,
    'num_leaves': 40,
    'max_depth': -1,  # LightGBM: values <= 0 mean no depth limit
    # Classes 1 and 2 are weighted 1.25x relative to class 0.
    'class_weight': {0: 1, 1: 1.25, 2: 1.25},
}

lgbm = lgb.LGBMClassifier(**paramslg, verbose=0)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# 5-fold stratified cross-validation (folds are not shuffled).
skf = StratifiedKFold(n_splits=5)

# Show the mean fold score for accuracy first, then for macro-averaged F1.
for scoring_name in ('accuracy', 'f1_macro'):
    fold_scores = cross_val_score(lgbm, X, y, cv=skf, scoring=scoring_name, verbose=3)
    display(fold_scores.mean())

In [None]:
# Build the classification submission: engineer the same features on the test
# set, fit on all training rows, predict, and write submit.csv.
test = pd.read_csv('company_test_classif.csv')

# Keep the IDs for the submission file. Named `test_ids` rather than `id` so
# the builtin is not shadowed.
test_ids = test["ID"]
test_x = test.drop(columns=["ID"])

# Same per-company moving averages as for the training frame, then drop the
# company identifier.
test_x = test_x.assign(**{
    f'{anchor}_moving_avg': test_x.groupby('company_name')[anchor].transform(
        lambda x: x.rolling(window=rolling_n, min_periods=1).mean()
    )
    for anchor in (anchor1, anchor2, anchor3, anchor4)
}).drop(columns='company_name')

test_x['status_label'] = test_x['status_label'].map({'alive': 1, 'failed': 0})

lgbm.fit(X, y)

# Predictions get their own name: assigning them to `y` would replace the
# training target, so re-running this cell would fit on the wrong labels.
test_pred = pd.Series(
    lgbm.predict(test_x[X.columns]),
    index=test_x.index,  # align row-for-row with test_ids
    name='net_profit_margin_category',
).map({0: 'Low Profit Margin', 1: 'Healthy Profit Margin', 2: 'High Profit Margin'})

prediction = pd.concat([test_ids, test_pred], axis=1)

display(prediction)

prediction.to_csv('submit.csv', index=False)

In [None]:
# Show the cells where the freshly written submission differs from the saved
# file '85.csv'.
s1, s2 = (pd.read_csv(path) for path in ('submit.csv', '85.csv'))

s1.compare(s2)

# Regression

In [None]:
import pandas as pd
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.feature_selection import *


# Reload the training table for the regression section. The class label
# "net_profit_margin_category" and "ID" are dropped.
df = pd.read_csv('company_train.csv')
df = df.drop(columns=["ID", "net_profit_margin_category"])

df.info()

In [None]:
# Encode status_label as 1 (alive) / 0 (failed); any other value becomes NaN.
df = df.assign(status_label=df['status_label'].map({'alive': 1, 'failed': 0}))

In [None]:
# Source columns for the per-company moving averages. anchor3 names the same
# column as anchor1, so it produces the same output column a second time.
anchor1 = "gross_profit"
anchor2 = "market_value"
anchor3 = "gross_profit"
anchor4 = "total_long_term_debt"

# Number of rows in each rolling window.
rolling_n = 3

# One '<column>_moving_avg' feature per anchor: mean over the last `rolling_n`
# rows of the same company (min_periods=1).
# NOTE(review): windows follow the current row order - presumably rows are
# sorted by year within each company; confirm.
df = df.assign(**{
    f'{anchor}_moving_avg': df.groupby('company_name')[anchor].transform(
        lambda x: x.rolling(window=rolling_n, min_periods=1).mean()
    )
    for anchor in (anchor1, anchor2, anchor3, anchor4)
})

# Replace exact zeros in the columns used as denominators below, so the
# ratios do not divide by zero. total_assets is patched before the
# assets-minus-liabilities helper is derived from it.
df['net_sales'] = df['net_sales'].replace(0, 1e-6)
df['total_assets'] = df['total_assets'].replace(0, 1e-6)
df['total_assets_minus_liabilities'] = (df['total_assets'] - df['total_liabilities']).replace(0, 1e-6)
df['total_receivables'] = df['total_receivables'].replace(0, 1e-6)

# Ratio features.
df = df.assign(
    gross_profit_margin=df['gross_profit'] / df['net_sales'],
    ebitda_margin=df['ebitda'] / df['net_sales'],
    ebit_margin=df['ebit'] / df['net_sales'],
    asset_turnover=df['net_sales'] / df['total_assets'],
    debt_to_equity=df['total_liabilities'] / df['total_assets_minus_liabilities'],
    receivables_turnover=df['net_sales'] / df['total_receivables'],
    operating_expense_ratio=df['total_operating_expenses'] / df['net_sales'],
    depreciation_amortization_ratio=df['depreciation_and_amortization'] / df['total_assets'],
)

# The helper column was only needed for debt_to_equity. DataFrame.drop returns
# a new frame, so the result has to be assigned for the column to go away.
df = df.drop(columns=['total_assets_minus_liabilities'])

# One indicator column per company.
df = pd.get_dummies(df, columns=['company_name'])

In [None]:
# Display the training frame after feature engineering and one-hot encoding.
df

In [None]:
# Engineer the regression test features with the same steps used for the
# training frame.
test = pd.read_csv('company_test_regress.csv')

id = test["ID"]
test_x = test.drop(columns=["ID"])

# Per-company moving averages over the last `rolling_n` rows (min_periods=1).
test_x = test_x.assign(**{
    f'{anchor}_moving_avg': test_x.groupby('company_name')[anchor].transform(
        lambda x: x.rolling(window=rolling_n, min_periods=1).mean()
    )
    for anchor in (anchor1, anchor2, anchor3, anchor4)
})
test_x['status_label'] = test_x['status_label'].map({'alive': 1, 'failed': 0})

# Replace exact zeros in the denominators used below. total_assets is patched
# before the assets-minus-liabilities helper is derived from it.
test_x['net_sales'] = test_x['net_sales'].replace(0, 1e-6)
test_x['total_assets'] = test_x['total_assets'].replace(0, 1e-6)
test_x['total_assets_minus_liabilities'] = (
    test_x['total_assets'] - test_x['total_liabilities']
).replace(0, 1e-6)
test_x['total_receivables'] = test_x['total_receivables'].replace(0, 1e-6)

# Ratio features.
test_x = test_x.assign(
    gross_profit_margin=test_x['gross_profit'] / test_x['net_sales'],
    ebitda_margin=test_x['ebitda'] / test_x['net_sales'],
    ebit_margin=test_x['ebit'] / test_x['net_sales'],
    asset_turnover=test_x['net_sales'] / test_x['total_assets'],
    debt_to_equity=test_x['total_liabilities'] / test_x['total_assets_minus_liabilities'],
    receivables_turnover=test_x['net_sales'] / test_x['total_receivables'],
    operating_expense_ratio=test_x['total_operating_expenses'] / test_x['net_sales'],
    depreciation_amortization_ratio=test_x['depreciation_and_amortization'] / test_x['total_assets'],
)
# The helper column total_assets_minus_liabilities stays in test_x; the later
# `X.align(test_x, join='left', axis=1, ...)` keeps only the columns that are
# present in X.

# One indicator column per company.
test_x = pd.get_dummies(test_x, columns=['company_name'])

In [None]:
# Target: retained_earnings. Features: every other column of the frame.
y = df["retained_earnings"]
X = df.drop(columns=["retained_earnings"])

In [None]:
# Reindex both frames to X's columns (left join on axis=1). Columns that exist
# only in test_x are dropped; columns missing from test_x (e.g. indicator
# columns of companies absent from the test set) are created and filled with 0.
X, test_x = X.align(test_x, join='left', axis=1, fill_value=0)

# After the left join both frames already carry X's columns in X's order, so
# no further back-filling or reordering is needed. Check it instead of
# repeating the work.
assert list(test_x.columns) == list(X.columns), "train/test columns differ after align"

display(X)
display(test_x)

In [None]:
# Extra-trees regressor on all cores. random_state is fixed so that repeated
# runs build the same forest and write the same submission; without it the
# comparison against a saved submission changes from run to run.
rfr = ExtraTreesRegressor(n_jobs=-1, verbose=3, random_state=42)
rfr.fit(X, y)

In [None]:
# Predict retained_earnings for the test rows and write the submission file.
# The predictions get their own name: assigning them to `y` would replace the
# training target, so re-running the fit cell afterwards would train on
# predictions.
y_pred = pd.Series(
    rfr.predict(test_x[X.columns]),
    index=test_x.index,  # align row-for-row with the saved IDs
    name='retained_earnings',
)

prediction = pd.concat([id, y_pred], axis=1)

display(prediction)

prediction.to_csv('submit.csv', index=False)

In [None]:
# Print, untruncated, the cells where the freshly written submission differs
# from the saved file '82.csv'.
s1, s2 = (pd.read_csv(path) for path in ('submit.csv', '82.csv'))

print(s1.compare(s2).to_string())

# Clustering

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from imblearn.under_sampling import *
from imblearn.over_sampling import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.feature_selection import *
import scipy.cluster.hierarchy as shc
import numpy as np


# Reload the training table for the clustering section, without "ID" and
# "retained_earnings".
df = pd.read_csv('company_train.csv')
df = df.drop(columns=["ID", "retained_earnings"])

df.info()

In [50]:
# Turn the three string columns into integers so the whole frame is numeric:
# fixed codes for status and profit-margin class, and for company_name the
# position of the name in order of first appearance.
df = df.assign(
    status_label=df['status_label'].map({'alive': 1, 'failed': 0}),
    net_profit_margin_category=df['net_profit_margin_category'].map({
        'Low Profit Margin': 0,
        'Healthy Profit Margin': 1,
        'High Profit Margin': 2,
    }),
    company_name=df['company_name'].map({
        name: code for code, name in enumerate(df['company_name'].unique())
    }),
)

In [51]:
from sklearn.cluster import AgglomerativeClustering

# Single-linkage agglomerative clustering. compute_distances=True makes the
# fitted model expose `distances_`, which plot_dendrogram reads.
aggc = AgglomerativeClustering(
    compute_distances=True,
    linkage='single',
)

In [52]:
# Fit the clustering on the fully numeric frame (every column is used as a
# feature, unscaled).
clusters = aggc.fit(df)

In [53]:
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a SciPy-style linkage matrix with one row per merge
    (child a, child b, merge distance, number of samples in the merged
    cluster) and passes it to ``shc.dendrogram``.

    Parameters
    ----------
    model : object with ``children_``, ``distances_`` and ``labels_``
        The fitted clustering model.
    **kwargs
        Forwarded unchanged to ``shc.dendrogram``.

    Returns
    -------
    None
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples under each merge. Indices below `leaf_total`
    # are single samples; larger indices point at an earlier merge row.
    subtree_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        subtree_sizes[row] = sum(
            1 if node < leaf_total else subtree_sizes[node - leaf_total]
            for node in pair
        )

    linkage_matrix = np.column_stack(
        [merges, model.distances_, subtree_sizes]
    ).astype(float)

    shc.dendrogram(linkage_matrix, **kwargs)

In [None]:
# Draw the dendrogram of the fitted clustering on a 10x5 figure without grid
# lines. truncate_mode='level' with p=30 is passed through to the SciPy
# dendrogram call to limit how much of the tree is drawn.
plt.figure(figsize=(10, 5))
plt.grid(False)
plot_dendrogram(aggc, truncate_mode='level', p=30)
plt.show()