In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
#Get CSV from Drive
!cp drive/MyDrive/machineLearning/Analizsiz_v2.0.csv ./

In [1]:
%%capture
!pip install prince
!pip install kmodes

In [7]:
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from math import log10, log2, log, sqrt, pow

from matplotlib import cm

from scipy.stats import mode

from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA, TruncatedSVD, FactorAnalysis, FastICA
from sklearn.linear_model import BayesianRidge, LinearRegression, SGDRegressor, ARDRegression, HuberRegressor, QuantileRegressor
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import normalized_mutual_info_score, rand_score, adjusted_rand_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor

from xgboost import XGBRegressor

from kmodes.kmodes import KModes
from prince import FAMD, MCA

%matplotlib inline

In [35]:
# Load the CSV that the earlier `!cp` cell copied into the working directory.
dataset = pd.read_csv("./Analizsiz_v2.0.csv")

### Preparing dataset for use

In [36]:
def to_cats(dataframe, col_list):
  """Convert the listed columns of ``dataframe`` to pandas ``category`` dtype.

  The frame is modified in place, one column at a time, and the same object
  is returned so the call can be used in an assignment.
  """
  for name in col_list:
    as_category = dataframe[name].astype("category")
    dataframe[name] = as_category
  return dataframe

In [37]:
def get_years(datetime_str):
  """Return the first four characters of ``datetime_str`` parsed as an int (the year)."""
  year_text = datetime_str[0:4]
  return int(year_text)

In [38]:
# Derive the year from the leading "YYYY" of each creationdate string.
# Series.map visits one value per row; DataFrame.apply(axis=1) would build a
# full row object for every record just to read a single field.
dataset["creationyear"] = dataset["creationdate"].map(get_years)

In [39]:
# Columns that are converted to the pandas ``category`` dtype.
cat_cols = [
    "classification",
    "positiondescription",
    "newpositiongroupcode",
    "companyId",
    "SalaryType",
    "mainsectorname",
    "workerType",
    "salarycount",
    "level",
    "getsBonus",
    "creationyear",
]
dataset = to_cats(dataset, cat_cols)

In [40]:
dataset.head(10) # TODO: date -> numeric (unix time); find the minimum and take the difference in days

Unnamed: 0,classification,positiondescription,newpositiongroupcode,companyId,SalaryType,salarycount,creationdate,Price,mainsectorname,PriceYear,netsalary,netsalary2022,workerType,level,getsBonus,creationyear
0,Satış - Pazarlama,Satış Aplikasyon Sorumlusu,B2,315643,Net,12,2019-10-11 16:48:20.756,3200,Ticaret,38400,38400,67260,B,2,0,2019
1,Teknik Hizmetler,Dizayn Ofis Şefi,B3,298022,Net,12,2019-10-11 16:48:25.168,3500,Yapı,42000,42000,73565,B,3,0,2019
2,Müşteri Hizmetleri,Müşteri Temsilcisi,B1,279905,Net,12,2019-10-11 16:48:43.345,2020,Telekomünikasyon,24240,24240,42458,B,1,0,2019
3,Teknik Hizmetler,Teknik Koordinatör,B4,273851,Net,12,2019-10-11 16:48:44.696,5250,Hizmet,63000,63000,110348,B,4,0,2019
4,Bilgi Teknolojileri,Grafik Tasarım Sorumlusu,B2,266120,Net,12,2019-10-11 16:52:26.743,3500,Sigortacılık,42000,42000,73565,B,2,0,2019
5,Elektrik / Elektronik,Elektronik Bakım Onarım Elemanı,M1,279546,Net,12,2019-10-11 16:52:30.392,2550,Yapı,30600,30600,53597,M,1,0,2019
6,Otomotiv,Galeri Müdürü,B4,316837,Net,12,2019-10-11 16:54:43.727,5000,Gıda,60000,60000,105094,B,4,0,2019
7,Tekstil,Varyant Sorumlusu,M3,278283,Net,12,2019-10-11 16:54:58.345,2400,Tekstil,28800,28800,50445,M,3,0,2019
8,Eğitim,Matematik Öğretmeni,B3,260690,Net,12,2019-10-11 16:55:05.500,3000,Ticaret,36000,36000,63056,B,3,0,2019
9,Depolama / Dağıtım,Dağıtım Elemanı,M1,272313,Net,12,2019-10-11 16:55:13.888,2640,Hızlı Tüketim Malları,31680,31680,55489,M,1,0,2019


In [41]:
# Show each column's dtype and non-null count after the categorical conversion.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441627 entries, 0 to 441626
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   classification        441627 non-null  category
 1   positiondescription   441627 non-null  category
 2   newpositiongroupcode  441627 non-null  category
 3   companyId             441627 non-null  category
 4   SalaryType            441627 non-null  category
 5   salarycount           441627 non-null  category
 6   creationdate          441627 non-null  object  
 7   Price                 441627 non-null  int64   
 8   mainsectorname        441627 non-null  category
 9   PriceYear             441627 non-null  int64   
 10  netsalary             441627 non-null  int64   
 11  netsalary2022         441627 non-null  int64   
 12  workerType            441627 non-null  category
 13  level                 441627 non-null  category
 14  getsBonus             441627 non-nul

In [42]:
# One boolean per column: True if that column contains any missing value.
dataset.isna().any()

classification          False
positiondescription     False
newpositiongroupcode    False
companyId               False
SalaryType              False
salarycount             False
creationdate            False
Price                   False
mainsectorname          False
PriceYear               False
netsalary               False
netsalary2022           False
workerType              False
level                   False
getsBonus               False
creationyear            False
dtype: bool

In [43]:
#len([val for val in dataset.companyId.value_counts() if val > 89])

In [44]:
def group_company(com_id, id_mask):
  """Return ``com_id`` unchanged, or -2 when ``id_mask`` is below 99.

  ``id_mask`` is compared against the fixed threshold 99; -2 is the
  placeholder returned for anything under that threshold.
  """
  return -2 if id_mask < 99 else com_id

In [45]:
# For every row, the number of rows that share its companyId.
counts_col = dataset.groupby("companyId")["companyId"].transform("size")
mask = counts_col
dataset["mask"] = counts_col

# Replace the id of companies below group_company's row-count threshold with
# its placeholder value. Iterating the two columns directly avoids building a
# row object per record, which DataFrame.apply(axis=1) does.
dataset["companyId"] = [
    group_company(com_id, row_count)
    for com_id, row_count in zip(dataset["companyId"], dataset["mask"])
]

In [46]:
#len([val for val in dataset.companyId.value_counts() if val > 100])

In [47]:
#len(dataset[dataset["mask"] > 70])

In [48]:
# Snapshot of the frame before the log-transformed target is added.
raw_dataset = dataset.copy()

# Base-10 log of the netsalary2022 column. Series.map applies math.log10 to
# each value (still raising on non-positive input) without the per-row
# overhead of DataFrame.apply(axis=1).
dataset["netsalary2022_log"] = dataset["netsalary2022"].map(log10)

### Clustering

In [None]:
# Empty container for clustering results.
# NOTE(review): nothing in the visible cells writes to it — confirm it is still needed.
cluster_results = {}

In [None]:
clustering_cols = ["classification", "mainsectorname", "workerType", "level", "companyId"] #, "positiondescription"]

# Columns converted to pandas' "string" dtype; "level" keeps its category dtype.
string_cols = ["classification", "mainsectorname", "workerType", "companyId"]

# astype returns a new frame, so df_rest is an independent object rather than
# a slice of `dataset` that is then assigned into column by column (which
# triggers SettingWithCopyWarning).
df_rest = dataset[clustering_cols].astype({col: "string" for col in string_cols})

In [52]:
# Print the number of distinct values of each column, one per line, in this order.
for column_name in ("mainsectorname", "classification", "positiondescription", "companyId"):
    print(dataset[column_name].nunique())

41
53
9195
592


In [None]:
def make_clustering(data, method):
    """Cluster ``data`` with the requested method.

    Parameters
    ----------
    data: array-like, shape(n_instances, n_features)
        data to cluster; converted with ``np.array`` before clustering
    method: str
        only ``"squeeze"`` is implemented; it runs ``squeezer`` with a fixed
        threshold of 2

    Returns
    -------
    The value returned by ``squeezer`` (also printed) for ``"squeeze"``,
    ``None`` for any other method.
    """
    if method != "squeeze":
        # No other method is implemented here.
        return None

    threshold = 2
    clusters = squeezer(np.array(data), threshold)
    print(clusters)
    return clusters

    

In [None]:
def kmodes_regular(data, params):
    """Fit a KModes model on ``data`` and return the cluster label of each row.

    Parameters
    ----------
    data: array-like, shape(n_instances, n_features)
        categorical data to cluster
    params: dict
        keyword arguments for ``KModes``; when empty or ``None`` a
        two-cluster model is used

    Returns
    -------
    labels: the result of ``KModes.fit_predict(data)``
    """
    if not params:
        params = {"n_clusters": 2}

    clustering_model = KModes(**params)
    # Fit on the argument, not on the notebook-level `df_rest`, so the function
    # clusters whatever frame the caller passes in.
    return clustering_model.fit_predict(data)

In [53]:
"""
    This algorithm is an implementation of algorithms in the following paper:
    "Clustering Mixed Numeric and Categorical Data: A Cluster Ensemble 
    Approach"
"""

def get_support(data,feature_id,feature_val,cluster):
    """Count the members of ``cluster`` whose value in column ``feature_id``
    of ``data`` equals ``feature_val``.

    ``cluster`` holds row indices into ``data``; the result is an int
    (0 for an empty cluster).
    """
    return sum(1 for row_id in cluster if data[row_id,feature_id] == feature_val)

def similarity_instance_cluster(data,instance_id,cluster):
    """Compute the similarity between the instance ``data[instance_id]`` and
    the cluster given by ``cluster``.

    For every feature, the fraction of cluster members that share the
    instance's value is added up, so the result lies between 0 and
    n_features.

    Parameters
    ----------
    data: array, shape(n_instances,n_features)
        matrix containing original data
    instance_id: int
        row number of the new instance
    cluster: list
        a list containing the ids of instances in this cluster

    Returns
    -------
    sim: float
        the similarity between the input instance and input cluster

    Raises
    ------
    ZeroDivisionError
        if ``cluster`` is empty and ``data`` has at least one feature
    """
    _, n_features = data.shape
    # Every member contributes to exactly one value's support, so the total
    # support over a feature's distinct values is the cluster size; there is no
    # need to collect the distinct values and count each of them again.
    # (Assumes values compare equal to themselves, i.e. no NaN entries.)
    cluster_size = len(cluster)
    sim = 0.0

    for i in range(n_features):
        target = data[instance_id,i]
        support = sum(1 for member in cluster if data[member,i] == target)
        sim = sim+support*1.0/cluster_size
    return sim

def squeezer(data,thre):
    """Cluster categorical data in a single pass, following the paper
    "Squeezer: An Efficient Algorithm for Clustering Categorical Data".

    Instances are visited in row order. Each one joins the existing cluster it
    is most similar to when that similarity reaches ``thre``; otherwise it
    starts a new cluster. The similarity to a cluster is the sum, over all
    features, of the fraction of cluster members sharing the instance's value.
    When several clusters tie for the highest similarity, the one created
    last is chosen.

    Parameters
    ----------
    data: array, shape(n_instances,n_features)
        the original data that needs to be clustered; the number of clusters
        does not have to be specified. Values must be hashable and are
        assumed to compare equal to themselves (no NaN entries).
    thre: threshold used to decide if creating a new cluster is necessary

    Returns
    -------
    label: list of lists
        label[i] represents cluster i and contains the row ids of the
        instances assigned to it. Empty input yields an empty list.
    """
    n_instances,n_features = data.shape
    if n_instances == 0:
        return []

    # The first instance seeds the first cluster.
    label = [[0]]
    # summaries[j][f] maps each value of feature f to the number of members of
    # cluster j holding it, so a similarity costs one lookup per feature
    # instead of a scan over every member of the cluster.
    summaries = [[{value: 1} for value in data[0].tolist()]]

    for i in range(1,n_instances):
        row = data[i].tolist()

        best_sim = None
        best_cluster = 0
        for j, summary in enumerate(summaries):
            size = len(label[j])
            sim = 0.0
            for f in range(n_features):
                sim = sim+summary[f].get(row[f], 0)*1.0/size
            # ">=" keeps the last cluster among those tied for the maximum.
            if best_sim is None or sim >= best_sim:
                best_sim = sim
                best_cluster = j

        if best_sim>=thre:
            label[best_cluster].append(i)
            summary = summaries[best_cluster]
            for f in range(n_features):
                summary[f][row[f]] = summary[f].get(row[f], 0)+1
        else:
            label.append([i])
            summaries.append([{value: 1} for value in row])

    return label

def squeeze_wrapper(data, threshold):
    """Convert ``data`` to a NumPy array and run ``squeezer`` on it.

    Returns whatever ``squeezer`` returns for the given ``threshold``.
    """
    as_array = np.array(data)
    return squeezer(as_array, threshold)

In [None]:
# NOTE(review): `sqz_data` and `thre` are not defined in any visible cell, and
# `new_data` is only assigned further down the notebook — this cell depends on
# leftover kernel state and will fail on a fresh Restart & Run All.
results = squeeze_wrapper(sqz_data, thre)
# Clusters differ in length, so the array is built with dtype=object; newer
# NumPy versions refuse to build an array from ragged lists otherwise.
np_res = np.array(results, dtype=object)

# Positional row id used to look up each row's cluster. The guard lets the
# cell be re-run without `insert` raising on the already-existing column.
if 'numid' not in new_data.columns:
  new_data.insert(0, 'numid', range(0, len(new_data)))

# instance id -> index of the cluster that contains it, built once so each
# lookup is a dictionary access instead of a scan over every cluster.
cluster_of_instance = {
  instance_id: clust_id
  for clust_id, cluster in enumerate(results)
  for instance_id in cluster
}

def squeeze_to_ids(df_id):
  """Return the index of the cluster containing ``df_id``, or -1 if none does."""
  return cluster_of_instance.get(df_id, -1)

new_data["cluster_id"] = new_data["numid"].map(squeeze_to_ids)

In [56]:
def squeezer_elbow(data, thresholds):
    """Run the squeezer clustering for several thresholds and plot the
    Davies-Bouldin and Calinski-Harabasz score obtained for each one.

    Parameters
    ----------
    data: DataFrame
        categorical data to cluster; must contain the columns listed in
        ``gatti`` below
    thresholds: iterable of numbers
        squeezer thresholds to evaluate

    Returns
    -------
    results: dict
        ``{"db": [...], "ch": [...]}`` with one score per threshold, in the
        order of ``thresholds``. A threshold that yields fewer than 2 clusters,
        or one cluster per sample, gets NaN because both scores are undefined
        there.
    """
    results = {
        "db": [],
        "ch": []
    }

    # The scores need numeric input: label-encode the listed columns and
    # min-max scale the whole frame. This copy is used for scoring only.
    le = LabelEncoder()
    gatti = ["classification", "mainsectorname", "workerType", "companyId"] #, "positiondescription"]
    encoded_df = data.copy()

    for gatto in gatti:
        encoded_df[gatto] = le.fit_transform(encoded_df[gatto])

    scaler = MinMaxScaler()
    encoded_df = scaler.fit_transform(encoded_df)
    n_samples = len(encoded_df)

    for i in thresholds:    # one clustering per threshold value
        clusters = squeeze_wrapper(data, i)

        # squeeze_wrapper returns one list of row ids per cluster; the scoring
        # functions expect one cluster label per row instead.
        labels = np.empty(n_samples, dtype=int)
        for cluster_id, members in enumerate(clusters):
            labels[members] = cluster_id

        # Both scores require between 2 and n_samples - 1 distinct labels.
        if 2 <= len(clusters) < n_samples:
            db_score = davies_bouldin_score(encoded_df, labels)
            ch_score = calinski_harabasz_score(encoded_df, labels)
        else:
            db_score = float("nan")
            ch_score = float("nan")

        results["db"].append(db_score)
        results["ch"].append(ch_score)

        print(f"Results obtained for threshold: {i}")

    # The x axis holds the thresholds, not a number of clusters.
    plt.plot(thresholds, results["db"], marker='o')
    plt.xlabel('Eşik değeri (threshold)')
    plt.ylabel('DB Scores')
    plt.show()

    plt.plot(thresholds, results["ch"], marker='o')
    plt.xlabel('Eşik değeri (threshold)')
    plt.ylabel('CH Scores')
    plt.show()

    return results

In [None]:
# Evaluate the squeezer thresholds 1, 2 and 3 on the clustering frame.
th_range = np.arange(1, 4)
squeezer_elbow(df_rest, th_range)

In [33]:
# Number of values in a 0.25-step grid from 0.25 up to (excluding) 5.25.
# NOTE(review): the saved output below shows the array itself rather than its
# length, so it appears to come from an earlier version of this cell.
len(np.arange(0.25, 5.25, 0.25))

array([0.25, 0.5 , 0.75, 1.  , 1.25, 1.5 , 1.75, 2.  , 2.25, 2.5 , 2.75,
       3.  , 3.25, 3.5 , 3.75, 4.  , 4.25, 4.5 , 4.75, 5.  ])

In [None]:
def kmodes_elbow(data, k_vals):
    """Fit KModes for several cluster counts and plot the model cost and the
    Davies-Bouldin and Calinski-Harabasz score obtained for each one.

    Parameters
    ----------
    data: DataFrame
        categorical data to cluster
    k_vals: iterable of int
        numbers of clusters to evaluate

    Returns
    -------
    results: dict
        ``{"cost": [...], "db": [...], "ch": [...]}`` with one entry per value
        of ``k_vals``, in order. A score is NaN when the fitted labels contain
        fewer than 2 distinct clusters or one cluster per sample, because both
        scores are undefined there.
    """
    results = {
        "cost": [],
        "db": [],
        "ch": []
    }

    # The scores need numeric input: label-encode the listed columns and
    # min-max scale the whole frame. This copy is used for scoring only.
    le = LabelEncoder()
    gatti = ["classification", "mainsectorname", "workerType", "companyId", "positiondescription"]
    encoded_df = data.copy()

    for gatto in gatti:
        # Not every frame carries all of these columns (df_rest has no
        # "positiondescription"); encode only the ones that are present.
        if gatto in encoded_df.columns:
            encoded_df[gatto] = le.fit_transform(encoded_df[gatto])

    scaler = MinMaxScaler()
    encoded_df = scaler.fit_transform(encoded_df)
    n_samples = len(encoded_df)

    for i in k_vals:    # one model per k value
        model = KModes(n_clusters=i,
                       init='Cao',         # initialisation method
                       n_init=15,
                       # verbose=1,
                       # n_jobs=-1,        # -1 means all cores
                       random_state=1,     # seed for the random number generator
                       max_iter=500)

        labels = model.fit_predict(data)
        results["cost"].append(model.cost_)

        # Both scores require between 2 and n_samples - 1 distinct labels.
        if 2 <= len(np.unique(labels)) < n_samples:
            db_score = davies_bouldin_score(encoded_df, labels)
            ch_score = calinski_harabasz_score(encoded_df, labels)
        else:
            db_score = float("nan")
            ch_score = float("nan")

        results["db"].append(db_score)
        results["ch"].append(ch_score)


    plt.plot(k_vals, results["cost"], marker='o')
    plt.xlabel('Küme sayısı (k)')
    plt.ylabel('Costs')
    plt.show()

    plt.plot(k_vals, results["db"], marker='o')
    plt.xlabel('Küme sayısı (k)')
    plt.ylabel('DB Scores')
    plt.show()

    plt.plot(k_vals, results["ch"], marker='o')
    plt.xlabel('Küme sayısı (k)')
    plt.ylabel('CH Scores')
    plt.show()

    return results

In [None]:
# Keyword arguments passed to KModes for the main clustering run.
kmodes_params = dict(
    n_clusters=6,
    init="Cao",
    n_init=15,
    max_iter=500,
)

In [None]:
# Cluster the categorical feature frame with the KModes settings defined above.
clustering_labels = kmodes_regular(df_rest, kmodes_params)

In [None]:
# Store each row's cluster label on the main frame.
dataset["kmodes_clusters"] = clustering_labels

In [None]:
# Plain least-squares baseline.
linreg_model = LinearRegression()

# Gradient-boosted trees.
# NOTE(review): "gpu_hist" presumably needs a GPU runtime, and XGBoost's
# scikit-learn wrapper documents `verbosity` rather than `verbose` — confirm
# both against the installed XGBoost version.
xgb_params = dict(
    objective='reg:squarederror',
    enable_categorical=True,
    n_estimators=1000,
    max_depth=7,
    eta=0.3,  # learning_rate = 0.1
    min_child_weight=6,
    gamma=0.1,
    subsample=0.65,
    colsample_bytree=0.7,
    reg_alpha=0.01,
    tree_method="gpu_hist",
    verbose=True,
)
xgb_model = XGBRegressor(**xgb_params)

# Three hidden layers of 32 units, trained with Adam. Alternatives that were
# tried are kept as comments: solver="sgd" with learning_rate="adaptive",
# learning_rate_init=0.0001, batch_size=128, random_state=42.
mlp_params = dict(
    hidden_layer_sizes=(32, 32, 32),
    solver="adam",
    learning_rate_init=0.0005,
    activation="relu",
    alpha=0.1,
    max_iter=40,
    n_iter_no_change=5,
    verbose=True,
)
mlp_model = MLPRegressor(**mlp_params)

# Name -> estimator, in the order the models are evaluated.
models = dict(
    xgboost=xgb_model,
    mlp=mlp_model,
    linreg=linreg_model,
)

In [None]:
def pred_scores(predictions, y_real):
  """Print and return R-squared, RMSE, MAE and MAPE (in percent) of ``predictions``.

  Each metric is printed rounded to 6 decimals right after it is computed;
  the unrounded values are returned as ``[r2, rmse, mae, mape]``.
  """
  r2_val = r2_score(y_real, predictions)
  print(f"R-squared score: {round(r2_val, 6)}")

  rmse = np.sqrt(mean_squared_error(y_real, predictions))
  print(f"RMSE: {round(rmse, 6)}")

  mae = mean_absolute_error(y_real, predictions)
  print(f"MAE: {round(mae, 6)}")

  mape = mean_absolute_percentage_error(y_real, predictions) * 100
  print(f"MAPE: {round(mape, 6)} %")

  return [r2_val, rmse, mae, mape]

In [None]:
def test_models(models, data, test_size=0.2, random_state=None):
    """Train and score every model on one train/test split of ``data``.

    The target is the ``netsalary2022_log`` column. Nominal features are
    one-hot encoded and ordinal features ordinal-encoded inside a pipeline
    with each model. The training target is min-max scaled, and predictions
    are mapped back through the same scaler before scoring, so all metrics are
    expressed in log10 units of the target.

    Parameters
    ----------
    models: dict
        name -> estimator; each estimator is fitted in place
    data: DataFrame
        must contain the feature columns listed below and ``netsalary2022_log``
    test_size: float, default 0.2
        fraction of rows held out for testing
    random_state: int or None, default None
        seed for the split; None keeps the split random on every call

    Returns
    -------
    all_results: list
        one ``[r2, rmse, mae, mape, model_name]`` list per model, in the order
        of ``models``
    """
    all_results = []

    # Target - Predictor Variable Split
    feature_cols = ["classification", "level", "mainsectorname", "workerType", "companyId", "salarycount"]#, "SalaryType", "creationyear"]
    ordinal_cols = ["level", "salarycount"]#, "creationyear"]
    nominal_cols = ["classification", "mainsectorname", "workerType", "companyId"]#, "SalaryType"]
    features = data[feature_cols]
    target = data["netsalary2022_log"]

    # OneHot and Ordinal Encoders
    # Newer scikit-learn releases name the dense-output switch `sparse_output`;
    # older ones only know `sparse`. Try the new name first.
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
    ode = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    column_transformer = make_column_transformer((ohe, nominal_cols), (ode, ordinal_cols))

    # Train - Test Split
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state)

    # MinMax Scaling for Target Variable on Training Data
    # The scaled target is kept 1-D: estimators expect y of shape (n_samples,)
    # and warn (DataConversionWarning) when handed a column vector.
    scaler = MinMaxScaler()
    Y_train = scaler.fit_transform(np.asarray(Y_train).reshape(-1, 1)).ravel()
    Y_test = np.asarray(Y_test).flatten()

    for model_name, model in models.items():
        model_pipeline = make_pipeline(column_transformer, model)
        model_pipeline.fit(X_train, Y_train)

        # Undo the min-max scaling so predictions are comparable with Y_test.
        predictions = model_pipeline.predict(X_test)
        predictions = scaler.inverse_transform(predictions.reshape(-1, 1)).flatten()

        print(f"Results for {model_name} :")
        results = pred_scores(predictions, Y_test)

        results.append(model_name)

        all_results.append(results)

    return all_results

In [None]:
# One row per KModes cluster: the cluster id followed by the four scores
# (R-squared, RMSE, MAE, MAPE) of every model, in the order test_models
# returns them.
cluster_res = []

for cluster_id in dataset["kmodes_clusters"].unique():
    new_data = dataset[dataset["kmodes_clusters"] == cluster_id]
    test_results = test_models(models, new_data)

    new_res = [cluster_id]
    for model in test_results:
        # The fifth entry of each result is the model name; keep only the scores.
        new_res.extend(model[:4])

    cluster_res.append(new_res)

Results for xgboost :
R-squared score: 0.396587
RMSE: 0.192835
MAE: 0.141252
MAPE: 2.806721 %


  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.01426399
Iteration 2, loss = 0.00731793
Iteration 3, loss = 0.00656702
Iteration 4, loss = 0.00630998
Iteration 5, loss = 0.00619419
Iteration 6, loss = 0.00612690
Iteration 7, loss = 0.00609711
Iteration 8, loss = 0.00608538
Iteration 9, loss = 0.00606895
Iteration 10, loss = 0.00606565
Iteration 11, loss = 0.00605945
Training loss did not improve more than tol=0.000100 for 5 consecutive epochs. Stopping.
Results for mlp :
R-squared score: 0.348936
RMSE: 0.200305
MAE: 0.151291
MAPE: 3.016218 %
Results for linreg :
R-squared score: -108357994137374.67
RMSE: 2584103.346599
MAE: 23992.255921
MAPE: 473403.729187 %
Results for xgboost :
R-squared score: 0.38431
RMSE: 0.17165
MAE: 0.113397
MAPE: 2.285145 %


  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.01880911
Iteration 2, loss = 0.00934210
Iteration 3, loss = 0.00777229
Iteration 4, loss = 0.00703683
Iteration 5, loss = 0.00660414
Iteration 6, loss = 0.00633572
Iteration 7, loss = 0.00617020
Iteration 8, loss = 0.00604840
Iteration 9, loss = 0.00598424
Iteration 10, loss = 0.00594002
Iteration 11, loss = 0.00590650
Iteration 12, loss = 0.00588691
Iteration 13, loss = 0.00587860
Iteration 14, loss = 0.00586263
Training loss did not improve more than tol=0.000100 for 5 consecutive epochs. Stopping.
Results for mlp :
R-squared score: 0.333583
RMSE: 0.178581
MAE: 0.12468
MAPE: 2.520999 %
Results for linreg :
R-squared score: 0.353559
RMSE: 0.175884
MAE: 0.117201
MAPE: 2.360524 %
Results for xgboost :
R-squared score: 0.350967
RMSE: 0.172457
MAE: 0.122103
MAPE: 2.457424 %


  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.10237396
Iteration 2, loss = 0.02388751
Iteration 3, loss = 0.01922557
Iteration 4, loss = 0.01681520
Iteration 5, loss = 0.01533119
Iteration 6, loss = 0.01433405
Iteration 7, loss = 0.01361386
Iteration 8, loss = 0.01307376
Iteration 9, loss = 0.01263860
Iteration 10, loss = 0.01227251
Iteration 11, loss = 0.01196407
Iteration 12, loss = 0.01168931
Iteration 13, loss = 0.01144269
Iteration 14, loss = 0.01121932
Iteration 15, loss = 0.01101424
Iteration 16, loss = 0.01084076
Iteration 17, loss = 0.01065501
Iteration 18, loss = 0.01048042
Iteration 19, loss = 0.01032161
Iteration 20, loss = 0.01017940
Iteration 21, loss = 0.01004175
Iteration 22, loss = 0.00992013
Iteration 23, loss = 0.00979170
Iteration 24, loss = 0.00968019
Iteration 25, loss = 0.00958910
Iteration 26, loss = 0.00946856
Iteration 27, loss = 0.00936808
Iteration 28, loss = 0.00931899
Iteration 29, loss = 0.00918694
Iteration 30, loss = 0.00911109
Iteration 31, loss = 0.00904112
Iteration 32, los

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.06095197
Iteration 2, loss = 0.02263089
Iteration 3, loss = 0.01820321
Iteration 4, loss = 0.01599523
Iteration 5, loss = 0.01470572
Iteration 6, loss = 0.01387301
Iteration 7, loss = 0.01331235
Iteration 8, loss = 0.01289737
Iteration 9, loss = 0.01257341
Iteration 10, loss = 0.01229306
Iteration 11, loss = 0.01205195
Iteration 12, loss = 0.01184969
Iteration 13, loss = 0.01167601
Iteration 14, loss = 0.01149982
Iteration 15, loss = 0.01135667
Iteration 16, loss = 0.01122806
Iteration 17, loss = 0.01107512
Iteration 18, loss = 0.01096930
Iteration 19, loss = 0.01083972
Iteration 20, loss = 0.01073989
Iteration 21, loss = 0.01065111
Iteration 22, loss = 0.01054917
Iteration 23, loss = 0.01048003
Iteration 24, loss = 0.01039222
Iteration 25, loss = 0.01030920
Iteration 26, loss = 0.01028593
Iteration 27, loss = 0.01018865
Iteration 28, loss = 0.01012585
Training loss did not improve more than tol=0.000100 for 5 consecutive epochs. Stopping.
Results for mlp :
R-squa

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.02301659
Iteration 2, loss = 0.01165511
Iteration 3, loss = 0.00907055
Iteration 4, loss = 0.00814877
Iteration 5, loss = 0.00776342
Iteration 6, loss = 0.00754566
Iteration 7, loss = 0.00740615
Iteration 8, loss = 0.00731414
Iteration 9, loss = 0.00725033
Iteration 10, loss = 0.00719056
Iteration 11, loss = 0.00715635
Iteration 12, loss = 0.00711641
Iteration 13, loss = 0.00710173
Training loss did not improve more than tol=0.000100 for 5 consecutive epochs. Stopping.
Results for mlp :
R-squared score: 0.338599
RMSE: 0.201417
MAE: 0.151503
MAPE: 3.011498 %
Results for linreg :
R-squared score: -1.793492278892873e+17
RMSE: 104884907.435603
MAE: 1395677.016403
MAPE: 26837678.928173 %
Results for xgboost :
R-squared score: 0.33258
RMSE: 0.170709
MAE: 0.128064
MAPE: 2.565305 %


  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.04107939
Iteration 2, loss = 0.02086694
Iteration 3, loss = 0.01712788
Iteration 4, loss = 0.01509694
Iteration 5, loss = 0.01372780
Iteration 6, loss = 0.01275062
Iteration 7, loss = 0.01201098
Iteration 8, loss = 0.01141944
Iteration 9, loss = 0.01094195
Iteration 10, loss = 0.01054111
Iteration 11, loss = 0.01019864
Iteration 12, loss = 0.00988920
Iteration 13, loss = 0.00962880
Iteration 14, loss = 0.00940330
Iteration 15, loss = 0.00917847
Iteration 16, loss = 0.00900971
Iteration 17, loss = 0.00884766
Iteration 18, loss = 0.00870033
Iteration 19, loss = 0.00856592
Iteration 20, loss = 0.00842863
Iteration 21, loss = 0.00831200
Iteration 22, loss = 0.00822775
Iteration 23, loss = 0.00812697
Iteration 24, loss = 0.00805567
Iteration 25, loss = 0.00798347
Iteration 26, loss = 0.00790371
Iteration 27, loss = 0.00783442
Iteration 28, loss = 0.00778910
Iteration 29, loss = 0.00773546
Training loss did not improve more than tol=0.000100 for 5 consecutive epochs. St

In [None]:
# Tabulate the per-cluster scores: column 0 is the cluster id, followed by
# four score columns per model.
deneme_df = pd.DataFrame(cluster_res)

In [None]:
# Display the per-cluster score table.
deneme_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0,0.396587,0.192835,0.141252,2.806721,0.348936,0.200305,0.151291,3.016218,-108358000000000.0,2584103.0,23992.26,473403.7
1,1,0.38431,0.17165,0.113397,2.285145,0.333583,0.178581,0.12468,2.520999,0.3535588,0.1758838,0.1172007,2.360524
2,4,0.350967,0.172457,0.122103,2.457424,0.333684,0.174738,0.122513,2.457297,-8.309217e+18,617060100.0,30667790.0,622947700.0
3,5,0.3004,0.206941,0.160378,3.170471,0.287432,0.20885,0.162439,3.200366,-53637970000000.0,1812001.0,103090.6,2055535.0
4,2,0.35891,0.1983,0.146371,2.906604,0.338599,0.201417,0.151503,3.011498,-1.793492e+17,104884900.0,1395677.0,26837680.0
5,3,0.33258,0.170709,0.128064,2.565305,0.323514,0.171864,0.129681,2.595944,-237704100000000.0,3221623.0,256456.6,5177972.0


In [None]:
# Score the same models on the whole dataset, without splitting by cluster.
test_results = test_models(models, dataset)

Results for xgboost :
R-squared score: 0.420888
RMSE: 0.18333
MAE: 0.132564
MAPE: 2.648618 %


  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.01195069
Iteration 2, loss = 0.00559699
Iteration 3, loss = 0.00477198
Iteration 4, loss = 0.00457227
Iteration 5, loss = 0.00452161
Iteration 6, loss = 0.00450946
Iteration 7, loss = 0.00449979
Iteration 8, loss = 0.00450051
Iteration 9, loss = 0.00449871
Iteration 10, loss = 0.00449879
Training loss did not improve more than tol=0.000100 for 5 consecutive epochs. Stopping.
Results for mlp :
R-squared score: 0.352136
RMSE: 0.193908
MAE: 0.143814
MAPE: 2.875904 %
Results for linreg :
R-squared score: 0.363216
RMSE: 0.192242
MAE: 0.141468
MAPE: 2.827154 %


In [None]:
# Display the whole-dataset results: [r2, rmse, mae, mape, model_name] per model.
test_results

[[0.4208879208984483,
  0.1833302687776724,
  0.1325639880869777,
  2.6486177382984053,
  'xgboost'],
 [0.3521355171059649,
  0.19390765275877606,
  0.1438140494589933,
  2.8759038376694934,
  'mlp'],
 [0.3632161970356137,
  0.1922422618332248,
  0.14146835991080942,
  2.827153649593235,
  'linreg']]