In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [3]:
names = [
    "DA_TY_1", "DA_TY_2", "DA_TY_3", "DA_TY_4",    
    "DA_LY_1", "DA_LY_2", "DA_LY_3", "DA_LY_4",    
    "DA_T3_TMIN"  , "DA_T3_TMEAN",
    "DA_T3_BENDS" , "DA_T3_FIN",
    "DA_T12_TMIN" , "DA_T12_TMEAN",
    "DA_T12_BENDS", "DA_T12_FIN",
    "DB_TY_1", "DB_TY_2", "DB_TY_3", "DB_TY_4",    
    "DB_LY_1", "DB_LY_2", "DB_LY_3", "DB_LY_4",    
    "DB_T3_TMIN"  , "DB_T3_TMEAN",
    "DB_T3_BENDS" , "DB_T3_FIN",
    "DB_T12_TMIN" , "DB_T12_TMEAN",
    "DB_T12_BENDS", "DB_T12_FIN",
    "A_POS", "B_POS",
    "RES"
]

In [4]:
features = [
    "TY_1", "TY_4",    
    "LY_1", "LY_4",    
    "T3_TMIN"  , "T3_TMEAN",
    "T3_BENDS" , "T3_FIN",
    "T12_TMIN" , "T12_TMEAN",
    "T12_BENDS", "T12_FIN"
]

In [5]:
def normalize(s):
    if s[0] == 0 or s[0] == s[1]:
        return 0.0
    else:
        return s[0] / (s[0] + s[1])

In [7]:
def weight(data):
    data = np.array(data)
    return np.log(data[1]/data[0]) ** 2

In [8]:
df["W"] = df[["A_POS", "B_POS"]].apply(weight, axis=1)

In [9]:
X = pd.DataFrame(columns=features)
for x_col in features:
    X[x_col] = df[["DA_"+x_col, "DB_"+x_col]].apply(normalize, axis=1)

In [10]:
from sklearn.preprocessing import MaxAbsScaler

In [11]:
scaler = MaxAbsScaler()
x_scaled = scaler.fit_transform(X)

In [12]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, df["RES"], test_size=0.33, random_state=42)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [12]:
rfc = RandomForestClassifier(n_estimators=50, min_samples_leaf=100)
gbc = GradientBoostingClassifier(n_estimators=50, min_samples_leaf=20)
svc = SVC(C=100, gamma=1e-7)
mlp = MLPClassifier(hidden_layer_sizes=(200,), max_iter=500)
knn = KNeighborsClassifier(n_neighbors=20)

679     3.210402
3406    3.210402
4067    1.921812
1211    3.210402
4813    1.921812
4484    0.480453
567     0.480453
1873    1.206949
2620    1.206949
1717    0.839589
1392    0.480453
4709    1.921812
2252    3.210402
489     3.210402
4595    1.206949
690     3.210402
3324    0.260943
602     0.839589
4061    1.206949
3932    0.049793
2464    1.206949
4441    0.480453
1187    0.164402
274     2.590290
3500    3.210402
1361    1.206949
3015    2.590290
1526    3.210402
5323    1.206949
3399    0.480453
          ...   
2905    0.164402
4844    0.480453
4118    2.590290
3386    1.921812
4556    3.210402
1184    3.210402
5052    0.480453
5313    1.206949
2434    1.921812
5613    1.921812
2391    1.921812
769     2.590290
1685    0.164402
130     1.206949
2920    2.590290
3172    2.590290
3445    3.210402
6233    2.590290
5580    3.210402
4427    1.206949
5336    1.921812
466     1.921812
6267    3.210402
5736    3.210402
3093    2.590290
3773    0.480453
5193    3.210402
5228    1.9218

Int64Index([ 679, 3406, 4067, 1211, 4813, 4484,  567, 1873, 2620, 1717,
            ...
            5336,  466, 6267, 5736, 3093, 3773, 5193, 5228, 5392,  860],
           dtype='int64', length=4213)

In [13]:
rfc.fit(X_train, y_train, sample_weight=df["W"].iloc[X_train.index])
gbc.fit(X_train, y_train, sample_weight=df["W"].iloc[X_train.index])
svc.fit(X_train, y_train, sample_weight=df["W"].iloc[X_train.index])

IndexError: positional indexers are out-of-bounds

In [35]:
rfc_pred = rfc.predict_proba(X_test)
gbc_pred = gbc.predict_proba(X_test)

In [57]:
"%0.2f" % ((np.average([rfc_pred[0][0], gbc_pred[0][0]])) * 100)

'27.10'

In [47]:
np.average([rfc_pred[0][1], gbc_pred[0][1]])

0.7290218942412401

In [32]:
print(classification_report(y_test, gbc_pred))

              precision    recall  f1-score   support

           A       0.79      0.91      0.84       533
           B       0.89      0.75      0.81       523

   micro avg       0.83      0.83      0.83      1056
   macro avg       0.84      0.83      0.83      1056
weighted avg       0.84      0.83      0.83      1056



In [33]:
print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

           A       0.73      0.89      0.80       533
           B       0.86      0.66      0.75       523

   micro avg       0.78      0.78      0.78      1056
   macro avg       0.80      0.78      0.78      1056
weighted avg       0.80      0.78      0.78      1056



In [34]:
print(classification_report(y_test, svc_pred))

              precision    recall  f1-score   support

           A       0.50      1.00      0.67       533
           B       0.00      0.00      0.00       523

   micro avg       0.50      0.50      0.50      1056
   macro avg       0.25      0.50      0.34      1056
weighted avg       0.25      0.50      0.34      1056



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
