In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyClassifier

In [2]:
def scale(x_train, x_test):
    scaler = StandardScaler()
    scaler.fit(x_train)

    x_train = pd.DataFrame(scaler.transform(x_train), 
                           index=x_train.index, 
                           columns=x_train.columns)

    x_test = pd.DataFrame(scaler.transform(x_test),
                          index=x_test.index, 
                          columns=x_test.columns)
    return x_train, x_test

In [3]:
dummy_clf = DummyClassifier(strategy="uniform")

## 1. 5-classes classification:

### 1.1. Original set

In [4]:
x = pd.read_csv('../Project_data/processed_data/x_post_preproc.csv', index_col='building_id')
y = pd.read_csv('../Project_data/processed_data/y_post_preproc.csv', index_col='building_id')

In [5]:
x_train, x_test = train_test_split(x, test_size=0.2, random_state=42)
y_train, y_test = y.loc[x_train.index], y.loc[x_test.index]

In [6]:
x_train_norm, x_test_norm = scale(x_train, x_test)

In [7]:
dummy_clf.fit(x_train_norm, y_train.values.ravel())

DummyClassifier(strategy='uniform')

In [8]:
prediction = dummy_clf.predict(x_test_norm)
pred_proba = dummy_clf.predict_proba(x_test_norm)

In [9]:
preds = pd.DataFrame(np.hstack([np.reshape(prediction,(-1,1)), pred_proba]))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_1,0.2,0.2,0.2,0.2,0.2
1,Grade_1,0.2,0.2,0.2,0.2,0.2
2,Grade_2,0.2,0.2,0.2,0.2,0.2
3,Grade_2,0.2,0.2,0.2,0.2,0.2
4,Grade_5,0.2,0.2,0.2,0.2,0.2


In [10]:
preds.to_csv('../Project_data/results/dummy_preds_orig.csv')

### 1.2 Resampled set:

#### 1.2.1 Over-sampling with SMOTENC

In [11]:
x_train = pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc.csv', index_col=0).reset_index(drop=True)
y_train = pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc.csv', index_col=0).reset_index(drop=True)

  mask |= (ar1 == a)


In [12]:
x_train_norm, x_test_norm = scale(x_train, x_test)

In [13]:
dummy_clf.fit(x_train_norm, y_train.values.ravel())

DummyClassifier(strategy='uniform')

In [14]:
prediction = dummy_clf.predict(x_test_norm)
pred_proba = dummy_clf.predict_proba(x_test_norm)

In [15]:
preds = pd.DataFrame(np.hstack([np.reshape(prediction,(-1,1)), pred_proba]))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_2,0.2,0.2,0.2,0.2,0.2
1,Grade_1,0.2,0.2,0.2,0.2,0.2
2,Grade_2,0.2,0.2,0.2,0.2,0.2
3,Grade_2,0.2,0.2,0.2,0.2,0.2
4,Grade_4,0.2,0.2,0.2,0.2,0.2


In [16]:
preds.to_csv('../Project_data/results/dummy_preds_smotenc.csv')

### 1.2.2 Under-sampling: cleaning oversampled dataset

#### 1.2.2.1 Tomek

In [17]:
x_train = pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc_tmk.csv', index_col=0).reset_index(drop=True)
y_train = pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc_tmk.csv', index_col=0).reset_index(drop=True)

In [18]:
x_train_norm, x_test_norm = scale(x_train, x_test)

In [19]:
dummy_clf.fit(x_train_norm, y_train.values.ravel())

DummyClassifier(strategy='uniform')

In [20]:
prediction = dummy_clf.predict(x_test_norm)
pred_proba = dummy_clf.predict_proba(x_test_norm)

In [21]:
preds = pd.DataFrame(np.hstack([np.reshape(prediction,(-1,1)), pred_proba]))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_3,0.2,0.2,0.2,0.2,0.2
1,Grade_1,0.2,0.2,0.2,0.2,0.2
2,Grade_3,0.2,0.2,0.2,0.2,0.2
3,Grade_2,0.2,0.2,0.2,0.2,0.2
4,Grade_3,0.2,0.2,0.2,0.2,0.2


In [22]:
preds.to_csv('../Project_data/results/dummy_preds_smotenc_tmk.csv')

#### 1.2.2.2 ENN

In [23]:
x_train = pd.read_csv('../Project_data/processed_data/resampling/x_train_smotenc_enn.csv', index_col=0).reset_index(drop=True)
y_train = pd.read_csv('../Project_data/processed_data/resampling/y_train_smotenc_enn.csv', index_col=0).reset_index(drop=True)

In [24]:
x_train_norm, x_test_norm = scale(x_train, x_test)

In [25]:
dummy_clf.fit(x_train_norm, y_train.values.ravel())

DummyClassifier(strategy='uniform')

In [26]:
prediction = dummy_clf.predict(x_test_norm)
pred_proba = dummy_clf.predict_proba(x_test_norm)

In [27]:
preds = pd.DataFrame(np.hstack([np.reshape(prediction,(-1,1)), pred_proba]))
preds.head()

Unnamed: 0,0,1,2,3,4,5
0,Grade_1,0.2,0.2,0.2,0.2,0.2
1,Grade_5,0.2,0.2,0.2,0.2,0.2
2,Grade_5,0.2,0.2,0.2,0.2,0.2
3,Grade_1,0.2,0.2,0.2,0.2,0.2
4,Grade_1,0.2,0.2,0.2,0.2,0.2


In [28]:
preds.to_csv('../Project_data/results/dummy_preds_smotenc_enn.csv')

## 2. 3-classes classification:

In [29]:
x = pd.read_csv('../Project_data/processed_data/x_post_preproc.csv', index_col='building_id')
y = pd.read_csv('../Project_data/processed_data/y_post_preproc_3lab.csv', index_col='building_id')

In [30]:
x_train, x_test = train_test_split(x, test_size=0.2, random_state=42)
y_train, y_test = y.loc[x_train.index], y.loc[x_test.index]

In [31]:
x_train_norm, x_test_norm = scale(x_train, x_test)

In [32]:
dummy_clf.fit(x_train_norm, y_train.values.ravel())

DummyClassifier(strategy='uniform')

In [33]:
prediction = dummy_clf.predict(x_test_norm)
pred_proba = dummy_clf.predict_proba(x_test_norm)

In [34]:
preds = pd.DataFrame(np.hstack([np.reshape(prediction,(-1,1)), pred_proba]))
preds.head()

Unnamed: 0,0,1,2,3
0,G3,0.333333,0.333333,0.333333
1,G2,0.333333,0.333333,0.333333
2,G2,0.333333,0.333333,0.333333
3,G2,0.333333,0.333333,0.333333
4,G3,0.333333,0.333333,0.333333


In [35]:
preds.to_csv('../Project_data/results/dummy_preds_3cls.csv')