In [1]:
import imblearn
from preprocessing import *
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [4]:
with open('X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)
with open('X_test.pkl', 'rb') as f:
    X_test = pickle.load(f)
with open('y_train.pkl', 'rb') as f:
    y_train = pickle.load(f)
with open('y_test.pkl', 'rb') as f:
    y_test = pickle.load(f)

In [2]:
X, y = preprocess_data('Modelar_UH2020.txt', process_cat = True)

El diccionario de categorías es {'RESIDENTIAL': 0, 'INDUSTRIAL': 1, 'OTHER': 2, 'RETAIL': 3, 'AGRICULTURE': 4, 'OFFICE': 5, 'PUBLIC': 6}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [3]:
y = y.astype('int')

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42, stratify=y)

In [5]:
rf = RandomForestClassifier(n_estimators=500, n_jobs = -1, class_weight='balanced')

## Sin balancear

In [6]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [7]:
preds_nobalanced = rf.predict(X_test)

In [8]:
rf.score(X_test, y_test)

0.9111692337498789

In [9]:
f1_score(y_test, preds_nobalanced, average='macro')

0.484640739489507

In [21]:
#f1_score(y_test, preds_nobalanced, average='micro')

## BALANCEADO

In [10]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [11]:
from collections import Counter

In [12]:
counter = Counter(y_train)

In [13]:
counter

Counter({0: 72138, 5: 1463, 3: 1674, 6: 2381, 1: 3592, 2: 1066, 4: 270})

In [14]:
dic_classes = {6:35000, 4:6000, 3: 6000, 2: 6000, 1: 6000, 5: 6000, 0: 2000}
orig_dic = {6: 72138, 4: 1066, 3: 1674, 2: 3592, 1: 2381, 5: 1463, 0: 270}

In [15]:
np.sum([v for v in dic_classes.values()])

67000

In [16]:
np.sum([v for v in orig_dic.values()])

82584

In [17]:
82584 - 67000

15584

In [29]:
over = SMOTE(sampling_strategy={5:15000, 3: 15000, 6: 15000, 1: 15000, 2: 15000, 4: 15000})
under = RandomUnderSampler(sampling_strategy={0: 60000})
steps = [('o', over), ('u', under)]

In [30]:
pipeline = Pipeline(steps)

In [31]:
X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)

In [32]:
rf.fit(X_resampled, y_resampled)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [33]:
rf.score(X_test, y_test)

0.9080209241499564

In [34]:
preds_balanced = rf.predict(X_test)

In [35]:
f1_score(y_test, preds_balanced, average='macro')

0.5370755286450072

## OTHER METHODS

In [37]:
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks

In [36]:
from imblearn.combine import SMOTEENN, SMOTETomek

In [38]:
resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority')) #SMOTEENN(n_jobs=-1)

In [None]:
X_resampled, y_resampled = resample.fit_resample(X_train, y_train)

In [None]:
rf.fit(X_resampled, y_resampled)

In [None]:
rf.score(X_test, y_test)

In [None]:
preds_balanced = rf.predict(X_test)

In [None]:
f1_score(y_test, preds_balanced, average='macro')