In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
def read_csv(path_to_csv):
    """Load a dataset CSV and separate the features from the label.

    Parameters
    ----------
    path_to_csv : str or file-like
        Passed straight to ``pd.read_csv``. The file must contain an
        ``ID_code`` column and a ``target`` column.

    Returns
    -------
    (data, target) : tuple
        ``data`` is the DataFrame without the ``ID_code`` and ``target``
        columns; ``target`` is the ``target`` column as a Series.

    Raises
    ------
    KeyError
        If ``ID_code`` or ``target`` is not a column of the file.
    """
    features = pd.read_csv(path_to_csv).drop(columns=['ID_code'])
    # pop() hands back the label column and removes it from the features.
    labels = features.pop('target')
    return features, labels

---
# Original dataset

In [3]:
# Load the original (un-resampled) dataset.
data, target = read_csv("train.csv")

# Seeded hold-out split shared by every pipeline in this section.
# NOTE(review): train_size (0.66) + test_size (0.33) add up to 0.99, so
# roughly 1% of the rows land in neither subset -- confirm this is intended.
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(
    data,
    target,
    train_size=0.66,
    test_size=0.33,
    random_state=42,
)

## Pipeline 1

In [4]:
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount

# Average CV score on the training set was:0.920530303030303
# Four-stage pipeline (presumably a TPOT export, given the tpot.builtins
# steps): GaussianNB wrapped in a StackingEstimator, a variance filter,
# ZeroCount, and a final ExtraTreesClassifier.
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=GaussianNB()),
    VarianceThreshold(threshold=0.001),
    ZeroCount(),
    ExtraTreesClassifier(
        bootstrap=False,
        criterion="entropy",
        max_features=0.15000000000000002,
        min_samples_leaf=6,
        min_samples_split=2,
        n_estimators=100,
        # Extra-trees are randomized; without a seed every run of this cell
        # yields a different model and different metrics. Use the same seed
        # as the train/test split so the notebook is reproducible.
        random_state=42,
    ),
)

# Fit on the training split and score on the held-out split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

print("Precision", precision_score(testing_target, results))
print("Recall", recall_score(testing_target, results))
print("F1 score", f1_score(testing_target, results))



Precision 0.7212261422787739
Recall 0.3692626591649393
F1 score 0.48844496670583626


## Pipeline 2

In [5]:
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline

# Average CV score on the training set was:0.9214318207197557
# Variance filter followed by Gaussian naive Bayes.
exported_pipeline = make_pipeline(VarianceThreshold(threshold=0.01), GaussianNB())

# fit() returns the fitted pipeline, so training and prediction can be chained.
results = exported_pipeline.fit(training_features, training_target).predict(testing_features)

# Report the three metrics on the held-out split, one per line.
for metric_name, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 score", f1_score),
):
    print(metric_name, metric_fn(testing_target, results))

Precision 0.713388524122181
Recall 0.37000296120817294
F1 score 0.48727698157355953


---
# Balanced with Random Under Sampler

In [6]:
# Load the randomly under-sampled variant of the dataset.
data, target = read_csv("random_under_sampler.csv")

# Same seeded 66/33 split as for the original dataset.
# NOTE(review): the file name suggests the resampling was done before this
# split; if so, the test split is resampled too and its metrics are not
# directly comparable with those on the original dataset -- confirm.
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(
    data,
    target,
    train_size=0.66,
    test_size=0.33,
    random_state=42,
)

## Pipeline 1

In [7]:
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline

# Average CV score on the training set was:0.8072297930285739
# Variance filter followed by Gaussian naive Bayes.
exported_pipeline = make_pipeline(VarianceThreshold(threshold=0.01), GaussianNB())

# fit() returns the fitted pipeline, so training and prediction can be chained.
results = exported_pipeline.fit(training_features, training_target).predict(testing_features)

# Report the three metrics on the held-out split, one per line.
for metric_name, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 score", f1_score),
):
    print(metric_name, metric_fn(testing_target, results))

Precision 0.8093065693430657
Recall 0.8031995170540296
F1 score 0.8062414785638539


## Pipeline 2

In [8]:
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline

# Average CV score on the training set was:0.8085111309799645
# Same two-step pipeline as the previous cell, with a lower variance threshold.
# NOTE(review): the recorded metrics are identical to the threshold=0.01 run
# on this data, which suggests both thresholds keep the same features -- confirm.
exported_pipeline = make_pipeline(VarianceThreshold(threshold=0.0001), GaussianNB())

# fit() returns the fitted pipeline, so training and prediction can be chained.
results = exported_pipeline.fit(training_features, training_target).predict(testing_features)

# Report the three metrics on the held-out split, one per line.
for metric_name, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 score", f1_score),
):
    print(metric_name, metric_fn(testing_target, results))

Precision 0.8093065693430657
Recall 0.8031995170540296
F1 score 0.8062414785638539


---
# Balanced with Random Over Sampler

In [9]:
# Load the randomly over-sampled variant of the dataset.
data, target = read_csv("random_over_sampler.csv")

# Same seeded 66/33 split as for the original dataset.
# NOTE(review): the file name suggests the over-sampling was done before this
# split; if so, duplicated rows can appear in both the training and the test
# split and inflate the test metrics -- confirm.
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(
    data,
    target,
    train_size=0.66,
    test_size=0.33,
    random_state=42,
)

In [10]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

# Average CV score on the training set was:0.8292963321682738
exported_pipeline = GradientBoostingClassifier(
    learning_rate=0.5,
    max_depth=4,
    max_features=0.05,
    min_samples_leaf=8,
    min_samples_split=12,
    n_estimators=100,
    subsample=0.9500000000000001,
    # subsample < 1 and max_features < 1 both draw random subsets while
    # fitting; seed them (same seed as the train/test split) so re-running
    # this cell reproduces the same model and metrics.
    random_state=42,
)

# Fit on the training split and score on the held-out split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

print("Precision", precision_score(testing_target, results))
print("Recall", recall_score(testing_target, results))
print("F1 score", f1_score(testing_target, results))

Precision 0.8311350548021549
Recall 0.8290907865698546
F1 score 0.8301116621124717


---
# Feature engineering dataset

(In a separate notebook)