# Predictive Modeling

## Train Test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set; the fixed seed keeps the split
# identical from run to run.
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split

## SUPERVISED - Classification

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (Newton-CG solver) on the training split.
logreg = LogisticRegression(solver='newton-cg').fit(X_train, y_train)

# Score the test split and keep column index 1 of predict_proba
# (the second entry of logreg.classes_).
test_probabilities = logreg.predict_proba(X_test)
y_pred = test_probabilities[:, 1]

AUC de Regresión Logística:  0.61


### kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 10 closest training samples.
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

# Probability column at index 1 (second entry of knn.classes_) for each test sample.
neighbour_probabilities = knn.predict_proba(X_test)
y_pred = neighbour_probabilities[:, 1]

AUC de KNN:  0.56


### NAIVE BAYES

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with the library's default settings.
naive_bayes = BernoulliNB().fit(X_train, y_train)

# Probability column at index 1 (second entry of naive_bayes.classes_).
bayes_probabilities = naive_bayes.predict_proba(X_test)
y_pred = bayes_probabilities[:, 1]

AUC de Naive Bayes:  0.57


### DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree grown with the library's default hyperparameters.
tree = DecisionTreeClassifier().fit(X_train, y_train)

# Probability column at index 1 (second entry of tree.classes_).
tree_probabilities = tree.predict_proba(X_test)
y_pred = tree_probabilities[:, 1]

### RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest ensemble with the library's default hyperparameters.
random_forest = RandomForestClassifier().fit(X_train, y_train)

# Probability column at index 1 (second entry of random_forest.classes_).
forest_probabilities = random_forest.predict_proba(X_test)
y_pred = forest_probabilities[:, 1]

### XGBOOST

In [None]:
import itertools

# Exhaustive grid search over XGBoost hyperparameters; `scores` collects the
# test-set accuracy of each combination, in iteration order.
scores = []
# Integer tree counts 1..5: n_estimators must be an int, and np.linspace
# would produce floats here.
n_estimators = list(range(1, 6))
max_depths = [2,3,4]
# NOTE(review): learning rates above 1 are unusual for boosting — confirm
# that these values are intended.
learning_rates = [1,2,3]
# Try every (n_estimators, max_depth, learning_rate) combination.
for n, md, lr in itertools.product(n_estimators, max_depths, learning_rates):
    xgb_classifier = XGBClassifier(n_estimators=n, max_depth=md, learning_rate=lr, objective='binary:logistic')
    score = xgb_classifier.fit(X_train, y_train).score(X_test, y_test)
    #y_pred = xgb_classifier.predict_proba(X_test)[:,1]
    scores.append(score)

### LIGHTGBM-STYLE (scikit-learn HistGradientBoosting)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier

import itertools

# Grid search over learning rate and warm_start; `scores` collects the
# test-set accuracy of each combination, in iteration order.
scores = []
learning_rates =  np.linspace(0.1,1,9)
# NOTE(review): a new estimator is built on every iteration, so warm_start
# (which reuses a previous fit of the same estimator) likely has no effect here.
warm_starts = [True, False]

# Try every (learning_rate, warm_start) combination.
for lr, ws in itertools.product(learning_rates, warm_starts):
    hist_gradient_boosting = HistGradientBoostingClassifier(learning_rate=lr, warm_start=ws)
    score = hist_gradient_boosting.fit(X_train, y_train).score(X_test, y_test)
    #y_pred = hist_gradient_boosting.predict_proba(X_test)[:,1]
    scores.append(score)

### SVM

In [None]:
from sklearn import svm

import itertools

# Grid search over SVC kernel, C and gamma; `scores` collects the test-set
# accuracy of each combination, in iteration order.
scores = []
kernels =  ['linear', 'poly', 'rbf', 'sigmoid']
Cs = np.linspace(0.1,0.9,9*10)
gammas = ['scale', 'auto']

# 4 kernels x 90 C values x 2 gamma settings = 720 fits.
for k, c, g in itertools.product(kernels, Cs, gammas):
    score = svm.SVC(C=c, kernel=k, gamma=g).fit(X_train, y_train).score(X_test, y_test)
    scores.append(score)

## UNSUPERVISED - Clustering

### k-MEANS

Clusters data by trying to separate samples into _N_ groups of equal variance, minimizing the inertia (within-cluster sum-of-squares).

In [None]:
from sklearn.cluster import KMeans

# Partition the training samples into five clusters.
n_groups = 5
kmeans = KMeans(n_clusters=n_groups).fit(X_train)

# Preview the cluster index assigned to the first ten training samples.
kmeans.labels_[:10]



array([4, 0, 3, 2, 2, 1, 2, 1, 2, 0])

### DBSCAN

Views clusters as areas of high density separated by areas of low density.

In [None]:
from sklearn.cluster import DBSCAN


# Density-based clustering: neighbourhood radius eps=3, and at least
# 2 samples in a neighbourhood to form a core point.
dbscan = DBSCAN(eps=3, min_samples=2).fit(X_train)

## SUPERVISED - Temporal

### ARIMA

In [None]:
from pmdarima import auto_arima


# auto_arima searches the non-seasonal ARIMA orders and returns the selected
# model already fitted on train_data, so a second .fit() call is redundant.
model = auto_arima(train_data, seasonal=False, suppress_warnings=True)
# Alias kept so any later code that refers to `model_fit` keeps working.
model_fit = model
# Forecast one value per observation in the test window.
predictions = model_fit.predict(n_periods=len(test_data))


# plt.figure(figsize=(12, 6))
# plt.plot(df_st[-12:].index.strftime('%YQ%q'), df_st[-12:].infl, label='Realidad')
# plt.plot(df_st[-4:].index.strftime('%YQ%q'), predictions, label='Predicción')
# plt.xticks(rotation=45, fontsize=8)

### EXPONENTIAL SMOOTHING

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Exponential smoothing with an additive trend and no seasonal component.
model = ExponentialSmoothing(train_data, seasonal=None, trend='add')
model_fit = model.fit()
# Forecast the positions right after the training sample, one per test observation.
predictions = model_fit.predict(start=len(train_data), end=len(train_data) + len(test_data) - 1)

plt.figure(figsize=(12, 6))
# Observed series: last 12 rows of df_st.
plt.plot(df_st[-12:].index.strftime('%YQ%q'), df_st[-12:].infl, label='Realidad')
# NOTE(review): assumes the forecast covers the last 4 rows of df_st — confirm len(test_data) == 4.
plt.plot(df_st[-4:].index.strftime('%YQ%q'), predictions, label='Predicción')
plt.xticks(rotation=45, fontsize=8)
# Without a legend the label= arguments above are never displayed.
plt.legend();

## PERFORMANCE METRICS

In [None]:
from sklearn.metrics import roc_auc_score

# Binary target: score with the probability column at index 1 only.
roc_auc_score(y, clf.predict_proba(X)[:, 1]) #binary
# Multiclass target: pass the full probability matrix and use one-vs-rest.
roc_auc_score(y, clf.predict_proba(X), multi_class='ovr') #multiclass (one-vs-rest), not regression

### CROSS VALIDATION

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate clf with cv=10 using the estimator's default scorer.
scores = cross_val_score(clf, X, y, cv=10)

# Report the mean score and its spread across the folds, to 3 decimals.
mean_score = round(scores.mean(), 3)
score_spread = round(scores.std(), 3)
print(mean_score, '+-', score_spread)

### K-FOLD CV

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, classification_report

# K-fold evaluation (5 folds) of an RBF-kernel SVM, collecting per-class
# precision / recall / f1 plus overall accuracy for every fold.
kf = KFold(n_splits=5)

m = ['precision', 'recall', 'f1-score']
accuracy = []
# One list per metric for each of the two class names in output_values.
metrics = {
    str(class_name): {me: [] for me in m}
    for class_name in (output_values[0], output_values[1])
}

for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    clf = svm.SVC(C=0.75, kernel='rbf', gamma='scale').fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)
    for me in m:
        # The report entries are read under the keys '0' and '1'.
        for position, report_key in enumerate(('0', '1')):
            metrics[str(output_values[position])][me].append(report[report_key][me])
    accuracy.append(report['accuracy'])

# Summary: mean +- standard deviation across folds.
print()
for me in m:
    for position in (0, 1):
        class_name = str(output_values[position])
        fold_values = metrics[class_name][me]
        print(f'{me.upper()} {class_name} - {round(np.mean(fold_values),3)} +- {round(np.std(fold_values),3)}')
    print()
print(f"Accuracy: {round(np.mean(accuracy),3)} +- {round(np.std(accuracy),3)}")

### REPEATED K-FOLD CV

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import confusion_matrix, classification_report

# Repeated K-fold evaluation (2 folds x 2 repeats, seeded) of an RBF-kernel
# SVM, collecting per-class precision / recall / f1 plus overall accuracy.
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=23)


def _mean_std(values):
    # Format a list of per-split values as "<mean> +- <std>", rounded to 3 decimals.
    return f'{round(np.mean(values),3)} +- {round(np.std(values),3)}'


accuracy = []
m = ['precision', 'recall', 'f1-score']
metrics = {
    str(output_values[0]): {'precision': [], 'recall': [], 'f1-score': []},
    str(output_values[1]): {'precision': [], 'recall': [], 'f1-score': []},
}

for i, (train_index, test_index) in enumerate(rkf.split(X)):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    clf = svm.SVC(C=0.75, kernel='rbf', gamma='scale')
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)
    first_class = metrics[str(output_values[0])]
    second_class = metrics[str(output_values[1])]
    for me in m:
        # The report entries are read under the keys '0' and '1'.
        first_class[me].append(report['0'][me])
        second_class[me].append(report['1'][me])
    accuracy.append(report['accuracy'])

# Summary: mean +- standard deviation across all splits.
print()
for me in m:
    for class_name in (str(output_values[0]), str(output_values[1])):
        print(f'{me.upper()} {class_name} - {_mean_std(metrics[class_name][me])}')
    print()
print(f"Accuracy: {_mean_std(accuracy)}")

### K-FOLD CV STRATIFIED

A variation of k-fold which returns stratified folds: each set contains approximately the same percentage of samples of each target class as the complete set.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report

# Stratified K-fold evaluation (4 folds) of an RBF-kernel SVM, collecting
# per-class precision / recall / f1 plus overall accuracy for every fold.
kf = StratifiedKFold(n_splits=4)

accuracy = []
m = ['precision', 'recall', 'f1-score']
# Display names of the two classes, paired below with the report keys '0' and '1'.
class_names = [str(output_values[0]), str(output_values[1])]
metrics = {
    class_names[0]: {'precision': [], 'recall': [], 'f1-score': []},
    class_names[1]: {'precision': [], 'recall': [], 'f1-score': []},
}

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf = svm.SVC(C=0.75, kernel='rbf', gamma='scale')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)
    for me in m:
        for class_name, report_key in zip(class_names, ('0', '1')):
            metrics[class_name][me].append(report[report_key][me])
    accuracy.append(report['accuracy'])

# Summary: mean +- standard deviation across folds.
print()
for me in m:
    for class_name in class_names:
        mean_value = round(np.mean(metrics[class_name][me]),3)
        std_value = round(np.std(metrics[class_name][me]),3)
        print(f'{me.upper()} {class_name} - {mean_value} +- {std_value}')
    print()
accuracy_mean = round(np.mean(accuracy),3)
accuracy_std = round(np.std(accuracy),3)
print(f"Accuracy: {accuracy_mean} +- {accuracy_std}")

### STRATIFIED SHUFFLE SPLIT

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report

# Stratified shuffle-split evaluation (4 splits, 20% of the samples held out
# in each) of an RBF-kernel SVM, collecting per-class precision / recall / f1
# plus overall accuracy for every split.
# NOTE(review): no random_state is set, so the splits differ between runs.
sh = StratifiedShuffleSplit(n_splits=4, test_size=0.2)

accuracy = []
m = ['precision', 'recall', 'f1-score']
# Key the accumulator by the class names taken from output_values, which is
# how the loop and the summary look entries up. Hard-coded 'noKC'/'KC' keys
# raise KeyError whenever output_values holds different names.
class_names = [str(output_values[0]), str(output_values[1])]
metrics = {class_name: {me: [] for me in m} for class_name in class_names}

for train_index, test_index in sh.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf = svm.SVC(C=0.75, kernel='rbf', gamma='scale')
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)
    # The report entries are read under the keys '0' and '1'
    # (assumes the labels in y are 0 and 1 — TODO confirm).
    for class_name, report_key in zip(class_names, ('0', '1')):
        for me in m:
            metrics[class_name][me].append(report[report_key][me])
    accuracy.append(report['accuracy'])

# Summary: mean +- standard deviation across splits.
print()
for me in m:
    for class_name in class_names:
        split_values = metrics[class_name][me]
        print(f'{me.upper()} {class_name} - {round(np.mean(split_values),3)} +- {round(np.std(split_values),3)}')
    print()
print(f"Accuracy: {round(np.mean(accuracy),3)} +- {round(np.std(accuracy),3)}")

### Validation Curve

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.model_selection import ValidationCurveDisplay
from sklearn.utils import shuffle

# Train/validation scores of a linear SVM for 3 values of C spanning 1e-7..1e3.
# `svm.SVC` is used because only the `svm` module is imported in the cells
# shown; a bare `SVC` would need its own import.
train_scores, valid_scores = validation_curve(
    svm.SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 3),
)

# Shuffle the samples (fixed seed) before plotting; `shuffle` is already
# imported at the top of this cell.
X, y = shuffle(X, y, random_state=0)

# Plot the validation curve on a finer grid of 10 values of C.
ValidationCurveDisplay.from_estimator(svm.SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 10))

### Learning Curve

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import LearningCurveDisplay

# Train/validation scores of a linear SVM at three absolute training-set
# sizes, with 5-fold cross-validation.
# `svm.SVC` is used because only the `svm` module is imported in the cells
# shown; a bare `SVC` would need its own import.
# NOTE(review): train_sizes=[50, 80, 110] are absolute sample counts — confirm
# the dataset is large enough for them.
train_sizes, train_scores, valid_scores = learning_curve(
    svm.SVC(kernel='linear'), X, y, train_sizes=[50, 80, 110], cv=5)

print(train_sizes)
print(train_scores)
print(valid_scores)

# Shuffle the samples (fixed seed) before plotting the learning curve.
from sklearn.utils import shuffle
X, y = shuffle(X, y, random_state=0)

LearningCurveDisplay.from_estimator(
   svm.SVC(kernel="linear"), X, y, train_sizes=[50, 80, 110], cv=5)