### Textual

In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn import model_selection
from sklearn.model_selection import cross_validate

### Read in Text Data

In [2]:
# Load the textual feature table from its comma-separated CSV file
textual_df = pd.read_csv("../../Results/textual.csv")

In [3]:
# Target: the "goodforairplane" label column
metadata_Y = textual_df["goodforairplane"]
# Features: every remaining column of the table
metadata_x = textual_df.drop(columns=["goodforairplane"])

In [4]:
np.random.seed(50)

def LVW(meta_x, meta_Y, K, classifier):
    """Select a feature subset with a Las Vegas Wrapper (random subset search).

    Random, non-empty column subsets of ``meta_x`` are drawn repeatedly. For
    each candidate, ``classifier`` is fitted and scored with ``f1_score`` on
    the very same rows it was fitted on. A candidate replaces the current best
    when its F1 is strictly higher, or when it ties the best F1 with fewer
    columns. The search stops after ``K`` consecutive candidates that fail to
    improve on the best.

    Parameters
    ----------
    meta_x : pandas.DataFrame
        Feature table; its columns are the candidate features.
    meta_Y : array-like
        Labels handed to ``classifier.fit`` and ``f1_score``.
    K : int
        Number of consecutive non-improving draws tolerated before stopping.
    classifier : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place; on return it is fitted on the selected columns.

    Returns
    -------
    pandas.Index
        Names of the selected columns.

    Raises
    ------
    ValueError
        If ``meta_x`` has no columns to choose from.
    """
    num_features = len(meta_x.columns)
    if num_features == 0:
        raise ValueError("LVW needs at least one feature column to select from")

    # Loop-invariant: positional indices of all candidate features.
    features_ind = np.arange(num_features)

    best_f1 = 0
    # One more than the largest possible subset, so the very first candidate
    # always wins the tie-break regardless of how many features exist
    # (a fixed bound of 100 would reject large subsets on wide tables).
    best_size = num_features + 1
    # Integer dtype keeps the empty selection a valid positional indexer.
    best_ind = np.array([], dtype=int)
    misses = 0

    while misses < K:
        # Draw the subset size, then the subset itself (without replacement).
        C1 = np.random.randint(1, num_features + 1)
        S1_ind = np.sort(np.random.choice(features_ind, size=C1, replace=False))

        # Fit and score on the same rows: the F1 used for the search is a
        # training-set score, not a held-out estimate.
        candidate = meta_x.iloc[:, S1_ind]
        classifier.fit(candidate, meta_Y)
        pred = classifier.predict(candidate)
        # Compute the F1 measure (the paper is stated to tune on F1).
        f1 = f1_score(meta_Y, pred)

        if f1 > best_f1 or (f1 == best_f1 and C1 < best_size):
            # Improvement: remember it and reset the patience counter.
            misses = 0
            best_ind = S1_ind
            best_f1 = f1
            best_size = C1
        else:
            misses += 1

    # Leave the classifier fitted on the winning subset for the caller.
    classifier.fit(meta_x.iloc[:, best_ind], meta_Y)

    return meta_x.columns[best_ind]

In [8]:
# Reseed NumPy so the random feature subsets drawn inside LVW are repeatable for this cell.
np.random.seed(50)
import warnings
# Silence all warnings from here on.
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)

# Scorers handed to cross_validate in the random-forest branch below.
scoring = dict(
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1_score=make_scorer(f1_score),
)

classifiers = [KNeighborsClassifier(), SVC(), GaussianNB()]

# NOTE(review): perform_cross_validation is defined in a cell further down in
# this notebook; that cell has to be executed before this one.
# NOTE(review): the report line says "Modality: metadata" although the data is
# loaded from textual.csv - confirm the intended label.
for classi in classifiers:
    if not isinstance(classi, RandomForestClassifier):
        # Pre-select features with LVW on all rows, then cross-validate on the
        # reduced table. NOTE(review): this pre-selection also sees the rows
        # that later serve as test folds.
        ind = LVW(metadata_x, metadata_Y, 10, classi)
        scores = perform_cross_validation(metadata_x[ind], metadata_Y, classi)
        precision, recall, f_1 = scores[:3]
    else:
        # Random forest: no LVW pre-selection, plain 10-fold cross_validate
        # (not reached with the classifier list above).
        scores = cross_validate(classi, X=metadata_x, y=metadata_Y, cv=10, scoring=scoring)
        f_1, precision, recall = (
            np.mean(scores[key])
            for key in ("test_f1_score", "test_precision", "test_recall")
        )
    print(
        "Classifier: %s, Modality: metadata, Precision: %.3f, Recall: %.3f, F1: %.3f"
        % (type(classi).__name__, precision, recall, f_1)
    )

Classifier: KNeighborsClassifier, Modality: metadata, Precision: 0.604, Recall: 0.615, F1: 0.610
Classifier: SVC, Modality: metadata, Precision: 0.547, Recall: 1.000, F1: 0.707
Classifier: GaussianNB, Modality: metadata, Precision: 0.583, Recall: 0.404, F1: 0.477


In [14]:
# Reseed NumPy so the random feature subsets drawn inside LVW are repeatable for this cell.
np.random.seed(50)
import warnings
# Silence all warnings from here on.
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)

# Scorer dictionary (kept for parity with the previous cell; not used by the loop below).
scoring = dict(
    precision=make_scorer(precision_score),
    recall=make_scorer(recall_score),
    f1_score=make_scorer(f1_score),
)

classifiers = [KNeighborsClassifier(), SVC(), GaussianNB(), RandomForestClassifier()]

# NOTE(review): perform_cross_validation is defined in a cell further down in
# this notebook; that cell has to be executed before this one.
# NOTE(review): the report line says "Modality: metadata" although the data is
# loaded from textual.csv - confirm the intended label.
for classi in classifiers:
    if not isinstance(classi, RandomForestClassifier):
        # Pre-select features with LVW on all rows before cross-validating.
        ind = LVW(metadata_x, metadata_Y, 10, classi)
        scores = perform_cross_validation(metadata_x[ind], metadata_Y, classi)
    else:
        # Random forest skips the pre-selection and is evaluated on every column.
        # NOTE(review): perform_cross_validation still runs LVW inside each
        # fold, so the forest is not actually exempt from LVW.
        scores = perform_cross_validation(metadata_x, metadata_Y, classi)
    # The helper returns (precision, recall, f1) in that order.
    precision, recall, f_1 = scores[:3]
    print(
        "Classifier: %s, Modality: metadata, Precision: %.3f, Recall: %.3f, F1: %.3f"
        % (type(classi).__name__, precision, recall, f_1)
    )

Classifier: KNeighborsClassifier, Modality: metadata, Precision: 0.604, Recall: 0.615, F1: 0.610
Classifier: SVC, Modality: metadata, Precision: 0.547, Recall: 1.000, F1: 0.707
Classifier: GaussianNB, Modality: metadata, Precision: 0.583, Recall: 0.404, F1: 0.477
Classifier: RandomForestClassifier, Modality: metadata, Precision: 0.442, Recall: 0.442, F1: 0.442


In [5]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def perform_cross_validation(X, y, classi):
    """Evaluate ``classi`` with 10-fold cross-validation and per-fold LVW.

    The rows are split with a shuffled ``KFold`` (``random_state=42``). In
    every fold, features are selected with ``LVW`` on the training part only,
    the classifier is fitted on those columns, and the test part is predicted.
    Predictions from all folds are pooled and the metrics are computed once on
    the pooled predictions (they are not averaged per fold).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : pandas.Series
        Labels, aligned row-by-row with ``X``.
    classi : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; refitted in place.

    Returns
    -------
    tuple
        ``(precision, recall, f1)`` over the pooled out-of-fold predictions.
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    predictions = []
    ground_truth = []
    for train_idx, test_idx in kf.split(X, y):
        # KFold yields positional row numbers, so select with iloc; label-based
        # loc only works when the index happens to be 0..n-1.
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Perform the Las Vegas Wrapper on the training part of the fold only.
        ind = LVW(X_train, y_train, 10, classi)

        # Fit on the selected columns and predict the held-out rows.
        classi.fit(X_train[ind], y_train)
        predictions.extend(classi.predict(X_test[ind]))
        ground_truth.extend(y_test)

    # Compute the metrics over all out-of-fold predictions at once.
    precision = precision_score(ground_truth, predictions)
    recall = recall_score(ground_truth, predictions)
    f1 = f1_score(ground_truth, predictions)
    return precision, recall, f1

#perform_cross_validation(metadata_x, metadata_Y )
#kf = KFold(n_splits=10, shuffle=True, random_state=42)
#for train_idx, test_idx in kf.split(metadata_x, metadata_Y):
        #classi.fit(X[train_idx], y[train_idx])
        #print(metadata_x[train_idx])
        # Predict
        #fold_predictions = classi.predict(X[test_idx])
#        print("Hello")
#metadata_x.iloc[3, [1,2,3]]
#metadata_x.loc[3]