<a href="https://colab.research.google.com/github/biozid-0208/daily-random-programming-commit/blob/main/dfnc1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.linear_model
import sklearn.ensemble
import sklearn.metrics
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [None]:
def split_x_and_y(X, y, test_size = 0.2, random_state = 42):
    # % of the sample size
    train_size=int(len(X)*test_size)

    #Make our results reproducible
    np.random.seed(random_state)

    #Select randomly the rows for the training dataset
    rows_array=np.random.choice(len(X),size=train_size,replace=False)

    #Create x,y train datasets
    X_train=X.iloc[rows_array]
    y_train=y.iloc[rows_array]

    #Select the rest arrays for the test dataset
    total_rows=np.arange(len(X))
    test_arrays=np.delete(total_rows,rows_array)

    #Create x,y test datasets
    X_test=X.iloc[test_arrays]
    y_test=y.iloc[test_arrays]

    return(X_train,y_train,X_test,y_test)

In [None]:
def specify_models():

    knear={'name':'K Nearest Neighbors Classifier',
           'class':sklearn.neighbors.KNeighborsClassifier(),
            'parameters':{'n_neighbors':range(1,12)}
          }

    svc_linear={'name':'Support Vector Classifier with Linear Kernel',
               'class':sklearn.svm.LinearSVC(),
                'parameters':{'C':[0.001,0.01,0.1,1,10,100]}
          }

    sv_radial={'name':'Support Vector Classifier with Radial Kernel',
               'class':sklearn.svm.SVC(kernel='rbf'),
                'parameters':{'C':[0.001,0.01,0.1,1,10,100],'gamma':[0.001,0.01,0.1,1,10,100]}
          }

    loglas={'name':"Logistic Regression with LASSO",
             'class':sklearn.linear_model.LogisticRegression(penalty='l2'),
             'parameters':{'C':[0.001,0.01,0.1,1,10,100]}
            }

    sgdc={'name':"Stochastic Gradient Descent Classifier",
            'class':sklearn.linear_model.SGDClassifier(),
            'parameters':{'max_iter':[100,1000],'alpha':[0.0001,0.001,0.01,0.1]}
            }

    decis_tree={'name':"Decision Tree Classifier",
            'class':sklearn.tree.DecisionTreeClassifier(),
            'parameters':{'max_depth':range(3,15)}
            }

    ranfor={'name':"Random Forest Classifier",
            'class':sklearn.ensemble.RandomForestClassifier(),
            'parameters':{'n_estimators':[10,20,50,100,200]}
            }

    extrerantree={'name':"Extremely Randomized Trees Classifier",
                    'class':sklearn.ensemble.ExtraTreesClassifier(),
                    'parameters':{'n_estimators':[10,20,50,100,200]}
                 }


    lis=list([knear,svc_linear,sv_radial,loglas,sgdc,decis_tree,ranfor,extrerantree])

    return(lis)

In [None]:
def train_model(model_dict, X, y, metric = 'f1', k = 5):
    name=model_dict['name']
    param_grid = model_dict['parameters']
    clf=GridSearchCV(estimator=model_dict['class'], param_grid=param_grid, cv= k, scoring=metric)
    best_score= clf.fit(X,y).best_score_
    best_model= clf
    return(name, best_model, best_score)

In [None]:
def train_all_models(models, X, y, metric ='roc_auc', k = 5):
    #Initialize the list
    final_list=list()

    for i in range(0,len(models)):
        tr_model=train_model(models[i] ,X ,y , metric = metric, k=k)
        final_list.append(tr_model)

    #Sort the final list
    final_list=sorted(final_list, key=lambda score: score[2], reverse=True)
    return(final_list)

In [None]:
def auto_train_binary_classifier(X,y, models, test_size = 0.2, random_state = 42,
                                 metric = 'roc_auc', k = 5):



    #Use the second function to split the dataframe to training and test
    split_df=split_x_and_y(X, y,
                           test_size = test_size,
                           random_state = random_state
                          )

    #Train all the models
    final_model=train_all_models(models, split_df[0],split_df[1], metric = metric, k = k)




    #Take the best model, it's name and the score
    best_model_name=final_model[1][0]
    best_model=final_model[1][1]
    train_set_score=final_model[1][2]

    ##################################
    # Test set performance
    ##################################

    predicted=final_model[1][1].predict(split_df[2])
    test_set_score=sklearn.metrics.accuracy_score(split_df[3], predicted)

    return(best_model_name, best_model, train_set_score, test_set_score)

In [None]:
import scipy.io
import numpy as np
import pandas as pd

mat = scipy.io.loadmat('/content/_cluster_3_data.mat')
data = pd.DataFrame(mat['result_data'])
info = pd.DataFrame(mat['result_demographic'][:,-1])
models = specify_models()
best_model_name, best_model, train_set_score, test_set_score = auto_train_binary_classifier(data,info, models)


In [None]:
print(best_model_name)
print(best_model)
print(train_set_score)
print(test_set_score)

Extremely Randomized Trees Classifier
GridSearchCV(cv=5, estimator=ExtraTreesClassifier(),
             param_grid={'n_estimators': [10, 20, 50, 100, 200]},
             scoring='roc_auc')
0.7383333333333334
0.6102941176470589


In [None]:
mat = scipy.io.loadmat('/content/_cluster_2_data.mat')
data = pd.DataFrame(mat['result_data'])
info = pd.DataFrame(mat['result_demographic'][:,-1])
models = specify_models()
best_model_name, best_model, train_set_score, test_set_score = auto_train_binary_classifier(data,info, models)


In [None]:
print(best_model_name)
print(best_model)
print(train_set_score)
print(test_set_score)

Extremely Randomized Trees Classifier
GridSearchCV(cv=5, estimator=ExtraTreesClassifier(),
             param_grid={'n_estimators': [10, 20, 50, 100, 200]},
             scoring='roc_auc')
0.7
0.4880952380952381


In [None]:
mat = scipy.io.loadmat('/content/_cluster_1_data.mat')
data = pd.DataFrame(mat['result_data'])
info = pd.DataFrame(mat['result_demographic'][:,-1])
models = specify_models()
best_model_name, best_model, train_set_score, test_set_score = auto_train_binary_classifier(data,info, models)


In [None]:
print(best_model_name)
print(best_model)
print(train_set_score)
print(test_set_score)

Support Vector Classifier with Linear Kernel
GridSearchCV(cv=5, estimator=LinearSVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
             scoring='roc_auc')
nan
0.6875


In [None]:
mat = scipy.io.loadmat('/content/cluster_1_data.mat')
data = pd.DataFrame(mat['result_data'])
info = pd.DataFrame(mat['result_demographic'][:,-1])
models = specify_models()
best_model_name, best_model, train_set_score, test_set_score = auto_train_binary_classifier(data,info, models)


In [None]:
print(best_model_name)
print(best_model)
print(train_set_score)
print(test_set_score)

Support Vector Classifier with Linear Kernel
GridSearchCV(cv=5, estimator=LinearSVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
             scoring='roc_auc')
nan
0.7333333333333333


In [None]:
mat = scipy.io.loadmat('/content/cluster_2_data.mat')
data = pd.DataFrame(mat['result_data'])
info = pd.DataFrame(mat['result_demographic'][:,-1])
models = specify_models()
best_model_name, best_model, train_set_score, test_set_score = auto_train_binary_classifier(data,info, models)


In [None]:
print(best_model_name)
print(best_model)
print(train_set_score)
print(test_set_score)

Logistic Regression with LASSO
GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
             scoring='roc_auc')
0.85
0.5135135135135135


In [None]:
mat = scipy.io.loadmat('/content/cluster_3_data.mat')
data = pd.DataFrame(mat['result_data'])
info = pd.DataFrame(mat['result_demographic'][:,-1])
models = specify_models()
best_model_name, best_model, train_set_score, test_set_score = auto_train_binary_classifier(data,info, models)


In [None]:
print(best_model_name)
print(best_model)
print(train_set_score)
print(test_set_score)

Extremely Randomized Trees Classifier
GridSearchCV(cv=5, estimator=ExtraTreesClassifier(),
             param_grid={'n_estimators': [10, 20, 50, 100, 200]},
             scoring='roc_auc')
0.8666666666666666
0.4935064935064935


In [None]:
mat = scipy.io.loadmat('/content/cluster_4_data.mat')
data = pd.DataFrame(mat['result_data'])
info = pd.DataFrame(mat['result_demographic'][:,-1])
models = specify_models()
best_model_name, best_model, train_set_score, test_set_score = auto_train_binary_classifier(data,info, models)


In [None]:
print(best_model_name)
print(best_model)
print(train_set_score)
print(test_set_score)

Support Vector Classifier with Linear Kernel
GridSearchCV(cv=5, estimator=LinearSVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
             scoring='roc_auc')
nan
0.7333333333333333


In [None]:
mat = scipy.io.loadmat('/content/cluster_5_data.mat')
data = pd.DataFrame(mat['result_data'])
info = pd.DataFrame(mat['result_demographic'][:,-1])
models = specify_models()
best_model_name, best_model, train_set_score, test_set_score = auto_train_binary_classifier(data,info, models)


In [None]:
print(best_model_name)
print(best_model)
print(train_set_score)
print(test_set_score)

Support Vector Classifier with Linear Kernel
GridSearchCV(cv=5, estimator=LinearSVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
             scoring='roc_auc')
nan
0.7333333333333333
