In [10]:
import ipynb
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from statistics import mean
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import train_test_split
from numpy import set_printoptions
from statistics import mean
from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample

from sklearn.semi_supervised import SelfTrainingClassifier, LabelPropagation

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
# ONLINE SHOPPING INTENTIONS

filename1 = 'online_shoppers_intentions'
df1r0 = pd.read_csv("../data/train/noresampling/"+filename1+"_0.csv", index_col=0)
df1t = pd.read_csv("../data/test/"+filename1+".csv", index_col=0)
target1 = 'Revenue'

numerical_features1 = ["Administrative", "Administrative_Duration", "Informational", "Informational_Duration", 
                      "ProductRelated", "ProductRelated_Duration", "BounceRates", "ExitRates", "PageValues", "SpecialDay"]
categorical_features1 = ["OperatingSystems", "Browser", "Region", "TrafficType", "VisitorType", "Weekend", "Month"]
df1r0[categorical_features1] = df1r0[categorical_features1].astype('category')  
df1t[categorical_features1] = df1t[categorical_features1].astype('category')
df1r0[target1] = df1r0[target1].astype('category') 
df1t[target1] = df1t[target1].astype('category')


# MARKETING CAMPAIGN

filename2 = 'marketing_campaign'
df2r0 = pd.read_csv("../data/train/noresampling/"+filename2+"_0.csv", index_col=0)
df2t = pd.read_csv("../data/test/"+filename2+".csv", index_col=0)
target2 = 'Teenhome'

numerical_features2 = ['Income','MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts','MntSweetProducts', 
                        'MntGoldProds','Year_Birth','Recency','NumDealsPurchases','NumWebPurchases',
                        'NumCatalogPurchases','NumStorePurchases','NumWebVisitsMonth','Dt_Customer']
categorical_features2 = ['Education','Marital_Status','Kidhome','AcceptedCmp3', 
                        'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2','Complain','Response']
df2r0[categorical_features2] = df2r0[categorical_features2].astype('category') 
df2t[categorical_features2] = df2t[categorical_features2].astype('category')
df2r0[target2] = df2r0[target2].astype('category') 
df2t[target2] = df2t[target2].astype('category')

# HEART

filename3 = 'heart'
df3r0 = pd.read_csv("../data/train/noresampling/"+filename3+"_0.csv", index_col=0)
df3t = pd.read_csv("../data/test/"+filename3+".csv", index_col=0)
target3 = 'target'

numerical_features3 = ['trestbps','chol','thalach','oldpeak', 'age']
categorical_features3 = ['sex', 'cp','fbs','restecg','exang','slope','ca','thal']
df3r0[categorical_features3] = df3r0[categorical_features3].astype('category') 
df3t[categorical_features3] = df3t[categorical_features3].astype('category') 
df3r0[target3] = df3r0[target3].astype('category') 
df3t[target3] = df3t[target3].astype('category')

# Sanity Check
#df1r0

In [13]:
def ParameterTuning (dataset_name, dfr, dft, target, numerical_features, categorical_features, model, grid_params):
    
    print("_______________________________________________________________________________ Dataset:"+dataset_name)

    Xr_train = dfr[numerical_features+categorical_features]
    yr_train = dfr[target]
    
    X_test = dft[numerical_features+categorical_features]
    y_test = dft[target]

    mod = GridSearchCV(model, grid_params, verbose=1, cv=3, n_jobs=-1)
    results_r = mod.fit(Xr_train, yr_train)
    print(results_r.best_params_)

In [14]:
# SelfTraining

rf_model = RandomForestClassifier()
gbe_model = GradientBoostingClassifier()
dt_model = DecisionTreeClassifier()

param_grid = {
    'threshold': [0.25, 0.50, 0.75, 0.80, 0.90],
    'criterion': ['threshold', 'k_best'],
    'k_best': [10,100,200],
    'max_iter': [10,100,200,None]
}
print("__________________________________________________________________________________________________RF")
ParameterTuning (filename3, df3r0, df3t, target3, numerical_features3, categorical_features3, SelfTrainingClassifier(rf_model), param_grid)
ParameterTuning (filename2, df2r0, df2t, target2, numerical_features2, categorical_features2, SelfTrainingClassifier(rf_model), param_grid)
ParameterTuning (filename1, df1r0, df1t, target1, numerical_features1, categorical_features1, SelfTrainingClassifier(rf_model), param_grid)

print("__________________________________________________________________________________________________GBE")
ParameterTuning (filename3, df3r0, df3t, target3, numerical_features3, categorical_features3, SelfTrainingClassifier(gbe_model), param_grid)
ParameterTuning (filename2, df2r0, df2t, target2, numerical_features2, categorical_features2, SelfTrainingClassifier(gbe_model), param_grid)
ParameterTuning (filename1, df1r0, df1t, target1, numerical_features1, categorical_features1, SelfTrainingClassifier(gbe_model), param_grid)

print("__________________________________________________________________________________________________DT")
ParameterTuning (filename3, df3r0, df3t, target3, numerical_features3, categorical_features3, SelfTrainingClassifier(dt_model), param_grid)
ParameterTuning (filename2, df2r0, df2t, target2, numerical_features2, categorical_features2, SelfTrainingClassifier(dt_model), param_grid)
ParameterTuning (filename1, df1r0, df1t, target1, numerical_features1, categorical_features1, SelfTrainingClassifier(dt_model), param_grid)



__________________________________________________________________________________________________RF
_______________________________________________________________________________ Dataset:heart
Fitting 3 folds for each of 120 candidates, totalling 360 fits
{'criterion': 'threshold', 'k_best': 10, 'max_iter': 10, 'threshold': 0.8}
_______________________________________________________________________________ Dataset:marketing_campaign
Fitting 3 folds for each of 120 candidates, totalling 360 fits
{'criterion': 'threshold', 'k_best': 200, 'max_iter': 10, 'threshold': 0.8}
_______________________________________________________________________________ Dataset:online_shoppers_intentions
Fitting 3 folds for each of 120 candidates, totalling 360 fits
{'criterion': 'threshold', 'k_best': 100, 'max_iter': 200, 'threshold': 0.9}
__________________________________________________________________________________________________GBE
________________________________________________________________

In [16]:
# Label Propogation

param_grid = {
    'kernel': ['knn','rbf'],
    'gamma': [20,40],
    'n_neighbors': [3,5],
    'max_iter': [10,100,200,None],
    'tol':[0.0001, 0.001]
}

ParameterTuning (filename3, df3r0, df3t, target3, numerical_features3, categorical_features3, LabelPropagation(), param_grid)
ParameterTuning (filename2, df2r0, df2t, target2, numerical_features2, categorical_features2, LabelPropagation(), param_grid)
ParameterTuning (filename1, df1r0, df1t, target1, numerical_features1, categorical_features1, LabelPropagation(), param_grid)


_______________________________________________________________________________ Dataset:heart
Fitting 3 folds for each of 512 candidates, totalling 1536 fits
{'gamma': 80, 'kernel': 'rbf', 'max_iter': 10, 'n_neighbors': 3, 'tol': 0.0001}
_______________________________________________________________________________ Dataset:marketing_campaign
Fitting 3 folds for each of 512 candidates, totalling 1536 fits
{'gamma': 20, 'kernel': 'knn', 'max_iter': 10, 'n_neighbors': 7, 'tol': 0.0001}
_______________________________________________________________________________ Dataset:online_shoppers_intentions
Fitting 3 folds for each of 512 candidates, totalling 1536 fits


KeyboardInterrupt: 