In [None]:
import pandas as pd
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go

import warnings

# Install a global "ignore" filter: warnings raised after this line are not shown.
warnings.filterwarnings("ignore")

In [None]:
# The CSV is read with header=None, so the column labels are supplied here,
# in file order: ID, twelve feature columns, then one column per substance.
_drug_csv_columns = [
    'ID', 'Age', 'Gender', 'Education', 'Country', 'Ethnicity',
    'Neuroticism', 'Extraversion', 'Openness-to-experience', 'Agreeableness',
    'Conscientiousness', 'Impulsive', 'Sensation-seeking',
    'Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc', 'Coke',
    'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth',
    'Mushrooms', 'Nicotine', 'Semer', 'VSA',
]

dataset_d = pd.read_csv(
    'data/drug_consumption.csv',
    header=None,
    names=_drug_csv_columns,
)

In [None]:
# Substance columns that are collapsed to a binary label and later plotted.
drugs = [
    'Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Caff', 'Cannabis', 'Choc',
    'Coke', 'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD',
    'Meth', 'Mushrooms', 'Nicotine', 'Semer', 'VSA',
]

# CL0 and CL1 map to "Non-User"; CL2 through CL6 map to "User".
_usage_labels = {code: "Non-User" for code in ('CL0', 'CL1')}
_usage_labels.update({code: "User" for code in ('CL2', 'CL3', 'CL4', 'CL5', 'CL6')})

# The replacement runs over the whole frame; only cells holding one of the
# CL codes are affected.
dataset_d_binary = dataset_d.replace(_usage_labels)
dataset_d_binary_drugs = dataset_d_binary[drugs]

In [None]:
# Count each label per substance column: one row per distinct label,
# one column per substance.
drugs_value_count = dataset_d_binary_drugs.apply(lambda column: column.value_counts())

In [None]:
# Select the count rows by label instead of by position, so each trace is
# guaranteed to carry the counts its legend name announces, whatever order the
# value-count table happens to be in.
user_counts = drugs_value_count.loc['User']
non_user_counts = drugs_value_count.loc['Non-User']

trace1 = go.Bar(
    x=drugs,
    y=user_counts,
    name='User',
    marker=dict(color="rgb(117, 127, 221)")
)
trace2 = go.Bar(
    x=drugs,
    y=non_user_counts,
    name='Non-User',
    marker=dict(color="rgb(191, 221, 229)")
)

# Grouped layout: the two bars of each substance are drawn side by side.
data = [trace1, trace2]
layout = go.Layout(
    title='Drug Vs User Or Non-user',
    yaxis=dict(title='Count', ticklen=5, gridwidth=2),
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='grouped-bar')

In [None]:
# Number of "User" rows per substance, selected by label rather than by row
# position so the chart cannot silently show the "Non-User" counts.
pk = drugs_value_count.loc['User']

# One integer per bar; the colorscale maps these to distinct bar colours.
col = list(range(len(pk)))
data = [
    go.Bar(
        x=list(pk.index),
        y=list(pk.values),
        marker=dict(color=col, colorscale='Jet', showscale=False)
    ), ]
layout = go.Layout(
    title='Used Drugs Vs Number of Users',
    yaxis=dict(title='Users', ticklen=5, gridwidth=2),
    xaxis=dict(title='Drugs', ticklen=5, gridwidth=2),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Drug-Count')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

from scipy.stats import friedmanchisquare
from scikit_posthocs import posthoc_nemenyi_friedman

In [None]:
# Every estimator that accepts a seed is given random_state=42 so repeated runs
# produce the same scores. SVC and KNeighborsClassifier are left at their
# defaults.

# Smaller model set, keyed by display name.
models_1 = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest Classifier': RandomForestClassifier(n_estimators=100, random_state=42),
    'Support Vector Machines': SVC(),
    'KNN': KNeighborsClassifier(),
}

# Full model set. The key order is also the column order of the score lists
# returned when this dict is iterated.
models_2 = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest Classifier': RandomForestClassifier(n_estimators=100, random_state=42),
    'Support Vector Machines': SVC(),
    'KNN': KNeighborsClassifier(),
    'Multi Layer Perceptron': MLPClassifier(hidden_layer_sizes=(256, 128, 64, 32), solver="adam",
                                            learning_rate='adaptive', activation="relu", random_state=42),
    'Gradient Boosting Ensemble': GradientBoostingClassifier(n_estimators=50, random_state=42),
}

In [None]:
le = LabelEncoder()


def _build_drug_dataset(drug):
    """Return the feature columns plus the label-encoded target column ``drug``.

    The features are the first 13 columns of ``dataset_d_binary`` without 'ID'.
    The target is appended as the last column under its own name, encoded to
    integers with the shared ``le`` encoder.
    """
    features = dataset_d_binary.iloc[:, 0:13].drop('ID', axis=1)
    return features.assign(**{drug: le.fit_transform(dataset_d_binary[drug])})


benzos_dataset_d = _build_drug_dataset('Benzos')
coke_dataset_d = _build_drug_dataset('Coke')
# The encoded labels are stored under 'Ecstasy'. Previously they were written to
# a new 'Coke' column and 'Ecstasy' kept its raw string labels.
estacy_dataset_d = _build_drug_dataset('Ecstasy')

In [None]:
# Scorer names. `scoring` is passed to train_models by the cells that follow;
# `scoring_1` is defined but not referenced in the visible cells.
scoring, scoring_1 = 'accuracy', 'f1'

# Shuffled 10-fold splitter with a fixed seed.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Benzos experiment: the 'Benzos' column is the target, all other columns are features.
y = benzos_dataset_d['Benzos']
X = benzos_dataset_d.drop(columns='Benzos')

In [None]:
def train_models(models, _X, _y, _cv, _scoring):
    """Fit and cross-validate every model, returning the mean score of each.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator object exposing ``fit``.
    _X, _y
        Features and target used both for the fit and for cross-validation.
    _cv
        Forwarded unchanged as ``cv`` to ``cross_val_score``.
    _scoring
        Forwarded unchanged as ``scoring`` to ``cross_val_score``.

    Returns
    -------
    list
        One mean cross-validation score per model, in ``models`` iteration order.
    """
    model_performance = []
    for name, model in models.items():
        # Fit on the data passed in. This used to read the notebook-level
        # ``X``/``y``, so every call fitted on whatever those globals held,
        # regardless of the dataset being evaluated.
        model.fit(_X, _y)
        _scores = cross_val_score(estimator=model, X=_X, y=_y, scoring=_scoring, cv=_cv)
        mean_score = _scores.mean()
        model_performance.append(mean_score)
        print(name)
        # The label is fixed text; it is printed whatever ``_scoring`` is.
        print('Mean Accuracy: {:.2%}'.format(mean_score))
        print()
    return model_performance

In [None]:
# Score the models_1 estimators on X / y (the Benzos features and target) with
# the shared splitter `cv` and the 'accuracy' scorer; the returned list is the cell output.
train_models(models_1, X, y, cv, scoring)

In [None]:
# `over` is kept for compatibility but is not used below.
over = RandomOverSampler(random_state=42)
# SMOTE is seeded like every other stochastic step so the resampled data, and
# therefore the scores computed on it, are reproducible across runs.
over_smote = SMOTE(random_state=42)
X_db1, y_db1 = over_smote.fit_resample(X, y)

# NOTE(review): the resampling above is done by SMOTE, not by the
# RandomOverSampler held in `over`, although the label below says
# "Random Over Sampling". Confirm which sampler is intended.
print('Random Over Sampling')

train_models(models_1, X_db1, y_db1, cv, scoring)

In [None]:
# Resample X / y with a seeded RandomUnderSampler; the result is stored as X_db2 / y_db2.
under = RandomUnderSampler(random_state=42)
X_db2, y_db2 = under.fit_resample(X, y)

print('Random Under Sampling')

# Score the models_1 estimators on the under-sampled data.
train_models(models_1, X_db2, y_db2, cv, scoring)

In [None]:
# Run the models_2 estimators on the three variants of the Benzos data and keep
# the per-model mean scores of each run.

# Original X / y.
print('NORMAL')
performance_d = train_models(models_2, X, y, cv, scoring)

# X_db1 / y_db1 are the output of `over_smote.fit_resample(X, y)`.
print('RANDOM OVER SAMPLING')
performance_d1 = train_models(models_2, X_db1, y_db1, cv, scoring)

# X_db2 / y_db2 are the output of `under.fit_resample(X, y)`.
print('RANDOM UNDER SAMPLING')
performance_d2 = train_models(models_2, X_db2, y_db2, cv, scoring)

In [None]:
dataset_h = pd.read_csv('data/heart_cleveland_upload.csv')

# The 'condition' column is the target; every remaining column is a feature.
y_h = dataset_h['condition']
X_h = dataset_h.drop(columns='condition')

print('Heart Disease')

# Same model set, splitter and scorer as the other datasets.
performance_h = train_models(models_2, X_h, y_h, cv, scoring)

In [None]:
# Load the labor data; cells containing "?" are read as missing values (NaN).
dataset_l = pd.read_csv('data/labor.csv', na_values=["?"])
# Bare expression: the notebook renders the frame as the cell output.
dataset_l

In [None]:
# Print the frame summary (columns, non-null counts, dtypes).
dataset_l.info()

In [None]:
# Number of missing values per column.
dataset_l.isna().sum()

In [None]:
# Bar chart of how many rows fall into each value of the 'class' column.
sns.countplot(x=dataset_l['class'])

In [None]:
le = LabelEncoder()
features = ['cost-of-living-adjustment', 'pension', 'education-allowance', 'vacation', 'longterm-disability-assistance',
            'contribution-to-dental-plan', 'bereavement-assistance', 'contribution-to-health-plan', 'class']
for feature in features:
    column = dataset_l[feature]
    observed = column.notna()
    if observed.all():
        # Nothing missing: encode the whole column, which stays integer-typed.
        dataset_l[feature] = le.fit_transform(column)
        continue
    # Encode only the observed entries and leave the gaps as NaN. Encoding the
    # full column would give the "?" placeholders (read as NaN) an integer code
    # of their own, hiding them from the KNNImputer step that follows.
    encoded = pd.Series(float('nan'), index=column.index, dtype='float64')
    if observed.any():
        encoded.loc[observed] = le.fit_transform(column[observed])
    dataset_l[feature] = encoded

In [None]:
# Fill the remaining NaN entries with a KNNImputer using 2 neighbours and uniform
# weights. X_l is the imputer output for every column except 'class';
# y_l is the 'class' column itself.
knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")
X_l = knn_imputer.fit_transform(dataset_l.drop('class', axis=1))
y_l = dataset_l['class']

print('LABOUR')

# Same model set, splitter and scorer as the other datasets.
performance_l = train_models(models_2, X_l, y_l, cv, scoring)

In [None]:
# One score list per dataset, in the same order as `dataset_labels` below.
performance = [performance_d, performance_d1, performance_d2, performance_h, performance_l]

# Column labels for the score lists, one per model.
model_names = [
    'Decision Tree',
    'Random Forest Classifier',
    'Support Vector Machines',
    'KNN',
    'Multi Layer Perceptron',
    'Gradient Boosting Ensemble',
]
dataset_labels = ['D', 'DB1', 'DB2', 'heart-disease', 'labor-relations']

# One row per dataset, one column per model, with the dataset label as first column.
performance_df = pd.DataFrame(performance, columns=model_names)
performance_df.insert(0, "Dataset", dataset_labels)
performance_df

In [None]:
# compare models
# Only the numeric score columns take part in the tests. The 'Dataset' column
# holds string labels and must not be passed to the ranking routines.
model_scores = performance_df.drop(columns='Dataset')

# Friedman test: one sample per model, measured on the same datasets (rows).
stat, p = friedmanchisquare(*(model_scores[name] for name in model_scores.columns))
print('Statistics=%.3f, p=%.3f' % (stat, p))
# interpret
alpha = 0.05
if p > alpha:
    print('Same distributions (fail to reject H0)')
else:
    print('Different distributions (reject H0)')
    # Pairwise post-hoc comparison, run only when the Friedman test rejects H0.
    # The input is the block-design matrix: rows are datasets, columns are models.
    print(posthoc_nemenyi_friedman(model_scores))