In [3]:
import openml
import pandas as pd
import plotly.express as px 
from sklearn.utils import shuffle
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold 
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

In [4]:
# Fetch the dataset with OpenML ID 1476.
# (The earlier catalogue listing call was dropped: its result was never bound
# or displayed, so it only cost a download of the full dataset index.)
dataset = openml.datasets.get_dataset(1476)

# Load the data as one dataframe; the other three return values are not needed.
# No target column is split off here, so the 'Class' label stays inside X.
X, _, _, _ = dataset.get_data(dataset_format="dataframe")

# **<span style="color:#3c1518">Questions</span>**

### <span style="color:#69140e">Answer to question 1: </span>

<span style="color:#a44200">This archive contains 13910 measurements from 16 chemical sensors utilized in simulations for drift compensation in a discrimination task of 6 gases at various levels of concentrations.
The goal is to achieve good performance (or as low degradation as possible) over time, as reported in the paper mentioned below in Section 2: Data collection.
The primary purpose of providing this dataset is to make it freely accessible online to the chemo-sensor research and artificial intelligence communities, so that they can develop strategies to cope with sensor/concept drift. The dataset can be used exclusively for research purposes. Commercial use is fully excluded.
The dataset was gathered within January 2007 to February 2011 (36 months) in a gas delivery platform facility situated at the ChemoSignals Laboratory in the BioCircuits Institute, University of California San Diego.
The platform is operated entirely by a computerized environment controlled by National Instruments' LabVIEW software running on a PC fitted with the appropriate serial data acquisition boards. The measurement system provides versatility for obtaining the desired concentrations of the chemical substances of interest with high accuracy and in a highly reproducible manner, thereby minimizing the common mistakes caused by human intervention and making it possible to concentrate exclusively on the chemical sensors for compensating real drift.
The resulting dataset comprises recordings from six distinct pure gaseous substances, namely Ammonia, Acetaldehyde, Acetone, Ethylene, Ethanol, and Toluene, each dosed at a wide variety of concentration values ranging from 5 to 1000 ppmv.</span>

### <span style="color:#69140e">Answer to question 2: </span>

<span style="color:#a44200">There was no need to make any changes in order to import the dataset.</span>

### <span style="color:#69140e">Answer to question 3: </span>

<span style="color:#a44200"></span>

# **<span style="color:#3c1518">Preparation of the Dataset</span>**

In [5]:
# Cast every column (features and the 'Class' label alike) to int64.
# NOTE(review): if the sensor readings are fractional this truncates them —
# confirm that is intended rather than casting only 'Class'.
data_int = X.astype('int64')

# Separate the label column from the feature columns.
y = data_int['Class']
X = data_int.drop(columns=['Class'])

# Shuffle rows of features and labels together with a fixed seed,
# then renumber both so their indices run 0..n-1 again.
X, y = shuffle(X, y, random_state=0)
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

## <span style="color:#69140e">Train - Test Split </span>

In [6]:
# 70/30 hold-out split on the already-shuffled rows.
# The test set starts exactly at the split position; the previous "+1" offset
# left the row at that position out of both the training and the test set.
split_idx = int(0.7 * len(X))
Xtrain, Xtest = X.iloc[:split_idx], X.iloc[split_idx:]
ytrain, ytest = y.iloc[:split_idx], y.iloc[split_idx:]

In [5]:
# Empty results table; each classifier cell appends one row to it.
metric_columns = ['Classifier', 'Mean Accuracy', 'F1 Score']
metrics = pd.DataFrame(columns=metric_columns)

# **<span style="color:#3c1518">Classifiers</span>**

## <span style="color:#69140e">Dummy Classifier</span>

In [6]:
# Baseline: a classifier that guesses labels uniformly at random.
# The seed makes the random guesses (and therefore the scores) repeatable.
model = DummyClassifier(strategy="uniform", random_state=0)
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

# k-fold cross-validation restricted to the training portion of the data.
for train_index, test_index in kf.split(Xtrain):
    # kf.split yields row positions within Xtrain, so select from
    # Xtrain/ytrain positionally instead of from the full X/y.
    X_train, X_test = Xtrain.iloc[train_index], Xtrain.iloc[test_index]
    y_train, y_test = ytrain.iloc[train_index], ytrain.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # Derive both metrics from the same predictions (one predict call per fold).
    accuracy = (pred_values == y_test.to_numpy()).mean()
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, pred_values, average='macro')
    f1_scores.append(f1Score)

avg_acc_score = sum(acc_score) / k
avg_f1_score = sum(f1_scores) / k

# Append this classifier's mean scores to the shared results table.
metrics.loc[len(metrics)] = ['Dummy', avg_acc_score, avg_f1_score]

# Per-fold bar charts, styled identically for both metrics.
chart_font = dict(family="Verdana, monospace", size=18, color="#3c1518")
bar_style = dict(marker_color='#a44200', marker_line_color='rgb(8,48,107)',
                 marker_line_width=1.0)

figAccScores = px.bar(acc_score, title="Accuracy diagram", width=1000)
figAccScores.update_layout(xaxis_title="Iterations", yaxis_title="Accuracy",
                           font=chart_font)
figAccScores.update_traces(**bar_style)

figF1Scores = px.bar(f1_scores, title="F1 score diagram", width=1000)
figF1Scores.update_layout(xaxis_title="Iterations", yaxis_title="F1 Score",
                          font=chart_font)
figF1Scores.update_traces(**bar_style)

figAccScores.show()
figF1Scores.show()

## <span style="color:#69140e">Gaussian Naive Bayes</span>

In [7]:
# Gaussian Naive Bayes with default settings.
model = GaussianNB()
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

# k-fold cross-validation restricted to the training portion of the data.
for train_index, test_index in kf.split(Xtrain):
    # kf.split yields row positions within Xtrain, so select from
    # Xtrain/ytrain positionally instead of from the full X/y.
    X_train, X_test = Xtrain.iloc[train_index], Xtrain.iloc[test_index]
    y_train, y_test = ytrain.iloc[train_index], ytrain.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # Derive both metrics from the same predictions (one predict call per fold).
    accuracy = (pred_values == y_test.to_numpy()).mean()
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, pred_values, average='macro')
    f1_scores.append(f1Score)

avg_acc_score = sum(acc_score) / k
avg_f1_score = sum(f1_scores) / k

# Append this classifier's mean scores to the shared results table.
metrics.loc[len(metrics)] = ['Gaussian Naive Bayes', avg_acc_score, avg_f1_score]

# Per-fold bar charts, styled identically for both metrics.
chart_font = dict(family="Verdana, monospace", size=18, color="#3c1518")
bar_style = dict(marker_color='#a44200', marker_line_color='rgb(8,48,107)',
                 marker_line_width=1.0)

figAccScores = px.bar(acc_score, title="Accuracy diagram", width=1000)
figAccScores.update_layout(xaxis_title="Iterations", yaxis_title="Accuracy",
                           font=chart_font)
figAccScores.update_traces(**bar_style)

figF1Scores = px.bar(f1_scores, title="F1 score diagram", width=1000)
figF1Scores.update_layout(xaxis_title="Iterations", yaxis_title="F1 Score",
                          font=chart_font)
figF1Scores.update_traces(**bar_style)

figAccScores.show()
figF1Scores.show()

## <span style="color:#69140e">kNN</span>

In [8]:
# k-nearest-neighbours classifier with default settings.
model = KNeighborsClassifier()
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

# k-fold cross-validation restricted to the training portion of the data.
for train_index, test_index in kf.split(Xtrain):
    # kf.split yields row positions within Xtrain, so select from
    # Xtrain/ytrain positionally instead of from the full X/y.
    X_train, X_test = Xtrain.iloc[train_index], Xtrain.iloc[test_index]
    y_train, y_test = ytrain.iloc[train_index], ytrain.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # Derive both metrics from the same predictions (one predict call per fold).
    accuracy = (pred_values == y_test.to_numpy()).mean()
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, pred_values, average='macro')
    f1_scores.append(f1Score)

avg_acc_score = sum(acc_score) / k
avg_f1_score = sum(f1_scores) / k

# Append this classifier's mean scores to the shared results table.
metrics.loc[len(metrics)] = ['kNN', avg_acc_score, avg_f1_score]

# Per-fold bar charts, styled identically for both metrics.
chart_font = dict(family="Verdana, monospace", size=18, color="#3c1518")
bar_style = dict(marker_color='#a44200', marker_line_color='rgb(8,48,107)',
                 marker_line_width=1.0)

figAccScores = px.bar(acc_score, title="Accuracy diagram", width=1000)
figAccScores.update_layout(xaxis_title="Iterations", yaxis_title="Accuracy",
                           font=chart_font)
figAccScores.update_traces(**bar_style)

figF1Scores = px.bar(f1_scores, title="F1 score diagram", width=1000)
figF1Scores.update_layout(xaxis_title="Iterations", yaxis_title="F1 Score",
                          font=chart_font)
figF1Scores.update_traces(**bar_style)

figAccScores.show()
figF1Scores.show()

## <span style="color:#69140e">Logistic Regression</span>

In [9]:
# Logistic regression with default settings and a fixed seed.
# NOTE(review): the recorded run of this cell shows lbfgs stopping at its
# iteration limit; consider a larger max_iter or scaling the features.
model = LogisticRegression(random_state=0)
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

# k-fold cross-validation restricted to the training portion of the data.
for train_index, test_index in kf.split(Xtrain):
    # kf.split yields row positions within Xtrain, so select from
    # Xtrain/ytrain positionally instead of from the full X/y.
    X_train, X_test = Xtrain.iloc[train_index], Xtrain.iloc[test_index]
    y_train, y_test = ytrain.iloc[train_index], ytrain.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # Derive both metrics from the same predictions (one predict call per fold).
    accuracy = (pred_values == y_test.to_numpy()).mean()
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, pred_values, average='macro')
    f1_scores.append(f1Score)

avg_acc_score = sum(acc_score) / k
avg_f1_score = sum(f1_scores) / k

# Append this classifier's mean scores to the shared results table.
metrics.loc[len(metrics)] = ['Logistic Regression', avg_acc_score, avg_f1_score]

# Per-fold bar charts, styled identically for both metrics.
chart_font = dict(family="Verdana, monospace", size=18, color="#3c1518")
bar_style = dict(marker_color='#a44200', marker_line_color='rgb(8,48,107)',
                 marker_line_width=1.0)

figAccScores = px.bar(acc_score, title="Accuracy diagram", width=1000)
figAccScores.update_layout(xaxis_title="Iterations", yaxis_title="Accuracy",
                           font=chart_font)
figAccScores.update_traces(**bar_style)

figF1Scores = px.bar(f1_scores, title="F1 score diagram", width=1000)
figF1Scores.update_layout(xaxis_title="Iterations", yaxis_title="F1 Score",
                          font=chart_font)
figF1Scores.update_traces(**bar_style)

figAccScores.show()
figF1Scores.show()


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to th

## <span style="color:#69140e">Multi Layer Perceptron</span>

In [10]:
# Multi-layer perceptron with default architecture.
# The seed fixes weight initialisation and batch shuffling so that the
# per-fold scores are repeatable across runs.
model = MLPClassifier(random_state=0)
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

# k-fold cross-validation restricted to the training portion of the data.
for train_index, test_index in kf.split(Xtrain):
    # kf.split yields row positions within Xtrain, so select from
    # Xtrain/ytrain positionally instead of from the full X/y.
    X_train, X_test = Xtrain.iloc[train_index], Xtrain.iloc[test_index]
    y_train, y_test = ytrain.iloc[train_index], ytrain.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # Derive both metrics from the same predictions (one predict call per fold).
    accuracy = (pred_values == y_test.to_numpy()).mean()
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, pred_values, average='macro')
    f1_scores.append(f1Score)

avg_acc_score = sum(acc_score) / k
avg_f1_score = sum(f1_scores) / k

# Append this classifier's mean scores to the shared results table.
metrics.loc[len(metrics)] = ['Multi Layer Perceptron', avg_acc_score, avg_f1_score]

# Per-fold bar charts, styled identically for both metrics.
chart_font = dict(family="Verdana, monospace", size=18, color="#3c1518")
bar_style = dict(marker_color='#a44200', marker_line_color='rgb(8,48,107)',
                 marker_line_width=1.0)

figAccScores = px.bar(acc_score, title="Accuracy diagram", width=1000)
figAccScores.update_layout(xaxis_title="Iterations", yaxis_title="Accuracy",
                           font=chart_font)
figAccScores.update_traces(**bar_style)

figF1Scores = px.bar(f1_scores, title="F1 score diagram", width=1000)
figF1Scores.update_layout(xaxis_title="Iterations", yaxis_title="F1 Score",
                          font=chart_font)
figF1Scores.update_traces(**bar_style)

figAccScores.show()
figF1Scores.show()

## <span style="color:#69140e">SVM</span>

In [12]:
# Support vector classifier with gamma='auto'.
model = SVC(gamma='auto')
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

# k-fold cross-validation restricted to the training portion of the data.
for train_index, test_index in kf.split(Xtrain):
    # kf.split yields row positions within Xtrain, so select from
    # Xtrain/ytrain positionally instead of from the full X/y.
    X_train, X_test = Xtrain.iloc[train_index], Xtrain.iloc[test_index]
    y_train, y_test = ytrain.iloc[train_index], ytrain.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # Derive both metrics from the same predictions (one predict call per fold).
    accuracy = (pred_values == y_test.to_numpy()).mean()
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, pred_values, average='macro')
    f1_scores.append(f1Score)

avg_acc_score = sum(acc_score) / k
avg_f1_score = sum(f1_scores) / k

# Record the row under the SVM label; it was previously stored as
# 'Multi Layer Perceptron', duplicating that classifier's name in the table.
metrics.loc[len(metrics)] = ['SVM', avg_acc_score, avg_f1_score]

# Per-fold bar charts, styled identically for both metrics.
chart_font = dict(family="Verdana, monospace", size=18, color="#3c1518")
bar_style = dict(marker_color='#a44200', marker_line_color='rgb(8,48,107)',
                 marker_line_width=1.0)

figAccScores = px.bar(acc_score, title="Accuracy diagram", width=1000)
figAccScores.update_layout(xaxis_title="Iterations", yaxis_title="Accuracy",
                           font=chart_font)
figAccScores.update_traces(**bar_style)

figF1Scores = px.bar(f1_scores, title="F1 score diagram", width=1000)
figF1Scores.update_layout(xaxis_title="Iterations", yaxis_title="F1 Score",
                          font=chart_font)
figF1Scores.update_traces(**bar_style)

figAccScores.show()
figF1Scores.show()

# **<span style="color:#3c1518">Optimization</span>**

## <span style="color:#69140e">Dummy Classifier</span>

In [13]:
# Successive-halving grid search over the DummyClassifier strategies,
# fitted on the training portion only.
param_grid = {"strategy": ['most_frequent', 'prior', 'stratified', 'uniform']}
model = DummyClassifier()
search = HalvingGridSearchCV(
    estimator=model,
    param_grid=param_grid,
    max_resources='auto',
    random_state=0,
)
search.fit(Xtrain, ytrain)

# Report the strategy chosen by the search.
best_strategy = search.best_params_['strategy']
print('The best strategy is: ', best_strategy)


The best strategy is:  most_frequent
