In [2]:
import openml
import pandas as pd
import plotly.express as px 
from sklearn.utils import shuffle
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold 
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [6]:
# List all datasets and their properties
openml.datasets.list_datasets(output_format="dataframe")

# Get dataset by ID
dataset = openml.datasets.get_dataset(1476)

# Get the data itself as a dataframe (or otherwise)
X, _, _, _ = dataset.get_data(dataset_format="dataframe")

# **Questions** 

### Answer to question 1:

This archive contains 13910 measurements from 16 chemical sensors utilized in simulations for drift compensation in a discrimination task of 6 gases at various levels of concentrations.
The goal is to achieve good performance (or as low degradation as possible) over time, as reported in the paper mentioned below in Section 2: Data collection.
The primary purpose of providing this dataset is to make it freely accessible online to the chemo-sensor research community and artificial intelligence to develop strategies to cope with sensor/concept drift. The dataset can be used exclusively for research purposes. Commercial purposes are fully excluded.
The dataset was gathered within January 2007 to February 2011 (36 months) in a gas delivery platform facility situated at the ChemoSignals Laboratory in the BioCircuits Institute, University of California San Diego.
Being completely operated by a fully computerized environment controlled by a LabVIEW's National Instruments software on a PC fitted with the appropriate serial data acquisition boards. The measurement system platform provides versatility for obtaining the desired concentrations of the chemical substances of interest with high accuracy and in a highly reproducible manner, minimizing thereby the common mistakes caused by human intervention and making it possible to exclusively concentrate on the chemical sensors for compensating real drift.
The resulting dataset comprises recordings from six distinct pure gaseous substances, namely Ammonia, Acetaldehyde, Acetone, Ethylene, Ethanol, and Toluene, each dosed at a wide variety of concentration values ranging from 5 to 1000 ppmv

### Answer to question 2:

There was no need to make any changes in order to import the dataset. 

### Answer to question 3:



# **Preparation of the dataset**

In [7]:
X = X.astype('int64')
y = X['Class']
X = X.drop(columns=['Class'])
X,y = shuffle(X,y,random_state=0)
X = X.reset_index(drop= True)
y = y.reset_index(drop= True)

In [8]:
# Train - Test split

Xtrain = X[0:int(0.7*len(X))]
Xtest = X[int(0.7*len(X))+1:]
ytrain = y[0:int(0.7*len(X))]
ytest = y[int(0.7*len(X))+1:]

In [9]:
metrics = pd.DataFrame(columns=['Classifier','Mean Accuracy', 'F1 Score'])

# **Classifiers**

## <span style="color:#6E7866">Dummy Classifier</span>

In [35]:
model = DummyClassifier(strategy="uniform")
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

for train_index , test_index in kf.split(Xtrain):
    X_train , X_test = X.iloc[train_index],X.iloc[test_index]
    y_train , y_test = y[train_index] , y[test_index]
     
    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)
     
    accuracy = model.score(X_test, y_test)
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, model.predict(X_test), average='macro')
    f1_scores.append(f1Score)
     
avg_acc_score = sum(acc_score)/k
avg_f1_score = sum(f1_scores)/k

metrics.loc[len(metrics)] = ['Dummy', avg_acc_score, avg_f1_score]

figAccScores = px.bar(acc_score, title='Accuracy',width=1000)
figAccScores.update_layout(
    title="Accuracy diagram",
    xaxis_title="Iterations",
    yaxis_title="Accuracy",
    font=dict(
    family="Verdana, monospace",
    size=18,
    color="#6E7866")   
)
figF1Scores = px.bar(f1_scores, title='F1 Scores',width=1000)
figF1Scores.update_layout(
    title="F1 scores diagram",
    xaxis_title="Iterations",
    yaxis_title="Accuracy",
    font=dict(
    family="Verdana, monospace",
    size=18,
    color="#6E7866")   
)
figAccScores.show()
figF1Scores.show()

In [138]:
# Gaussian Naive Bayes 

model = GaussianNB()
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

for train_index , test_index in kf.split(Xtrain):
    X_train , X_test = X.iloc[train_index],X.iloc[test_index]
    y_train , y_test = y[train_index] , y[test_index]
     
    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)
    
    accuracy = model.score(X_test, y_test)
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, model.predict(X_test), average='macro')
    f1_scores.append(f1Score)
     
avg_acc_score = sum(acc_score)/k
avg_f1_score = sum(f1_scores)/k

metrics.loc[len(metrics)] = ['Gaussian Naive Bayes', avg_acc_score, avg_f1_score]

In [139]:
# kNN

model = KNeighborsClassifier()
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

for train_index , test_index in kf.split(Xtrain):
    X_train , X_test = X.iloc[train_index],X.iloc[test_index]
    y_train , y_test = y[train_index] , y[test_index]
     
    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)
    
    accuracy = model.score(X_test, y_test)
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, model.predict(X_test), average='macro')
    f1_scores.append(f1Score)
     
avg_acc_score = sum(acc_score)/k
avg_f1_score = sum(f1_scores)/k

metrics.loc[len(metrics)] = ['kNN', avg_acc_score, avg_f1_score]

In [140]:
# Logistic Regression 

model = LogisticRegression(random_state=0)
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

for train_index , test_index in kf.split(Xtrain):
    X_train , X_test = X.iloc[train_index],X.iloc[test_index]
    y_train , y_test = y[train_index] , y[test_index]
     
    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)
    
    accuracy = model.score(X_test, y_test)
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, model.predict(X_test), average='macro')
    f1_scores.append(f1Score)
     
avg_acc_score = sum(acc_score)/k
avg_f1_score = sum(f1_scores)/k

metrics.loc[len(metrics)] = ['Logistic Regression', avg_acc_score, avg_f1_score]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [144]:
# Multi Layer Perceptron

model = MLPClassifier()
acc_score = []
f1_scores = []
k = 10
kf = KFold(n_splits=k, random_state=None)

for train_index , test_index in kf.split(Xtrain):
    X_train , X_test = X.iloc[train_index],X.iloc[test_index]
    y_train , y_test = y[train_index] , y[test_index]
     
    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)
    
    accuracy = model.score(X_test, y_test)
    acc_score.append(accuracy)

    f1Score = f1_score(y_test, model.predict(X_test), average='macro')
    f1_scores.append(f1Score)
     
avg_acc_score = sum(acc_score)/k
avg_f1_score = sum(f1_scores)/k

metrics.loc[len(metrics)] = ['Multi Layer Perceptron', avg_acc_score, avg_f1_score]