In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [2]:
#Import iris dataset
glass = fetch_openml(name='glass')

  " {version}.".format(name=name, version=res[0]['version']))


In [3]:
# Have a look at the dataset
print(glass)

{'data': array([[ 1.51793, 12.79   ,  3.5    , ...,  8.77   ,  0.     ,  0.     ],
       [ 1.51643, 12.16   ,  3.52   , ...,  8.53   ,  0.     ,  0.     ],
       [ 1.51793, 13.21   ,  3.48   , ...,  8.43   ,  0.     ,  0.     ],
       ...,
       [ 1.51613, 13.92   ,  3.52   , ...,  7.94   ,  0.     ,  0.14   ],
       [ 1.51689, 12.67   ,  2.88   , ...,  8.54   ,  0.     ,  0.     ],
       [ 1.51852, 14.09   ,  2.19   , ...,  9.32   ,  0.     ,  0.     ]]), 'target': array(['build wind float', 'vehic wind float', 'build wind float',
       'tableware', 'build wind non-float', 'build wind non-float',
       'vehic wind float', 'build wind float', 'headlamps',
       'build wind non-float', 'build wind non-float',
       'build wind non-float', 'build wind float', 'vehic wind float',
       'vehic wind float', 'build wind non-float', 'headlamps',
       'build wind non-float', 'containers', 'build wind non-float',
       'build wind float', 'build wind non-float', 'build wind non-fl

In [4]:
# Dataset URL
glass.url

'https://www.openml.org/d/41'

In [5]:
# Non graphical EDA
glass.data.shape

(214, 9)

In [6]:
# Non graphical EDA
glass.target.shape

(214,)

In [7]:
# Non graphical EDA
np.unique(glass.target)

array(['build wind float', 'build wind non-float', 'containers',
       'headlamps', 'tableware', 'vehic wind float'], dtype=object)

In [8]:
# Non graphical EDA
glass.DESCR

'**Author**: B. German  \n**Source**: [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/glass+identification) - 1987-09-01  \n**Please cite**: [UCI citation policy](https://archive.ics.uci.edu/ml/citation_policy.html)  \n\n1. Title: Glass Identification Database\n \n 2. Sources:\n     (a) Creator: B. German\n         -- Central Research Establishment\n            Home Office Forensic Science Service\n            Aldermaston, Reading, Berkshire RG7 4PN\n     (b) Donor: Vina Spiehler, Ph.D., DABFT\n                Diagnostic Products Corporation\n                (213) 776-0180 (ext 3014)\n     (c) Date: September, 1987\n \n 3. Past Usage:\n     -- Rule Induction in Forensic Science\n        -- Ian W. Evett and Ernest J. Spiehler\n        -- Central Research Establishment\n           Home Office Forensic Science Service\n           Aldermaston, Reading, Berkshire RG7 4PN\n        -- Unknown technical note number (sorry, not listed here)\n        -- General Results:

In [9]:
# Non graphical EDA
glass.details

{'id': '41',
 'name': 'glass',
 'version': '1',
 'format': 'ARFF',
 'creator': 'B. German',
 'collection_date': '1987-09-01',
 'upload_date': '2014-04-06T23:22:26',
 'language': 'English',
 'licence': 'Public',
 'url': 'https://www.openml.org/data/v1/download/41/glass.arff',
 'file_id': '41',
 'default_target_attribute': 'Type',
 'version_label': '1',
 'citation': 'https://archive.ics.uci.edu/ml/citation_policy.html',
 'tag': ['study_1', 'study_41', 'study_7', 'study_76', 'study_88', 'uci'],
 'visibility': 'public',
 'original_data_url': 'https://archive.ics.uci.edu/ml/datasets/glass+identification',
 'paper_url': 'https://dl.acm.org/doi/abs/10.5555/67040.67055',
 'status': 'active',
 'processing_date': '2020-11-20 20:02:43',
 'md5_checksum': 'd38f2d5484e30ec447bbd8b6d3354460'}

In [10]:
glass.feature_names

['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe']

In [11]:
x = pd.DataFrame(glass.data, columns = glass.feature_names)
x.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.51793,12.79,3.5,1.12,73.03,0.64,8.77,0.0,0.0
1,1.51643,12.16,3.52,1.35,72.89,0.57,8.53,0.0,0.0
2,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.0,0.0
3,1.51299,14.4,1.74,1.54,74.55,0.0,7.59,0.0,0.0
4,1.53393,12.3,0.0,1.0,70.16,0.12,16.19,0.0,0.24


In [12]:
y = pd.DataFrame(glass.target, columns = ['Class'])
y.head()

Unnamed: 0,Class
0,build wind float
1,vehic wind float
2,build wind float
3,tableware
4,build wind non-float


In [17]:
model = LogisticRegression()

accs = []
pres = []
recs = []
f1_scores = []

# Training model with Repeated stratified K fold cross validation
rskf = RepeatedStratifiedKFold(n_splits=10,
                               n_repeats = 10, 
                               random_state=36851234)

for train_index, test_index in rskf.split(x, y):
    model.fit(x.iloc[train_index], y.iloc[train_index])
    y_pred = model.predict(x.iloc[test_index])
    acc_score = accuracy_score(y.iloc[test_index], y_pred)
    prec_score = precision_score(y.iloc[test_index], y_pred, average = 'micro')
    rec_score = recall_score(y.iloc[test_index], y_pred, average = 'micro')
    f1s = f1_score(y.iloc[test_index], y_pred, average = 'micro')
    
    accs.append(acc_score)
    pres.append(prec_score)
    recs.append(rec_score)
    f1_scores.append(f1s)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [18]:
print("Accuracy :", np.mean(accs), "\nPrecision :", 
      np.mean(pres), "\nRecall :", np.mean(rec_score), 
      "\nF1 score :", np.mean(f1_scores) )

Accuracy : 0.6163127581800352 
Precision : 0.6163127581800352 
Recall : 0.3684210526315789 
F1 score : 0.6163127581800352
