In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, r2_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score


print(sklearn.__version__)


1.5.2


In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
glass_identification = fetch_ucirepo(id=42) 
  
# data (as pandas dataframes) 
X = glass_identification.data.features 
y = glass_identification.data.targets 
  
# metadata  & variable information 
# print(glass_identification.metadata) 
print(glass_identification.variables) 

             name     role         type demographic       description  \
0       Id_number       ID      Integer        None              None   
1              RI  Feature   Continuous        None  refractive index   
2              Na  Feature   Continuous        None            Sodium   
3              Mg  Feature   Continuous        None         Magnesium   
4              Al  Feature   Continuous        None          Aluminum   
5              Si  Feature   Continuous        None           Silicon   
6               K  Feature   Continuous        None         Potassium   
7              Ca  Feature   Continuous        None           Calcium   
8              Ba  Feature   Continuous        None            Barium   
9              Fe  Feature   Continuous        None              Iron   
10  Type_of_glass   Target  Categorical        None              None   

                                    units missing_values  
0                                    None             no  
1    

In [4]:
lr = LogisticRegression(random_state=24)

kfold = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)


params = {
    "solver": [
        "lbfgs",
        "liblinear",
        "newton-cg",
        "sag",
        "saga"
        "newton-cholesky"
    ],
    "multi_class": [
        "ovr",
        "multinomial"
    ],
    "C" : np.linspace(0.001, 10, 20),
}

gcv = GridSearchCV(
    estimator=lr,
    param_grid=params,
    cv=kfold
)


In [5]:

gcv.fit(X, y['Type_of_glass'])

print(gcv.best_params_)
print(gcv.best_score_)
 

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)
 

{'C': np.float64(4.737368421052632), 'multi_class': 'ovr', 'solver': 'newton-cg'}
0.6499446290143964
(200, 16)


In [6]:
# tst_df = pd.read_csv(r"C:\Users\DAI.STUDENTSDC\Desktop\Machine Learning\Data Sets\tst_Glass.csv")
tst_df = pd.DataFrame({
    'RI': {0: 1.5321, 1: 1.5212, 2: 1.5112, 3: 1.5, 4: 1.52, 5: 1.51},
    'Na': {0: 14.0, 1: 15.0, 2: 13.0, 3: 12.4, 4: 13.0, 5: 16.0},
    'Mg': {0: 0.0, 1: 3.0, 2: 3.5, 3: 1.23, 4: 2.4, 5: 2.7},
    'Al': {0: 0.34, 1: 1.23, 2: 2.3, 3: 3.22, 4: 0.34, 5: 4.0},
    'Si': {0: 70.23, 1: 75.9, 2: 73.0, 3: 74.22, 4: 71.22, 5: 70.0},
    'K': {0: 0.001, 1: 0.1, 2: 3.4, 3: 4.5, 4: 3.2, 5: 2.0},
    'Ca': {0: 6.7, 1: 7.0, 2: 14.0, 3: 10.0, 4: 9.0, 5: 6.0},
    'Ba': {0: 1.23, 1: 0.0, 2: 2.3, 3: 3.1, 4: 1.44, 5: 2.9},
    'Fe': {0: 0.0, 1: 0.44, 2: 0.22, 3: 0.1, 4: 0.001, 5: 0.89}
    }
)
tst_df.head(10)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.5321,14.0,0.0,0.34,70.23,0.001,6.7,1.23,0.0
1,1.5212,15.0,3.0,1.23,75.9,0.1,7.0,0.0,0.44
2,1.5112,13.0,3.5,2.3,73.0,3.4,14.0,2.3,0.22
3,1.5,12.4,1.23,3.22,74.22,4.5,10.0,3.1,0.1
4,1.52,13.0,2.4,0.34,71.22,3.2,9.0,1.44,0.001
5,1.51,16.0,2.7,4.0,70.0,2.0,6.0,2.9,0.89


Inferencing : Preding on the Unlabeled Data.

Unlabelled Data : The data that we have not labelled yet.

`.predict()` of any classisifcation model will return the class with the highest probability.

`.predict_proba()` will return the probability of each class.



In [7]:
lr_best = LogisticRegression(random_state=24, C=4.211105263157895, multi_class='ovr', solver='newton-cg')

lr_best.fit(X, y)
lr_best.score(X, y)
print(lr_best.coef_)
print(lr_best.intercept_)
print(lr_best.predict(tst_df))
print(lr_best.predict_proba(tst_df).shape)
# all the index with max probabilities
pd.DataFrame(
    lr_best.predict_proba(tst_df), 
    columns=['1', '2', '3','5', '6',' 7']
).idxmax(axis=1).values 


[[ 0.01244673  0.20921703  2.43086457 -1.90313421  1.34836193  0.54099436
   0.97676708  0.97506705 -1.03669683]
 [ 0.03625799 -1.27332113 -0.1503571   0.62804645 -0.80538737 -0.86619956
  -0.34285763 -1.57901333  1.28224798]
 [-0.04872446  0.1217433   0.77095103 -0.38790095 -0.36299031 -0.59241089
  -0.11140579 -0.79608783 -0.11831733]
 [-0.00692819 -1.13791433 -0.84071934  2.63144883 -0.45410722  1.08723726
   0.11080488 -1.81836668 -0.52352313]
 [-0.01203737  2.520513    0.73649736  1.26916706  1.56570215 -2.83001851
   1.09143967 -3.60068311 -0.73869036]
 [ 0.03907187  0.67790933 -0.74742826  1.35922793  0.59348196 -0.20952724
  -0.37065465  1.84006822 -0.94786754]]
[-115.70837466   77.93603243   21.65903191   41.2568665  -164.26811998
  -52.56780405]
[2 6 1 7 1 7]
(6, 6)


array(['2', '6', '1', ' 7', '1', ' 7'], dtype=object)

or

In [8]:
lr_best = gcv.best_estimator_
print(lr_best.predict(tst_df))
print(lr_best.predict_proba(tst_df))


[2 6 1 7 1 7]
[[6.06490293e-06 6.90237024e-01 2.49118390e-02 1.42060701e-03
  1.37083232e-06 2.83423094e-01]
 [4.02386596e-01 7.96590469e-03 1.72697063e-02 4.09155686e-05
  4.35855019e-01 1.36481858e-01]
 [8.42500396e-01 1.25661105e-04 4.91517638e-04 1.99469979e-02
  8.96225405e-09 1.36935418e-01]
 [2.60785287e-01 6.78519761e-05 9.14307615e-06 2.79235676e-01
  2.82965342e-13 4.59902042e-01]
 [9.34789296e-01 2.12047351e-02 9.88749710e-03 7.75815622e-03
  1.93650686e-10 2.63603151e-02]
 [5.24910581e-05 1.34382091e-02 2.87093803e-03 2.18379258e-02
  7.24340175e-09 9.61800429e-01]]


#### Precision Recall F-1 Score

In [9]:
#Without pipeline
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=24, test_size=0.3, stratify=y['Type_of_glass'])
print(y_train['Type_of_glass'].value_counts(normalize=True)*100)
print(y_test['Type_of_glass'].value_counts(normalize=True)*100)

lr = LogisticRegression(solver='lbfgs', multi_class='ovr')

lr.fit(X_train, y_train['Type_of_glass'])

Type_of_glass
2    35.570470
1    32.885906
7    13.422819
3     8.053691
5     6.040268
6     4.026846
Name: proportion, dtype: float64
Type_of_glass
2    35.384615
1    32.307692
7    13.846154
3     7.692308
5     6.153846
6     4.615385
Name: proportion, dtype: float64


In [10]:
y_pred = lr.predict(X_test)
print(accuracy_score(y_test['Type_of_glass'], y_pred))
print(confusion_matrix(y_test['Type_of_glass'], y_pred))
print(classification_report(y_test['Type_of_glass'], y_pred))

0.5692307692307692
[[18  3  0  0  0  0]
 [13 10  0  0  0  0]
 [ 3  2  0  0  0  0]
 [ 0  2  0  1  0  1]
 [ 0  2  0  0  0  1]
 [ 0  1  0  0  0  8]]
              precision    recall  f1-score   support

           1       0.53      0.86      0.65        21
           2       0.50      0.43      0.47        23
           3       0.00      0.00      0.00         5
           5       1.00      0.25      0.40         4
           6       0.00      0.00      0.00         3
           7       0.80      0.89      0.84         9

    accuracy                           0.57        65
   macro avg       0.47      0.41      0.39        65
weighted avg       0.52      0.57      0.52        65



**Using F1 score as parameter scoring='f1_macro'**

In [11]:
gcv = GridSearchCV(
    estimator=lr,
    param_grid=params,
    cv=kfold, scoring="f1_macro"
)

In [12]:
gcv.fit(X, y['Type_of_glass'])

print(gcv.best_params_)
print(gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape) 

{'C': np.float64(8.947473684210527), 'multi_class': 'ovr', 'solver': 'newton-cg'}
0.526898505456462
(200, 16)


In [13]:
lr_best = gcv.best_estimator_
print(lr_best.predict(tst_df))
print(lr_best.predict_proba(tst_df))


[2 1 1 7 1 7]
[[2.49510194e-07 9.72457021e-01 1.18600702e-02 4.40912480e-04
  2.11026280e-03 1.31314842e-02]
 [5.65565572e-01 2.93320690e-03 2.70479667e-02 1.94847632e-05
  1.33158810e-01 2.71274960e-01]
 [5.04438960e-01 1.50915252e-07 1.85083275e-04 8.27867340e-03
  1.72234768e-17 4.87097133e-01]
 [3.87677228e-01 4.65889757e-07 3.47884132e-06 2.21111128e-01
  4.18683627e-21 3.91207700e-01]
 [9.56589764e-01 8.99263642e-03 3.84490750e-03 2.70156093e-03
  5.03151376e-13 2.78711313e-02]
 [2.39507603e-04 2.07735222e-03 1.46931644e-03 1.42861432e-02
  1.55965035e-12 9.81927681e-01]]
