# importing libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import classification_report

Reading the data into a DataFrame

In [5]:
# Load the dataset from a CSV file located relative to the notebook's working directory.
train = pd.read_csv('TestPad_PCB_XYRGB_V2.csv')


In [7]:
# Preview the first 10 rows to sanity-check the columns and value ranges.
train.head(10)

Unnamed: 0,X,Y,R,G,B,Grey
0,105,0,0.91,0.98,0.94,0
1,106,0,0.79,0.9,0.84,0
2,107,0,0.63,0.79,0.69,0
3,108,0,0.47,0.67,0.56,0
4,109,0,0.34,0.58,0.44,0
5,110,0,0.27,0.54,0.39,0
6,111,0,0.25,0.54,0.36,0
7,112,0,0.25,0.55,0.37,0
8,113,0,0.25,0.55,0.36,0
9,114,0,0.25,0.55,0.36,0


getting basic info

In [10]:
# Print column dtypes, non-null counts and the frame's memory footprint.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 723552 entries, 0 to 723551
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   X       723552 non-null  int64  
 1   Y       723552 non-null  int64  
 2   R       723552 non-null  float64
 3   G       723552 non-null  float64
 4   B       723552 non-null  float64
 5   Grey    723552 non-null  int64  
dtypes: float64(3), int64(3)
memory usage: 33.1 MB


Getting the value counts of the target feature

In [13]:
# Count the rows per value of the 'Grey' target column to see the class balance.
train['Grey'].value_counts()

Grey
0    711649
1     11903
Name: count, dtype: int64

# Splitting the dataset

In [16]:
# Hold out 20% of the rows as `test`; stratifying on 'Grey' keeps the class
# ratio the same in both parts. NOTE: `train` is rebound to the 80% part, so
# re-running this cell splits the already-reduced frame again.
train, test = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train['Grey'],
)
print(train.shape, test.shape)

(578841, 6) (144711, 6)




reducing the memory usage

In [21]:
def reduce_memory_usage(df, allow_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their range.

    Only NumPy signed-integer, unsigned-integer and float columns are touched.
    Every other column (object, bool, datetime, categorical, pandas extension
    dtypes such as nullable ``Int64``) is left exactly as it is.

    A column is only ever converted to a *narrower* dtype of the same kind;
    it is never widened. Type limits are inclusive, so e.g. a column whose
    maximum is exactly 127 still fits ``int8``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place and also returned.
    allow_float16 : bool, default True
        When True, float columns whose min/max fit the float16 range are
        stored as float16. The check is on range only: float16 keeps roughly
        three significant decimal digits, so values may be rounded. Pass
        False to use float32 as the narrowest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Memory figures are printed before and after, using
    ``df.memory_usage()`` with its default (shallow) accounting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes per NumPy dtype kind, ordered narrowest first.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    if allow_float16:
        float_types = (np.float16, np.float32, np.float64)
    else:
        float_types = (np.float32, np.float64)
    candidates_by_kind = {
        'i': (signed_types, np.iinfo),
        'u': (unsigned_types, np.iinfo),
        'f': (float_types, np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (categorical, nullable ints, strings, ...) are not
        # np.dtype instances; min/max comparisons on them are not meaningful here.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind not in candidates_by_kind:
            continue  # bool, object, datetime, timedelta, ...: leave untouched

        candidates, limits_of = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            limits = limits_of(candidate)
            # NaN min/max (empty or all-NaN column) fails every comparison,
            # so such a column is simply left as it is.
            if limits.min <= c_min and c_max <= limits.max:
                # Convert only when it actually saves memory (never widen).
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting footprint.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [23]:
# Downcast the numeric columns of both splits to cut their memory footprint.
train = reduce_memory_usage(train)
test = reduce_memory_usage(test)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
train.info()
test.info()

Memory usage of dataframe is 30.91 MB
Memory usage after optimization is: 10.49 MB
Decreased by 66.1%
Memory usage of dataframe is 7.73 MB
Memory usage after optimization is: 2.62 MB
Decreased by 66.1%
<class 'pandas.core.frame.DataFrame'>
Index: 578841 entries, 260400 to 202937
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   X       578841 non-null  int16  
 1   Y       578841 non-null  int16  
 2   R       578841 non-null  float16
 3   G       578841 non-null  float16
 4   B       578841 non-null  float16
 5   Grey    578841 non-null  int8   
dtypes: float16(3), int16(2), int8(1)
memory usage: 10.5 MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 144711 entries, 548451 to 201422
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   X       144711 non-null  int16  
 1   Y       144711 non-null  int16  
 2   R       144711 non-null  float16
 3   G       144711

Taking a random sample of 30,000 rows from the training set

In [26]:
# Down-sample the training frame to 30,000 rows. The seed is fixed (same value
# as the train/test splits in this notebook) so the drawn rows, and therefore
# every downstream score, are reproducible across runs.
train = train.sample(n=30000, random_state=42)
train.shape

(30000, 6)

# Splitting for model training

splitting feature and target columns

In [30]:
# X is the same object as `train`; pop() removes 'Grey' from it in place and
# returns that column as the target. Because the column is removed from `train`
# itself, running this cell a second time raises KeyError.
X = train
y = X.pop('Grey')

Splitting X and y into training and test sets

In [33]:
# 80/20 split of the sampled rows for model comparison, stratified on the
# target so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# Model training

In [36]:
# Fit LazyClassifier's collection of models on the training part and score
# them on X_test / y_test. `models` is the score table displayed below;
# with predictions=True, fit() also returns `predictions`, which holds one
# column of predicted labels per model.
clf = LazyClassifier(verbose=0,predictions=True)
models,predictions = clf.fit(X_train,X_test,y_train,y_test)
models

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:38<00:00,  1.33s/it]

[LightGBM] [Info] Number of positive: 403, number of negative: 23597
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000311 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1111
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.016792 -> initscore=-4.069938
[LightGBM] [Info] Start training from score -4.069938





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoostClassifier,1.0,1.0,1.0,1.0,0.56
BaggingClassifier,1.0,1.0,1.0,1.0,0.1
XGBClassifier,1.0,1.0,1.0,1.0,0.21
RandomForestClassifier,1.0,1.0,1.0,1.0,0.71
LGBMClassifier,1.0,1.0,1.0,1.0,0.19
DecisionTreeClassifier,1.0,1.0,1.0,1.0,0.03
ExtraTreesClassifier,1.0,0.98,0.98,1.0,0.42
GaussianNB,0.99,0.9,0.9,0.99,0.03
NearestCentroid,0.77,0.88,0.88,0.85,0.02
ExtraTreeClassifier,0.99,0.88,0.88,0.99,0.02


In [37]:
# Peek at the first rows of the per-model predictions (one column per model).
predictions.head()

Unnamed: 0,AdaBoostClassifier,BaggingClassifier,BernoulliNB,CalibratedClassifierCV,DecisionTreeClassifier,DummyClassifier,ExtraTreeClassifier,ExtraTreesClassifier,GaussianNB,KNeighborsClassifier,...,PassiveAggressiveClassifier,Perceptron,QuadraticDiscriminantAnalysis,RandomForestClassifier,RidgeClassifier,RidgeClassifierCV,SGDClassifier,SVC,XGBClassifier,LGBMClassifier
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Classification reports

In [39]:
# One classification report per model. The model name is printed first so each
# report can be attributed. zero_division=0 keeps the reported value at 0 for
# models that never predict a class, without emitting an undefined-metric
# warning for every such model.
for model_name in predictions.columns:
    print(model_name)
    print(classification_report(y_test, predictions[model_name], zero_division=0), '\n')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5899
           1       1.00      1.00      1.00       101

    accuracy                           1.00      6000
   macro avg       1.00      1.00      1.00      6000
weighted avg       1.00      1.00      1.00      6000
 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5899
           1       1.00      1.00      1.00       101

    accuracy                           1.00      6000
   macro avg       1.00      1.00      1.00      6000
weighted avg       1.00      1.00      1.00      6000
 

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5899
           1       0.00      0.00      0.00       101

    accuracy                           0.98      6000
   macro avg       0.49      0.50      0.50      6000
weighted avg       0.97      0.98      0.97      6000
 

             