<a href="https://colab.research.google.com/github/Yassmina-Abdo/dry-beans-classification/blob/main/Notebooks/drybeans_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
# Mount Google Drive at /content/drive; every dataset path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

# 1. Upsampling By SMOTE

In [None]:
# Load the raw training set; the ID column is used as the index, not as a feature
df_train = pd.read_csv(
    '/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Original/train.csv',
    index_col='ID',
)
print(df_train.shape)

# Manual integer encoding of the target classes
class_codes = {'BARBUNYA':1,'BOMBAY':2,'CALI':3,'DERMASON':4,'HOROZ':5,'SEKER':6,'SIRA':7}

# Order the rows by class name first, then swap the names for their codes
df_train = df_train.sort_values('y').replace({'y': class_codes})

(10834, 17)


In [None]:
def get_numofsamples(df):
    """Print the number of rows of ``df`` for each distinct value of its ``y`` column.

    Labels are reported in the order in which they first appear in ``df.y``.
    The function only prints; it returns nothing.
    """
    target = df.y
    for label in target.unique():
        # Boolean mask sum == number of rows carrying this label
        count = int((target == label).sum())
        print(f'Number of samples in class {label} = {count}')


In [None]:
# Class distribution of the original training set, before any oversampling
get_numofsamples(df_train)

Number of samples in class 1 = 1057
Number of samples in class 2 = 418
Number of samples in class 3 = 1304
Number of samples in class 4 = 2837
Number of samples in class 5 = 1488
Number of samples in class 6 = 1621
Number of samples in class 7 = 2109


In [None]:
# Upsample each class to 3000 samples with SMOTE
data = df_train.values
# The last column is the encoded target; ID is the index, so it is not part of `data`
x, y = data[:, :-1], data[:, -1]

# Requested number of samples per encoded class label (1..7)
strategy = {label: 3000 for label in range(1, 8)}
# Fixed seed so the synthetic samples are the same on every Restart & Run All
oversample = SMOTE(sampling_strategy=strategy, random_state=42)
features, target = oversample.fit_resample(x, y)

# Convert back to a DataFrame carrying the original column names
column_names = list(df_train.columns)
train_data = pd.DataFrame(features, columns=column_names[:-1])
train_data[column_names[-1]] = target

# Cast the first feature column and the target back to int, by name.
# The earlier version did this through positional label 0, which only reached the
# target because concat left two columns labelled 0 (and it is not the ID column).
# NOTE(review): the cast truncates interpolated values of the first feature —
# presumably that feature is integer-valued in the raw data; confirm.
train_data = train_data.astype({column_names[0]: int, column_names[-1]: int})

In [None]:
# Class distribution after SMOTE oversampling
get_numofsamples(train_data)

Number of samples in class 1 = 3000
Number of samples in class 2 = 3000
Number of samples in class 3 = 3000
Number of samples in class 4 = 3000
Number of samples in class 5 = 3000
Number of samples in class 6 = 3000
Number of samples in class 7 = 3000


In [None]:
# Persist the oversampled set; the frame's index is written out under the header 'ID'
oversampled_path = '/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Preprocessing/trainvalid_oversample.csv'
train_data.to_csv(oversampled_path, index_label='ID')

# 2. Cross Validation

In [None]:
# Reload the oversampled set written above, using its ID column as the index
oversampled_data = pd.read_csv(
    '/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Preprocessing/trainvalid_oversample.csv',
    index_col='ID',
)

# Work on the raw array: every column but the last is a feature, the last is the target
data = oversampled_data.values
x = data[:, :-1]
y = data[:, -1]

In [None]:
# 5-fold stratified splitter. Shuffling is left off (the default), so the fold
# membership is fully determined by the row order of the oversampled file.
skf = StratifiedKFold(n_splits=5)

# Column names re-applied to every exported fold
col_names = oversampled_data.columns.to_list()

In [None]:
# NOTE(review): x and y come from the already-oversampled file, so SMOTE ran before
# this split. A synthetic row interpolated from a validation-fold sample can therefore
# sit in the matching training fold — validation scores may be optimistic.

def _fold_frame(fold_features, fold_labels):
    """Assemble one fold's features and labels into a frame with the dataset's column names."""
    frame = pd.concat([pd.DataFrame(fold_features), pd.DataFrame(fold_labels)], axis=1)
    frame.columns = col_names
    return frame


# Write one train/validation CSV pair per fold; files are numbered from 1
for fold, (train_index, test_index) in enumerate(skf.split(x, y), start=1):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    train = _fold_frame(X_train, y_train)
    test = _fold_frame(X_test, y_test)

    # Each frame has a fresh 0..n-1 index, which is what gets written as 'ID'
    train.to_csv(f'/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Preprocessing/Train/trial_{fold}_train.csv',index_label='ID')
    test.to_csv(f'/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Preprocessing/Validation/trial_{fold}_test.csv',index_label='ID')


TRAIN: [  600   601   602 ... 20997 20998 20999] TEST: [    0     1     2 ... 12956 12957 12958]
TRAIN: [    0     1     2 ... 20997 20998 20999] TEST: [  600   601   602 ... 13556 13557 13558]
TRAIN: [    0     1     2 ... 20997 20998 20999] TEST: [ 2675  2676  2677 ... 18906 18907 18908]
TRAIN: [    0     1     2 ... 20997 20998 20999] TEST: [ 4579  4580  4581 ... 20397 20398 20399]
TRAIN: [    0     1     2 ... 20397 20398 20399] TEST: [ 5179  5180  5181 ... 20997 20998 20999]
