<a href="https://colab.research.google.com/github/Yassmina-Abdo/dry-beans-classification/blob/main/Notebooks/trial2_gmm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only: mount Google Drive so the dataset paths under /content/drive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from xgboost import XGBClassifier
from sklearn.mixture import GaussianMixture
import pickle
from sklearn import svm

# Read Data

In [None]:
def read_data(path):
  """Load a CSV indexed by its ``ID`` column and integer-encode the ``y`` labels.

  The frame's shape is printed as a quick sanity check. Bean class names found
  in ``y`` are replaced by the codes 1-7 listed below.

  Parameters
  ----------
  path : str or file-like
      Anything ``pandas.read_csv`` accepts.

  Returns
  -------
  pandas.DataFrame
      The loaded frame with ``ID`` as index and encoded labels.
  """
  label_codes = {
      'BARBUNYA': 1,
      'BOMBAY': 2,
      'CALI': 3,
      'DERMASON': 4,
      'HOROZ': 5,
      'SEKER': 6,
      'SIRA': 7,
  }
  frame = pd.read_csv(path, index_col='ID')
  print(frame.shape)
  # Manual target encoding (relevant for the train file).
  return frame.replace({'y': label_codes})

# Preprocessing

>## Upsampling

In [None]:
def apply_Upsmote(train_dfs, n_samples=3000, random_state=42):
  """Oversample every class with SMOTE so that each has at least ``n_samples`` rows.

  Parameters
  ----------
  train_dfs : pandas.DataFrame
      A single frame (despite the plural name) whose last column holds the
      encoded class label and whose other columns are numeric features.
  n_samples : int, default 3000
      Target number of rows per class. A class that already has more rows keeps
      its current size, since SMOTE can only add samples.
  random_state : int or None, default 42
      Seed forwarded to SMOTE so the synthetic samples are reproducible.

  Returns
  -------
  pandas.DataFrame
      Oversampled data with the same column names as ``train_dfs``, a fresh
      0..n-1 index and an integer label column.
  """
  feature_cols = list(train_dfs.columns[:-1])
  label_col = train_dfs.columns[-1]

  # Work on the argument itself; relying on a notebook-global `df` makes the
  # result depend on which cells happened to run before this one.
  data = train_dfs.values
  x, y = data[:, :-1], data[:, -1]

  # Build the per-class targets from the labels actually present.
  labels, counts = np.unique(y, return_counts=True)
  strategy = {label: max(n_samples, int(count)) for label, count in zip(labels, counts)}
  oversample = SMOTE(sampling_strategy=strategy, random_state=random_state)
  features, target = oversample.fit_resample(x, y)

  # Convert back to a DataFrame. Only the label is cast to int: the features
  # keep the interpolated values SMOTE produced (column 0 is a feature, not an ID).
  oversampled_data = pd.DataFrame(features, columns=feature_cols)
  oversampled_data[label_col] = np.asarray(target).astype(int)

  return oversampled_data

In [None]:
def get_numofsamples(df):
  """Print, for each distinct value in ``df.y``, how many rows carry it.

  Classes are reported in order of first appearance.
  """
  target = df.y
  for cls in target.unique():
    n_rows = int((target == cls).sum())
    print('Number of samples in class {} = {}'.format(cls, n_rows))


>## Cross Validation

In [None]:
def apply_stratifiedKFold(main_data, n_splits=5):
  """Split ``main_data`` into stratified train/validation folds.

  Parameters
  ----------
  main_data : pandas.DataFrame
      Frame whose last column is the class label used for stratification.
  n_splits : int, default 5
      Number of folds.

  Returns
  -------
  (list of pandas.DataFrame, list of pandas.DataFrame)
      ``train_dfs`` and ``valid_dfs``, one pair per fold. Every frame keeps the
      column names of ``main_data`` and gets a fresh 0..n-1 index.
  """
  data = main_data.values
  x, y = data[:, :-1], data[:, -1]
  col_names = main_data.columns.to_list()

  skf = StratifiedKFold(n_splits=n_splits)
  train_dfs, valid_dfs = [], []
  for train_index, valid_index in skf.split(x, y):
    # Rebuild each fold from the value array so features and label stay in
    # their original column order under the original names.
    train_dfs.append(pd.DataFrame(data[train_index], columns=col_names))
    valid_dfs.append(pd.DataFrame(data[valid_index], columns=col_names))

  return train_dfs, valid_dfs



>## Scaling

In [None]:
def apply_MinMaxScaler(X):
  """Return a ``MinMaxScaler`` fitted on ``X``; ``X`` itself is not transformed."""
  return MinMaxScaler().fit(X)


>## Features Extraction

In [None]:
def apply_pca(scaled_X,n_components):
  """Return a ``PCA`` with ``n_components`` components fitted on ``scaled_X``.

  The caller applies ``transform`` itself.
  """
  reducer = PCA(n_components=n_components)
  return reducer.fit(scaled_X)


# Clustering

In [None]:
def apply_gmm(pca_X, n_components=7, random_state=42):
  """Fit a Gaussian mixture on ``pca_X`` and append its membership probabilities.

  Parameters
  ----------
  pca_X : array-like of shape (n_samples, n_features)
      Features to cluster (the PCA output in this notebook).
  n_components : int, default 7
      Number of mixture components.
  random_state : int or None, default 42
      Seed for the mixture initialisation, so repeated runs give the same
      clusters (matches the fixed seed used by ``apply_XGBoost``).

  Returns
  -------
  (GaussianMixture, pandas.DataFrame)
      The fitted mixture and a frame holding the input features followed by one
      probability column per component.
  """
  gmm = GaussianMixture(n_components=n_components, random_state=random_state)
  gmm.fit(pca_X)
  probability = gmm.predict_proba(pca_X)
  # Stack into a single array first so the frame gets unique column labels;
  # concatenating two default-labelled frames would repeat 0..k-1 twice.
  gmm_x = pd.DataFrame(np.hstack([np.asarray(pca_X), probability]))
  return gmm, gmm_x

# Model

In [None]:
def apply_model(modelname,xtrain,ytrain,xvalid,yvalid):
  """Train and evaluate the classifier selected by ``modelname``.

  Parameters
  ----------
  modelname : str
      ``'XGBoost'`` or ``'SVM'``.
  xtrain, ytrain : training features and labels.
  xvalid, yvalid : validation features and labels.

  Returns
  -------
  tuple
      ``(score_train, f1score_val, model)`` as returned by the selected
      ``apply_*`` function.

  Raises
  ------
  ValueError
      If ``modelname`` is not one of the supported names.
  """
  if modelname == 'XGBoost':
    return apply_XGBoost(xtrain, ytrain, xvalid, yvalid)
  elif modelname == 'SVM':
    return apply_svm(xtrain, ytrain, xvalid, yvalid)

  # Fail loudly instead of hitting an unbound local at the return statement.
  raise ValueError(f"Unknown modelname {modelname!r}; expected 'XGBoost' or 'SVM'")

In [None]:
def apply_XGBoost(xtrain,ytrain,xvalid,yvalid):
  """Fit an XGBoost classifier and report its train score and validation micro-F1.

  Returns
  -------
  tuple
      ``(score_train, f1score_val, fitted XGBClassifier)``.
  """
  # NOTE(review): recent xgboost releases expect class labels 0..n-1; confirm
  # the installed version accepts the 1..7 encoding used in this notebook.
  params = dict(
      learning_rate=0.3,
      random_state=42,
      objective='multi:softmax',
      max_depth=6,
      reg_alpha=0.08,
      gamma=0.1,
      verbosity=0,
  )
  classifier = XGBClassifier(**params)
  classifier.fit(xtrain, ytrain)

  score_train = classifier.score(xtrain, ytrain)
  val_predictions = classifier.predict(xvalid)
  f1score_val = f1_score(yvalid, val_predictions, average="micro")

  print(f'Train score: {score_train} \t  valiation F1 score : {f1score_val}')
  return score_train, f1score_val, classifier



In [None]:
def apply_svm(xtrain,ytrain,xvalid,yvalid):
  """Fit an RBF-kernel SVC (C=100, gamma=1) and report train score and validation micro-F1.

  Returns
  -------
  tuple
      ``(score_train, f1score_val, fitted SVC)``.
  """
  classifier = svm.SVC(kernel='rbf', C=100, gamma=1)
  classifier.fit(xtrain, ytrain)

  score_train = classifier.score(xtrain, ytrain)
  val_predictions = classifier.predict(xvalid)
  f1score_val = f1_score(yvalid, val_predictions, average="micro")

  print(f'Train score: {score_train} \t  valiation F1 score : {f1score_val}')
  return score_train, f1score_val, classifier


# Run

In [None]:
class _GmmAugmentedModel:
  """Classifier bundled with the GMM whose probabilities it was trained on.

  ``run_experiment`` trains its classifier on PCA components concatenated with
  GMM membership probabilities, so the bare classifier cannot be applied to PCA
  components alone. This wrapper keeps the fitted GMM next to the classifier so
  that ``predict`` accepts the output of ``pca.transform`` directly.

  Pickling stores a reference to this class, so it must be defined in the
  session that loads the pickle.
  """

  def __init__(self, gmm, model):
    self.gmm = gmm      # fitted GaussianMixture
    self.model = model  # classifier fitted on [PCA components | GMM probabilities]

  def predict(self, pca_X):
    """Append GMM probabilities to ``pca_X`` and return the classifier's predictions."""
    pca_X = np.asarray(pca_X)
    features = np.hstack([pca_X, self.gmm.predict_proba(pca_X)])
    return self.model.predict(features)


def run_experiment(path,modelname):
  """Cross-validate the scale -> PCA -> GMM -> classifier pipeline on one CSV.

  For every stratified fold the training part is upsampled, a scaler, PCA and
  GMM are fitted on it, and the classifier named by ``modelname`` is trained on
  the PCA components concatenated with the GMM membership probabilities.

  Parameters
  ----------
  path : str
      CSV location handed to ``read_data``.
  modelname : str
      Classifier key handed to ``apply_model`` (``'XGBoost'`` or ``'SVM'``).

  Returns
  -------
  tuple
      ``(mean train score, mean validation F1, scaler, pca, model)``. The last
      three come from the final fold. ``model`` wraps the fitted classifier
      together with that fold's GMM, so ``model.predict`` takes the output of
      ``pca.transform``.
  """
  n_pca_components = 7

  # 1. Read data
  main_data = read_data(path=path)

  # 2. Build the folds on the real samples first. Upsampling before the split
  #    would put synthetic points interpolated from validation rows into the
  #    training part (and vice versa), inflating the validation score.
  train_dfs, valid_dfs = apply_stratifiedKFold(main_data)

  # 3. Per fold: upsample -> scale -> PCA -> GMM -> classifier
  train_scores, valid_scores = [], []
  for train_df, valid_df in zip(train_dfs, valid_dfs):
    train_df = apply_Upsmote(train_df)  # training part only

    train_features = train_df.iloc[:, :-1]
    scaler = apply_MinMaxScaler(train_features)
    scaled_train = scaler.transform(train_features)
    pca = apply_pca(scaled_train, n_components=n_pca_components)
    pca_train = pca.transform(scaled_train)

    gmm, gmm_train = apply_gmm(pca_train)

    # Same fitted transforms applied to the validation part.
    pca_valid = pca.transform(scaler.transform(valid_df.iloc[:, :-1]))
    gmm_valid = np.hstack([pca_valid, gmm.predict_proba(pca_valid)])

    # Plain arrays on both sides keep train/validation features positionally
    # aligned regardless of column labels; labels are cast to int so that
    # predictions come back as integer class codes.
    xtrain, ytrain = np.asarray(gmm_train), train_df.iloc[:, -1].astype(int)
    xvalid, yvalid = gmm_valid, valid_df.iloc[:, -1].astype(int)

    score_train, f1score_val, fitted = apply_model(modelname, xtrain, ytrain, xvalid, yvalid)
    train_scores.append(score_train)
    valid_scores.append(f1score_val)

    # Bundle the GMM with the classifier so callers can predict from PCA features.
    model = _GmmAugmentedModel(gmm, fitted)

  return np.mean(train_scores), np.mean(valid_scores), scaler, pca, model

In [None]:
# Run the full cross-validated experiment on the training CSV with the SVM classifier.
# scaler, pca and model are kept at module level because the save and Test cells below reuse them.
path ='/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Original/train.csv'
avg_acc_train,avg_acc_valid,scaler,pca,model= run_experiment(path,modelname='SVM')
# The "Valid" figure is the mean of the per-fold validation F1 scores returned by run_experiment.
print(f'=========\nTotal Avg Acc Of Train = {avg_acc_train}  Total Avg Acc Of Valid = {avg_acc_valid}')

In [None]:
# Save the model returned by run_experiment to model.pkl (relative to the current working directory)
# so the Test section can reload it.
with open('model.pkl', 'wb') as file:
        pickle.dump(model, file)

# Test

In [None]:
# Build the submission: apply the scaler and PCA kept from the Run cell to the test CSV,
# predict with the pickled model, and write ID/label pairs to submission.csv.
path ='/content/drive/MyDrive/Dry Beans Classification Competition/Dataset/Original/test.csv'
# Read without index_col: column 0 is used as the ID below, the remaining columns as features.
df= pd.read_csv(path)
scaled_df= scaler.transform(df.iloc[:,1:])
ft_selected=pca.transform(scaled_df)

# Load pretrained model
# (model.pkl is the file written by the save cell above; only unpickle files you produced yourself.)
pkl_filename='model.pkl'
with open(pkl_filename, 'rb') as file:
    pretrained_model = pickle.load(file)

# NOTE(review): run_experiment fits its classifier on PCA components plus GMM probabilities,
# while only the PCA components are passed here - confirm the loaded model accepts this input.
prediction= pretrained_model.predict(ft_selected)
pred_df= pd.DataFrame(prediction)
submit_df =pd.concat([df.iloc[:,0],pred_df],axis=1)
submit_df.columns=['ID','y']
# Decode the integer predictions back to class names (inverse of the mapping in read_data).
submit_df.replace({'y':{1:'BARBUNYA',2:'BOMBAY',3:'CALI',4:'DERMASON',5:'HOROZ',6:'SEKER',7:'SIRA'}},inplace=True)
submit_df.to_csv('submission.csv',index= False)
submit_df

Unnamed: 0,ID,y
0,10834,HOROZ
1,10835,DERMASON
2,10836,BARBUNYA
3,10837,DERMASON
4,10838,BOMBAY
...,...,...
2704,13538,CALI
2705,13539,SEKER
2706,13540,HOROZ
2707,13541,DERMASON
