<a href="https://colab.research.google.com/github/anushka-code/Code-Smell-Classification/blob/main/Code_Smell_Classification_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Structural Branch of the NN Architecture: 1-D CNN for Structural Object-Oriented Metric Feature Extraction

## Code Smells Targeted:


1.   Long Parameters List
2.   Switch Statements







### Importing Libraries

In [35]:
import numpy as np
import pandas as pd
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Input, Convolution1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.utils import np_utils
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedKFold 
import imblearn
from imblearn.over_sampling import SMOTE

### Authenticating with Google Drive (PyDrive)

In [36]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
    
# Authenticate this Colab session, then build the PyDrive client (`drive`)
# that DataLoader uses to fetch the dataset files.
auth.authenticate_user()
gauth = GoogleAuth()
# PyDrive is given the application-default credentials.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

### Dataset Loader

In [37]:
def DataLoader(link, name_of_file):
  """Download a Drive-hosted CSV and load it into a DataFrame.

  The file ID is taken to be the second-to-last "/"-separated segment of
  `link`. The file is fetched through the module-level `drive` client,
  written locally as `name_of_file`, and then parsed with `pd.read_csv`.
  """
  file_id = link.split("/")[-2]
  remote_file = drive.CreateFile({'id': file_id})
  remote_file.GetContentFile(name_of_file)
  return pd.read_csv(name_of_file)

# Drive share links for the two datasets; DataLoader takes the file ID from
# the path segment just before "/view".
link1 = 'https://drive.google.com/file/d/1EfbAqgr7i9h4yFwEoU3igG34Gt48l6WT/view?usp=sharing'
link2 = 'https://drive.google.com/file/d/1Ya1OMWsz1yyXAaZheIck-roX0M9UWiqg/view?usp=sharing'
# Local filenames the downloaded CSVs are written to.
name1 = 'long_parameters_list_structural.csv'
name2 = 'switch_statements_structural.csv'

# df_lp: long-parameters-list data; df_ss: switch-statements data.
df_lp = DataLoader(link1, name1)
df_ss = DataLoader(link2, name2)

Long Parameters List Dataset

In [38]:
# Preview the first five rows of the long-parameters-list frame.
df_lp.head(5)

Unnamed: 0,NOP_method,CC_method,ATFD_method,FDP_method,CM_method,MAXNESTING_method,LOC_method,CYCLO_method,NMCS_method,NOLV_method,...,NOI_package,LOC_package,NOM_package,NOPK_project,NOCS_project,NOI_project,NOM_project,NOMNAMM_project,LOC_project,is_long_parameters_list
0,0,0,0,0,0,2,8,2,0,0,...,0,229,16,43,849,78,5788,4778,65687,False
1,0,0,0,0,0,1,3,1,0,0,...,0,4223,240,43,849,78,5788,4778,65687,False
2,1,0,0,0,0,1,2,1,0,1,...,0,4223,240,43,849,78,5788,4778,65687,False
3,0,5,0,0,9,0,1,1,0,0,...,11,6037,728,43,849,78,5788,4778,65687,False
4,1,0,1,1,0,1,3,1,0,1,...,11,6037,728,43,849,78,5788,4778,65687,False


Switch Statements Dataset

In [39]:
# Preview the first five rows of the switch-statements frame.
df_ss.head(5)

Unnamed: 0,NOP_method,CC_method,ATFD_method,FDP_method,CM_method,MAXNESTING_method,LOC_method,CYCLO_method,NMCS_method,NOLV_method,...,NOI_package,LOC_package,NOM_package,NOPK_project,NOCS_project,NOI_project,NOM_project,NOMNAMM_project,LOC_project,is_switch_statements
0,1,2,0,0,2,1,5,1,0,1,...,3,439,53,43,849,78,5788,4778,65687,False
1,0,0,0,0,0,2,6,3,0,0,...,0,4223,240,43,849,78,5788,4778,65687,False
2,2,0,0,0,0,1,3,1,0,2,...,0,4223,240,43,849,78,5788,4778,65687,False
3,0,0,0,0,0,2,3,1,0,0,...,0,1212,120,43,849,78,5788,4778,65687,False
4,0,0,0,0,0,1,3,1,0,0,...,0,516,29,43,849,78,5788,4778,65687,False


### Data Pre-Processing 

In [40]:
def PrePro(last_column, dataframe, n_features=56):
  """Split a raw metrics frame into float features and integer labels.

  Args:
    last_column: Name of the label column; it is renamed to 'is_code_smell'.
    dataframe: Raw frame whose last column is the label. NOTE: this frame is
      modified in place (label column renamed and cast to int).
    n_features: Number of leading columns to use as features. Defaults to 56,
      the count this notebook previously hard-coded (presumably the number of
      metric columns in these CSVs - confirm against the data).

  Returns:
    (X_part, Y_part): the first `n_features` columns as floats, with "?"
    placeholders turned into NaN, and a one-column frame holding the label.
  """
  dataframe.rename(columns = {last_column :'is_code_smell'}, inplace = True)
  dataframe['is_code_smell'] = dataframe["is_code_smell"].astype(int)
  # The label is selected by position, so it must be the last column.
  Y_part = dataframe.iloc[:,-1:]
  X_part = dataframe.iloc[:,:n_features]
  # "?" marks a missing value; it has to become NaN before the float cast.
  X_part = X_part.replace(to_replace =["?"], value = np.nan)
  X_part = X_part.astype(float)
  return X_part,Y_part

# Both label columns are renamed to 'is_code_smell', so the two datasets end
# up sharing one label name.
X_lp, Y_lp = PrePro('is_long_parameters_list',df_lp)
X_ss, Y_ss = PrePro('is_switch_statements',df_ss)

In [41]:
def MeanforNaN(dataframe):
  """Return a new frame in which each NaN is replaced by its column's mean."""
  return dataframe.fillna(dataframe.mean())

# Fill each dataset's NaNs with that dataset's own column means; this runs
# before the two frames are combined.
X_lp = MeanforNaN(X_lp)
X_ss = MeanforNaN(X_ss)

In [42]:
def ConCat(df1,df2):
  """Stack `df1` on top of `df2` with `pd.concat`, keeping each frame's index labels."""
  return pd.concat([df1, df2])

# Combine the two datasets. Features and labels are concatenated in the same
# order (long-parameters first, then switch-statements) so rows stay aligned.
X_train = ConCat(X_lp,X_ss)
Y_train = ConCat(Y_lp,Y_ss)

### Data Normalization

In [43]:
def Normalize(dataframe):
  """Fit a MinMaxScaler on `dataframe` and return that same data transformed by it."""
  return MinMaxScaler().fit(dataframe).transform(dataframe)

# Min-max scale the combined features; the scaler is fitted on this same data.
X_sample = Normalize(X_train)

### Synthetic Minority Oversampling Technique (SMOTE) for Imbalanced Datasets


In [44]:
Y_train.value_counts()  # 267 smelly (1) vs 573 non-smelly (0): roughly 1/3 vs 2/3

is_code_smell
0                573
1                267
dtype: int64

In [45]:
# Labels as a NumPy int64 array. `copy` takes a bool, not the string 'True'.
Y_sample = Y_train.to_numpy(dtype='int64', copy=True)

In [46]:
def Oversample(X_data,Y_data):
  """Resample the data with SMOTE (random_state=2).

  `Y_data` is flattened with `.ravel()` before being passed to
  `fit_resample`. Returns the resampled features and labels as a pair.
  """
  smote = SMOTE(random_state = 2)
  flat_labels = Y_data.ravel()
  X_resampled, Y_resampled = smote.fit_resample(X_data, flat_labels)
  return X_resampled, Y_resampled

# SMOTE-resample the scaled features together with their labels.
X_new, Y_new = Oversample(X_sample,Y_sample)

In [47]:
# Add a trailing channel axis: (samples, features) -> (samples, features, 1).
# The sizes are read from X_new rather than hard-coded, so this keeps working
# when the resampled row count or the number of features changes.
X_train_final = X_new.reshape((X_new.shape[0], X_new.shape[1], 1))

### Repeated K-Fold Cross-Validation for Limited Data


In [48]:
def _iter_cv_folds(dataX, dataY, n_splits=5, n_repeats=10, random_state=None):
  """Yield (X_train, X_test, Y_train, Y_test) for every RepeatedKFold split."""
  kf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
  for train_index, test_index in kf.split(dataX):
    yield (dataX[train_index], dataX[test_index],
           dataY[train_index], dataY[test_index])


def CrossValidation(dataX, dataY, n_splits=5, n_repeats=10, random_state=None):
  """Return the train/test arrays of the first RepeatedKFold split only.

  The previous version looped over the splits but returned inside the loop,
  so only the first split was ever used. That single-split result is kept
  here because callers unpack exactly four arrays, but it is now explicit.
  To evaluate on every fold, iterate over `_iter_cv_folds(...)` instead.

  Args:
    dataX: Indexable feature array (indexed with integer index arrays).
    dataY: Indexable label array aligned with `dataX`.
    n_splits: Number of folds per repetition.
    n_repeats: Number of repetitions.
    random_state: Seed passed to RepeatedKFold; None gives a different split
      on every call, so pass an int for reproducible results.

  Returns:
    (X_crosstrain, X_crosstest, Y_crosstrain, Y_crosstest) for the first split.
  """
  return next(_iter_cv_folds(dataX, dataY, n_splits, n_repeats, random_state))

# CrossValidation returns a single split (the first one RepeatedKFold yields),
# so these four arrays are one train/test partition, not a result over all folds.
X_crosstrain,X_crosstest,Y_crosstrain,Y_crosstest = CrossValidation(X_train_final, Y_new)

### 1-D CNN Implementation

In [50]:
def evaluate_model(trainX, trainY, testX, testY, epochs=30, batch_size=100, test_batch_size=50):
  """Train a small 1-D CNN binary classifier and return its test accuracy.

  Args:
    trainX, testX: 3-D feature arrays; axes 1 and 2 of `trainX` are used as
      the input shape of the first convolution layer.
    trainY, testY: Binary labels matching the sigmoid output and the
      binary-crossentropy loss.
    epochs: Number of training epochs.
    batch_size: Batch size used by `model.fit`.
    test_batch_size: Batch size used by `model.evaluate`.

  Returns:
    The accuracy value reported by `model.evaluate` on the test data.
  """
  # The input shape comes from the features. This used to read the channel
  # count from trainY.shape[2], which raises IndexError for label arrays with
  # fewer than three dimensions.
  width, depth = trainX.shape[1], trainX.shape[2]
  model = Sequential([
    Convolution1D(filters=64, kernel_size=3, activation='relu', input_shape=(width, depth)),
    Convolution1D(filters=64, kernel_size=3, activation='relu'),
    Dropout(0.5),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),  # one sigmoid unit for the binary label
  ])
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size)
  # evaluate() returns [loss, accuracy] because 'accuracy' is the only metric.
  _, accuracy = model.evaluate(testX, testY, batch_size=test_batch_size)
  return accuracy