<a href="https://colab.research.google.com/github/anushka-code/Code-Smell-Classification/blob/main/Model2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multimodal Deep Learning: Merging CNN & BiLSTM for Numerical and Textual Features

## Code Smells Targeted:


1.   Long Parameters List
2.   Switch Statements


###Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow import keras
from keras.models import Functional, Model
from keras.layers import Input, Convolution1D, MaxPooling1D, Flatten, Dense, concatenate
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Bidirectional
import imblearn
from imblearn.over_sampling import SMOTE
import nltk 
from nltk.tokenize import RegexpTokenizer
import re
import gensim
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec

###Mounting Google Drive

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then pass the application-default credentials
# to PyDrive; `drive` is the client DataLoader uses to download files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

###Dataset Loader

In [3]:
def DataLoader(link, name_of_file):
  """Download a Google Drive file and load it as a pandas DataFrame.

  The Drive file id is taken to be the second-to-last "/"-separated segment
  of `link`. The file content is written to `name_of_file` on local disk and
  then parsed with `pd.read_csv`.
  """
  file_id = link.split("/")[-2]
  drive_file = drive.CreateFile({'id': file_id})
  drive_file.GetContentFile(name_of_file)
  return pd.read_csv(name_of_file)


# Shareable Google Drive links for the three CSV datasets.
link1 = 'https://drive.google.com/file/d/1EfbAqgr7i9h4yFwEoU3igG34Gt48l6WT/view?usp=sharing'
link2 = 'https://drive.google.com/file/d/1Ya1OMWsz1yyXAaZheIck-roX0M9UWiqg/view?usp=sharing'
link6 = 'https://drive.google.com/file/d/1rLkJAwHkBAAkHMp2L1Y1AzuniIZdyB7x/view?usp=sharing'

# Local file names the downloaded content is saved under.
name1 = 'long_parameters_list_structural.csv'
name2 = 'switch_statements_structural.csv'
name6 = 'semantic_final.csv'

# Download each file and read it into a DataFrame.
df_lp = DataLoader(link1, name1)
df_ss = DataLoader(link2, name2)
df_semantic = DataLoader(link6,name6)

###Data Pre-Processing of Structural Dataset


In [4]:
def PrePro(last_column, dataframe, n_features=56):
  """Split a structural dataset into a float feature frame and an int label frame.

  The column named `last_column` is renamed to 'is_code_smell' and cast to
  int. The work is done on a renamed copy, so the caller's `dataframe` is
  left untouched and the cell can be re-run safely.

  Parameters
  ----------
  last_column : str
      Name of the label column in `dataframe`.
  dataframe : pandas.DataFrame
      Raw structural dataset.
  n_features : int, default 56
      Number of leading columns to keep as features.

  Returns
  -------
  (X_part, Y_part)
      `X_part` holds the first `n_features` columns as float, with "?"
      entries turned into NaN. `Y_part` is a one-column frame holding the
      LAST column of the dataset (positional, not looked up by name).
  """
  labelled = dataframe.rename(columns={last_column: 'is_code_smell'})  # returns a new frame
  labelled['is_code_smell'] = labelled['is_code_smell'].astype(int)  # boolean labels -> int labels
  Y_part = labelled.iloc[:, -1:]
  X_part = labelled.iloc[:, :n_features]
  X_part = X_part.replace(to_replace=["?"], value=np.nan)  # "?" marks a missing value
  X_part = X_part.astype(float)
  return X_part, Y_part

# Split each structural dataset into its feature frame (X) and label frame (Y).
X_lp, Y_lp = PrePro('is_long_parameters_list',df_lp)
X_ss, Y_ss = PrePro('is_switch_statements',df_ss)

In [5]:
def MeanforNaN(dataframe):
  """Return a new frame in which every NaN is replaced by its column's mean."""
  return dataframe.fillna(dataframe.mean())

# Impute missing feature values with the per-column mean of each dataset.
X_lp = MeanforNaN(X_lp)
X_ss = MeanforNaN(X_ss)

In [6]:
def ConCat(df1,df2):
  """Stack the two code-smell frames row-wise: df1's rows first, then df2's.

  Each frame's original index labels are kept as they are.
  """
  return pd.concat([df1, df2])

# Combine both code-smell datasets: features with features, labels with labels.
X_train = ConCat(X_lp,X_ss)
Y_train = ConCat(Y_lp,Y_ss)

In [7]:
def Normalize(dataframe):
  """Apply MinMax normalisation so that each feature lies between 0 and 1.

  The scaler is fitted on `dataframe` itself and the transformed data is
  returned.
  """
  return MinMaxScaler().fit_transform(dataframe)

# Scale the structural features and extract the labels as an int64 array.
X_sample = Normalize(X_train)
# `copy` takes a boolean; the string 'True' only worked because it is truthy.
Y_sample = Y_train.to_numpy(dtype='int64', copy=True)

### Synthetic Minority Oversampling Technique (SMOTE) for Imbalanced Datasets

In [8]:
def Oversample(X_data,Y_data):
  """Use SMOTE to obtain a balanced dataset and return the resampled (X, Y).

  `Y_data` is flattened with `.ravel()` before resampling, and the sampler
  is seeded with `random_state=2`.
  """
  smote = SMOTE(random_state=2)
  X_resampled, Y_resampled = smote.fit_resample(X_data, Y_data.ravel())
  return X_resampled, Y_resampled

# Oversample the structural features and labels with SMOTE.
X_struct, Y_new = Oversample(X_sample,Y_sample)

###Text Pre Processing

In [9]:
def PrePro2(dataframe):
  """Return every column of `dataframe` except the last one."""
  return dataframe.iloc[:, :-1]

X_sem= PrePro2(df_semantic) # keep every column except the last; no Y is produced here

In [10]:
def ToList(dataframe, string):
  """Turn column `string` of `dataframe` into a list of one-element lists."""
  return [[value] for value in dataframe[string].tolist()]

# Wrap each class name and each method name in its own single-item list.
class_words = ToList(X_sem, 'class')
method_words = ToList(X_sem, 'method')

###Tokenization

In [11]:
# CamelCase word pattern, compiled once instead of on every call. The flags
# are the defaults NLTK's RegexpTokenizer applies (UNICODE | MULTILINE | DOTALL),
# so `findall` yields the same tokens as the tokenizer did.
_CAMEL_CASE_WORD = re.compile(
    '[a-zA-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))',
    re.UNICODE | re.MULTILINE | re.DOTALL,
)

def TokenizeGroups(text):
  """Split an identifier into CamelCase words and return them lower-cased.

  Parameters
  ----------
  text : str
      Identifier such as a class or method name.

  Returns
  -------
  list of str
      Lower-cased words in order of appearance. Characters the pattern does
      not match (digits, underscores, ...) are dropped. An empty string gives
      an empty list.
  """
  return [word.lower() for word in _CAMEL_CASE_WORD.findall(text)]

In [12]:
def LoopOver(list):
  """Tokenize every word group in place and return the same list object.

  Each element is expected to be a one-item list holding a string (the shape
  `ToList` produces). It is replaced by the tokens `TokenizeGroups` returns
  for that string.

  The loop covers however many elements the list has, rather than a fixed
  1146 rows, so shorter inputs no longer raise IndexError and longer inputs
  are processed in full.
  """
  # NOTE: the parameter name shadows the built-in `list`; it is kept so the
  # function's signature stays unchanged for existing callers.
  for position, group in enumerate(list):
    list[position] = TokenizeGroups(group[0])
  return list

# Tokenize the class and method names. LoopOver works in place, so each
# *_sem name refers to the same list object that was passed in.
class_words_sem = LoopOver(class_words)
method_words_sem = LoopOver(method_words)

In [13]:
def WordList(list1,list2):
  """Join the tokens of `list1[i]` and `list2[i]` into one space-separated string per row.

  Parameters
  ----------
  list1, list2 : list of list of str
      Token lists aligned by position (for example class tokens and method
      tokens).

  Returns
  -------
  list of str
      One string per row, with `list1`'s tokens followed by `list2`'s.

  Rows are paired with `zip`, so the function works for any number of rows
  instead of exactly 1146. If the inputs differ in length, the extra rows of
  the longer one are ignored.
  """
  return [' '.join(first + second) for first, second in zip(list1, list2)]

# One space-separated string per sample: class tokens followed by method tokens.
word_groups = WordList(class_words_sem, method_words_sem)

In [14]:
def MaxNumWords(groupofwords):
  """Return the largest whitespace-separated word count among the given strings.

  Returns 0 when `groupofwords` is empty.
  """
  return max((len(sentence.split()) for sentence in groupofwords), default=0)

# Longest word group, measured in words.
max_words = MaxNumWords(word_groups)

In [15]:
# Fit a Keras Tokenizer on the word groups to build the word -> index mapping.
# The +1 presumably leaves room for index 0 (used as padding below) — TODO confirm.
token = Tokenizer()
token.fit_on_texts(word_groups)
vocab_size = len(token.word_index) + 1
print(vocab_size)

1300


In [16]:
# Map every word group to its sequence of integer word indices.
encoded_text = token.texts_to_sequences(word_groups)

In [17]:
# Pad each encoded sequence at the front ('pre') to a fixed length of 28.
# NOTE(review): 28 is hard-coded, while `max_words` is computed earlier and not
# used in the visible cells — confirm the two agree.
X_seman = pad_sequences(encoded_text, maxlen = 28, padding = 'pre')
print(X_seman)

[[  0   0   0 ... 841  12 239]
 [  0   0   0 ... 511  46 165]
 [  0   0   0 ...  51 166 106]
 ...
 [  0   0   0 ...   1   1   1]
 [  0   0   0 ...   1   1   2]
 [  0   0   0 ...   2   2   2]]


###Glove Vectors using Gensim

In [18]:
import gensim.downloader as api
# Load the pre-trained GloVe vectors through gensim's downloader API.
glove_gensim  = api.load('glove-wiki-gigaword-100') #100 dimensional



In [19]:
# gensim_weight_matrix = np.zeros((vocab_size ,vector_size))

# def GloveVectorization(groupofwords, max_length, vector_size=100, gensim_matrix = ):
#   token = Tokenizer()
#   token.fit_on_texts(word_groups) 
#   vocab_size = len(token.word_index) + 1
#   encoded_text = token.texts_to_sequences(word_groups)
#   X = pad_sequences(encoded_text, maxlen = max_length, padding = 'pre') 
#   for word, index in token.word_index.items():
#     if index < vocab_size: 
#         if word in glove_gensim.wv.vocab:
#             gensim_weight_matrix[index] = glove_gensim[word]
#         else:
#             gensim_weight_matrix[index] = np.zeros(100)
  
#   return X, gensim_weight_matrix

In [20]:
vector_size = 100 
# One row per tokenizer index. The row count comes from `vocab_size` instead of
# a hard-coded 1300, so the matrix follows the fitted vocabulary.
gensim_weight_matrix = np.zeros((vocab_size, vector_size))
gensim_weight_matrix.shape

(1300, 100)

In [21]:
# Copy the GloVe vector of every vocabulary word into its row of the weight
# matrix; words GloVe does not know get an all-zero row.
for word, index in token.word_index.items():
    if index >= vocab_size:
        continue
    # Membership is tested on the vectors object directly: the `.wv.vocab`
    # path is deprecated and was removed in gensim 4.
    if word in glove_gensim:
        gensim_weight_matrix[index] = glove_gensim[word]
    else:
        gensim_weight_matrix[index] = 0.0

  This is separate from the ipykernel package so we can avoid doing imports until


### Model 2 - Multimodal of CNN & BiLSTM

In [22]:
# CNN branch for the structural features.
# The input shape is (width, depth) = (56, 1); `height` is not used in this cell.
height, width, depth = 1146, 56, 1
input_shape=(width,depth)
input_struct = Input(shape=input_shape)
# Three stacked 1-D convolutions (32, 64, 64 filters; kernel size 3; ReLU).
layer_1 = Convolution1D(filters=32, kernel_size=3, activation='relu')(input_struct)
layer_2 = Convolution1D(filters=64, kernel_size=3, activation='relu')(layer_1)
layer_3 = Convolution1D(filters=64, kernel_size=3, activation='relu')(layer_2)
# Flatten the convolution output so it can be concatenated with the text branch.
flatten_cnn = Flatten()(layer_3)

model_left = Model(input_struct, flatten_cnn)

In [31]:
# BiLSTM branch for the textual features.
EMBEDDING_DIM = 100
# The sequence length is read from the padded data and passed as a real
# 1-tuple; the original `shape=(28)` was a bare int that duplicated the
# padding length.
input_sem = Input(shape=(X_seman.shape[1],))
# Frozen embedding layer initialised from the GloVe weight matrix.
embedding_layer = Embedding(input_dim = vocab_size, output_dim = EMBEDDING_DIM, input_length= X_seman.shape[1],
                    weights = [gensim_weight_matrix],trainable = False)(input_sem)
# Three stacked bidirectional LSTMs; only the last one drops the time axis.
layer_1 = Bidirectional(LSTM(100,return_sequences=True))(embedding_layer)
layer_2 = Bidirectional(LSTM(200,return_sequences=True))(layer_1)
layer_3 = Bidirectional(LSTM(100,return_sequences=False))(layer_2)
flatten_bilstm = Flatten()(layer_3)

model_right = Model(input_sem, flatten_bilstm)

In [32]:
# Concatenate the flattened CNN and BiLSTM outputs, then narrow through dense
# layers (500 -> 100 -> 20, ReLU) to a single sigmoid output unit.
merge_layer = concatenate([flatten_cnn, flatten_bilstm])
dense_1 = Dense(500, activation='relu')(merge_layer)
dense_2 = Dense(100, activation='relu')(dense_1)
dense3 = Dense(20, activation='relu')(dense_2)
dense_final = Dense(1, activation='sigmoid')(dense3)

In [33]:
# Two-input model: structural features and padded token sequences in, one sigmoid output.
merged_model = Model([input_struct, input_sem], dense_final)

In [34]:
# Binary cross-entropy matches the single sigmoid output unit.
merged_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [35]:
# Train on both inputs, holding out 20% of the samples for validation.
# NOTE(review): X_struct and Y_new come out of SMOTE resampling, while X_seman is
# built from the un-resampled semantic frame — confirm all three have the same
# number of rows and that the rows still correspond to each other.
merged_model.fit([X_struct, X_seman], y=Y_new, batch_size=100, epochs=10, validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f169e5e8b50>