In [1]:
import pandas as pd
import pickle

In [2]:
# Load the labelled samples; the path is relative to the notebook's working directory.
df = pd.read_csv('../Dataset/DataClass.csv')

In [3]:
# Preview five randomly chosen rows. No random_state is passed, so the rows
# shown change on every run.
df.sample(5)

Unnamed: 0,Code,Smell
833,public class Grade {\r\n private int gradeI...,0
130,public class User {\r\n private String user...,0
1000,public class Ticket {\n private int ticketI...,1
588,public class Book {\n private int bookId;\n...,1
547,public class Promotion {\r\n private int pr...,0


In [4]:
from sklearn.preprocessing import LabelEncoder
# Encoder applied to the 'Smell' target column in the next cell.
encoder = LabelEncoder()


In [5]:
# Overwrite 'Smell' with its integer-encoded labels (the original values are
# replaced in the same frame), then preview the first rows.
df['Smell'] = encoder.fit_transform(df['Smell'])
df.head()

Unnamed: 0,Code,Smell
0,public class Person {\n private String name...,1
1,public class Person {\r\n private String na...,0
2,public class Person {\n private String name...,1
3,public class Person {\r\n private String na...,0
4,public class Address {\n private String str...,1


In [6]:
# Count rows per 'Smell' label to check how balanced the two classes are.
df['Smell'].value_counts()

Smell
0    524
1    522
Name: count, dtype: int64

In [7]:
!pip install javalang

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com



[notice] A new release of pip is available: 23.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import javalang
def parse_code_to_ast(code):
    """Parse Java source text into a javalang AST, returning None on failure.

    The member-declaration parser is tried first, so every input that was
    accepted before this fallback existed still yields the same tree. If it
    rejects the input (for example because the text starts with ``package``
    or ``import`` statements, which are not member declarations), the text is
    parsed again as a full compilation unit.

    Args:
        code: Java source text. Non-string values (such as the NaN pandas
            uses for a missing cell) are treated as unparsable.

    Returns:
        The parsed javalang node, or None when the input is not a string or
        neither parser accepts it.
    """
    # Missing CSV cells arrive as float NaN; the parsers expect text.
    if not isinstance(code, str):
        return None

    recoverable = (javalang.parser.JavaSyntaxError, javalang.tokenizer.LexerError)
    # Order matters: the member parser goes first to keep existing results stable.
    for parse_source in (javalang.parse.parse_member_signature, javalang.parse.parse):
        try:
            return parse_source(code)
        except recoverable:
            continue
    return None

In [9]:
def is_getter_or_setter(method_node):
    """Heuristically decide whether a method looks like an accessor or mutator.

    A method qualifies when its name begins with ``get``, ``set`` or ``is``
    and it declares at most one parameter. Only the name prefix is checked,
    so names such as ``settle`` also match.

    Args:
        method_node: Object exposing ``name`` (str) and ``parameters`` (sized).

    Returns:
        True when both conditions hold, otherwise False.
    """
    accessor_prefixes = ('get', 'set', 'is')
    if not method_node.name.startswith(accessor_prefixes):
        return False
    return len(method_node.parameters) <= 1

In [10]:
def is_data_class(class_node):
    """Count a class's methods and how many of them look like accessors.

    Despite the name, this returns counts rather than a boolean verdict.
    Only the direct entries of ``class_node.body`` are inspected.

    Args:
        class_node: Node whose ``body`` is an iterable of member nodes.

    Returns:
        A ``(method_count, getter_setter_count)`` tuple.
    """
    declared_methods = [
        member
        for member in class_node.body
        if isinstance(member, javalang.tree.MethodDeclaration)
    ]
    accessor_total = sum(
        1 for method in declared_methods if is_getter_or_setter(method)
    )
    return len(declared_methods), accessor_total


In [11]:
def find_data_classes(code):
    """Extract ``(method_count, getter_setter_count)`` features from Java source.

    Args:
        code: Java source text handed to ``parse_code_to_ast``.

    Returns:
        ``(0, 0)`` when the source cannot be parsed or contains no class
        declaration; otherwise the counts reported by ``is_data_class``.
    """
    # Start from zero so source without any class declaration returns (0, 0)
    # instead of failing on unassigned locals after the loop.
    method_count, getter_setter_count = 0, 0

    tree = parse_code_to_ast(code)
    if tree is None:
        return method_count, getter_setter_count

    # Each class overwrites the previous result, so when several classes are
    # present the last one visited determines the returned counts.
    # NOTE(review): confirm whether the outermost class should win instead.
    for _path, class_node in tree.filter(javalang.tree.ClassDeclaration):
        method_count, getter_setter_count = is_data_class(class_node)

    return method_count, getter_setter_count

In [12]:
# find_data_classes returns a (method_count, getter_setter_count) pair per row;
# zip(*...) transposes those pairs into the two feature columns.
df['method'], df['getSet'] = zip(*df['Code'].apply(find_data_classes))

In [13]:
# Print both extracted feature columns, one Series after the other.
print(df["method"],df["getSet"])

0       4
1       5
2       4
3       5
4       8
       ..
1041    7
1042    6
1043    7
1044    6
1045    7
Name: method, Length: 1046, dtype: int64 0       4
1       4
2       4
3       4
4       8
       ..
1041    6
1042    6
1043    6
1044    6
1045    6
Name: getSet, Length: 1046, dtype: int64


In [14]:
# Inspect the first ten rows with the new feature columns attached.
df.head(10)

Unnamed: 0,Code,Smell,method,getSet
0,public class Person {\n private String name...,1,4,4
1,public class Person {\r\n private String na...,0,5,4
2,public class Person {\n private String name...,1,4,4
3,public class Person {\r\n private String na...,0,5,4
4,public class Address {\n private String str...,1,8,8
5,public class Address {\r\n private String s...,0,9,8
6,public class Order {\n private String order...,1,12,12
7,public class Order {\r\n private String ord...,0,13,12
8,public class Rectangle {\n private double w...,0,5,4
9,public class Rectangle {\r\n private double...,0,6,4


In [15]:
# Feature matrix: the two extracted counts.
X = df[["method", "getSet"]]
# Target: the encoded 'Smell' labels as a plain array.
y = df['Smell'].values

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state=2 makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
# Fit a multinomial naive Bayes classifier on the two count features.
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
# Predict on the held-out split, then report the raw predictions followed by
# accuracy, confusion matrix and precision against y_test.
y_pred1 = mnb.predict(X_test)
print(y_pred1)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

[0 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 0 1 1 0 1 0 1
 1 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0
 0 1 0 1 0 1 1 0 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 1 1 1
 0 1 1 1 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 0 1 1 0 0 1 1
 0 1 0 1 1 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1
 0 1 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 1 0 0 0]
0.9904761904761905
[[105   2]
 [  0 103]]
0.9809523809523809


In [21]:
# Hand-written sample for a quick check of the feature extractor: one getter,
# one setter and one ordinary method.
java_code = """
public class ServerConfig {
    private int port = 8080;
    public int getPort() {
        return port;
    }
    public void setPort(int port) {
        this.port = port;
    }
     public void printNotDataClass() {
        System.out.println("This is not a data class.");
    }
}
"""
# The value printed here is the (method_count, getter_setter_count) tuple from
# find_data_classes, not a list of classes; the trained model is not used in this cell.
data_classes = find_data_classes(java_code)
print("Data Classes found:", data_classes)

Data Classes found: (3, 2)


In [22]:
# Persist the fitted classifier with pickle. open(..., 'wb') does not create
# missing directories, so '../Pkl File' must already exist.
with open('../Pkl File/data_class.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)