In [1]:
from detection.annotate_code import parse_code, annotate_code, get_numerical_data_from_code

In [2]:
# Small Java fixture containing a block doc comment, line comments, a string
# literal, a constructor and a method.
# The offsets reported by parse_code index into this exact string, so its
# whitespace (including the leading newline and 4-space indent) must stay as is.
code_str= """
    /**
     * This is a doc comment
     */
    public class Foo {
      // A regular comment
      private String name = "TreeSitter";

      /** constructor doc */
      public Foo(String name) {
        this.name = name;  // inline comment
      }

      public int add(int a, int b) {
        return a + b;
      }
    }
"""

# Show what the parser extracts from the fixture; per the recorded output each
# entry is a (label, start offset, end offset, text) tuple.
parse_code(code_str)

[('comment.block', 5, 45, '/**\n     * This is a doc comment\n     */'),
 ('comment.block', 145, 167, '/** constructor doc */'),
 ('class.name', 63, 66, 'Foo'),
 ('comment.line', 75, 95, '// A regular comment'),
 ('comment.line', 227, 244, '// inline comment'),
 ('variable.name', 117, 121, 'name'),
 ('string.content', 125, 135, 'TreeSitter'),
 ('method.name', 271, 274, 'add')]

In [3]:
# Annotate the small fixture and list every position next to its annotation.
# Per the recorded output, entries are either a language code (e.g. en) or -1.
annotations = annotate_code(code_str)

for index, annotation in enumerate(annotations):
    print(f"Index: {index}, Annotation: {annotation}")


Index: 0, Annotation: -1
Index: 1, Annotation: -1
Index: 2, Annotation: -1
Index: 3, Annotation: -1
Index: 4, Annotation: -1
Index: 5, Annotation: -1
Index: 6, Annotation: -1
Index: 7, Annotation: -1
Index: 8, Annotation: -1
Index: 9, Annotation: -1
Index: 10, Annotation: -1
Index: 11, Annotation: -1
Index: 12, Annotation: -1
Index: 13, Annotation: -1
Index: 14, Annotation: -1
Index: 15, Annotation: -1
Index: 16, Annotation: en
Index: 17, Annotation: en
Index: 18, Annotation: en
Index: 19, Annotation: en
Index: 20, Annotation: -1
Index: 21, Annotation: en
Index: 22, Annotation: en
Index: 23, Annotation: -1
Index: 24, Annotation: en
Index: 25, Annotation: -1
Index: 26, Annotation: en
Index: 27, Annotation: en
Index: 28, Annotation: en
Index: 29, Annotation: -1
Index: 30, Annotation: en
Index: 31, Annotation: en
Index: 32, Annotation: en
Index: 33, Annotation: en
Index: 34, Annotation: en
Index: 35, Annotation: en
Index: 36, Annotation: en
Index: 37, Annotation: -1
Index: 38, Annotation:

In [4]:
# Summarise the annotations of the small fixture; the recorded result groups
# languages, max lengths and frequencies by identifiers, comments and strings.
get_numerical_data_from_code(
    code_str,
    annotate_code(code_str),
)

{'lang_identifiers': [],
 'lang_max_identifiers': [],
 'lang_freq_identifiers': [],
 'lang_comments': ['en'],
 'lang_max_comments': [40],
 'lang_freq_comments': [4],
 'lang_strings': [],
 'lang_max_strings': [],
 'lang_freq_strings': []}

In [4]:
# Sanity check: total number of characters in the small fixture, for comparison
# with the offsets reported by parse_code.
len(code_str)

327

In [5]:
# Spot-check one character inside the span parse_code reported for the
# constructor doc comment (145-167).
code_str[163]

'c'

In [6]:
# Multilingual Java fixture: comments in French, English and Spanish, plus
# identifiers and string literals in mixed languages, including non-ASCII text.
# NOTE: the string is not raw, so \" below is just an escaped double quote and
# the Java text contains plain quotes there (see the parse_code output).
# The exact bytes matter: the recorded offsets index into this string.
first_code = """
    /*
    * Classe de test multilingue
    * Cette classe démontre:
    *   – identifiants en anglais (userName, getUserAge)
    *   – identifiants en français (étatCompte)
    *   – littéraux de chaîne en espagnol (\"Hola Mundo\")
    */

    public class CompteBancaire {
        private String userName;      // English identifier
        private double étatCompte;    // French identifier (« étatCompte »)

        public CompteBancaire(String initialUser, double soldeInitial) {
            this.userName = initialUser;
            this.étatCompte = soldeInitial;
        }

        // Método en español para depositar dinero
        public void depositar(double monto) {
            étatCompte += monto;
            System.out.println("Ha depositado: " + monto + " EUR");
        }

        // Method in English to withdraw
        public boolean withdraw(double amount) {
            if (amount <= étatCompte) {
                étatCompte -= amount;
                return true;
            }
            return false;
        }

        // Récupère l’âge de l’utilisateur
        public int getUserAge() {
            // on suppose un âge fixe pour l’exemple
            int edad = 30;  // mezcla de español
            return edad;
        }
    }
"""

In [7]:
# Show the elements extracted from the multilingual fixture, with their offsets.
parse_code(first_code)

[('comment.block',
  5,
  238,
  '/*\n    * Classe de test multilingue\n    * Cette classe démontre:\n    *   – identifiants en anglais (userName, getUserAge)\n    *   – identifiants en français (étatCompte)\n    *   – littéraux de chaîne en espagnol ("Hola Mundo")\n    */'),
 ('class.name', 257, 271, 'CompteBancaire'),
 ('variable.name', 297, 305, 'userName'),
 ('variable.name', 357, 367, 'étatCompte'),
 ('variable.name', 1183, 1187, 'edad'),
 ('comment.line', 312, 333, '// English identifier'),
 ('comment.line', 797, 829, '// Method in English to withdraw'),
 ('comment.line', 1195, 1215, '// mezcla de español'),
 ('comment.line', 1126, 1166, '// on suppose un âge fixe pour l’exemple'),
 ('comment.line', 1045, 1079, '// Récupère l’âge de l’utilisateur'),
 ('comment.line', 588, 630, '// Método en español para depositar dinero'),
 ('comment.line', 372, 409, '// French identifier (« étatCompte »)'),
 ('method.name', 651, 660, 'depositar'),
 ('method.name', 1099, 1109, 'getUserAge'),
 ('m

In [8]:
# Spot-check the text around the end of the class name reported at 257-271.
# NOTE(review): presumably verifies that the reported offsets line up with
# Python string indices despite the non-ASCII characters earlier in the
# fixture - confirm.
first_code[264:278]

'ancaire {\n    '

In [8]:
# Annotate the multilingual fixture and list every position next to its
# annotation (a language code such as fr, or -1, per the recorded output).
annotations = annotate_code(first_code)

for index, annotation in enumerate(annotations):
    print(f"Index: {index}, Annotation: {annotation}")

Index: 0, Annotation: -1
Index: 1, Annotation: -1
Index: 2, Annotation: -1
Index: 3, Annotation: -1
Index: 4, Annotation: -1
Index: 5, Annotation: -1
Index: 6, Annotation: -1
Index: 7, Annotation: -1
Index: 8, Annotation: -1
Index: 9, Annotation: -1
Index: 10, Annotation: -1
Index: 11, Annotation: -1
Index: 12, Annotation: -1
Index: 13, Annotation: -1
Index: 14, Annotation: fr
Index: 15, Annotation: fr
Index: 16, Annotation: fr
Index: 17, Annotation: fr
Index: 18, Annotation: fr
Index: 19, Annotation: fr
Index: 20, Annotation: -1
Index: 21, Annotation: fr
Index: 22, Annotation: fr
Index: 23, Annotation: -1
Index: 24, Annotation: fr
Index: 25, Annotation: fr
Index: 26, Annotation: fr
Index: 27, Annotation: fr
Index: 28, Annotation: -1
Index: 29, Annotation: fr
Index: 30, Annotation: fr
Index: 31, Annotation: fr
Index: 32, Annotation: fr
Index: 33, Annotation: fr
Index: 34, Annotation: fr
Index: 35, Annotation: fr
Index: 36, Annotation: fr
Index: 37, Annotation: fr
Index: 38, Annotation:

In [10]:
# Sanity check: total number of characters in the multilingual fixture, for
# comparison with the offsets reported by parse_code.
len(first_code)

1257

In [9]:
# Summarise the annotations of the multilingual fixture; the recorded result
# lists several languages each for identifiers and comments.
get_numerical_data_from_code(
    first_code,
    annotate_code(first_code),
)

{'lang_identifiers': ['en', 'fr'],
 'lang_max_identifiers': [10, 14],
 'lang_freq_identifiers': [3, 1],
 'lang_comments': ['fr', 'en', 'es'],
 'lang_max_comments': [233, 32, 42],
 'lang_freq_comments': [4, 2, 2],
 'lang_strings': ['en'],
 'lang_max_strings': [15],
 'lang_freq_strings': [1]}

In [10]:
# Java test class embedded on a single physical line using \n escapes.
# The string is not raw, so every \n becomes a real newline (one character) in
# the value; the offsets recorded below index into that resulting text.
new_code_str = """
    package run.halo.app;\n\nimport static org.assertj.core.api.Assertions.assertThat;\n\nimport java.net.URI;\nimport org.junit.jupiter.api.Test;\nimport org.springframework.web.bind.annotation.RequestMapping;\nimport org.springframework.web.bind.annotation.RestController;\nimport org.springframework.web.method.HandlerTypePredicate;\n\n/**\n * Test case for api path prefix predicate.\n *\n * @author guqing\n * @date 2022-04-13\n */\npublic class PathPrefixPredicateTest {\n\n    @Test\n    public void prefixPredicate() {\n        boolean falseResult = HandlerTypePredicate.forAnnotation(RestController.class)\n            .and(HandlerTypePredicate.forBasePackage(Application.class.getPackageName()))\n            .test(getClass());\n        assertThat(falseResult).isFalse();\n\n        boolean result = HandlerTypePredicate.forAnnotation(RestController.class)\n            .and(HandlerTypePredicate.forBasePackage(Application.class.getPackageName()))\n            .test(TestController.class);\n        assertThat(result).isTrue();\n    }\n\n    @RestController("controller-for-test")\n    @RequestMapping("/test-prefix")\n    class TestController {\n\n    }\n\n\n    @Test\n    void urlTest() {\n        URI uri = URI.create("https:///path");\n        System.out.println(uri);\n    }\n}\n
"""

In [11]:
# Show the elements extracted from the escaped-newline fixture, with their offsets.
parse_code(new_code_str)

[('comment.block',
  330,
  422,
  '/**\n * Test case for api path prefix predicate.\n *\n * @author guqing\n * @date 2022-04-13\n */'),
 ('class.name', 436, 459, 'PathPrefixPredicateTest'),
 ('class.name', 1107, 1121, 'TestController'),
 ('method.name', 489, 504, 'prefixPredicate'),
 ('method.name', 1152, 1159, 'urlTest'),
 ('variable.name', 525, 536, 'falseResult'),
 ('variable.name', 1176, 1179, 'uri'),
 ('variable.name', 777, 783, 'result'),
 ('string.content', 1039, 1058, 'controller-for-test'),
 ('string.content', 1194, 1207, 'https:///path'),
 ('string.content', 1082, 1094, '/test-prefix')]

In [12]:
# Rebinds `annotations` (also assigned by earlier cells) to the result for
# new_code_str; the slice inspected in the next cell relies on this assignment.
annotations = annotate_code(new_code_str)

In [8]:
# Inspect only the annotations covering the block comment, whose span parse_code
# reported as 330-422, instead of dumping the whole list.
annotations[330:422]

['-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 'en',
 'en',
 'en',
 'en',
 '-1',
 'en',
 'en',
 'en',
 'en',
 '-1',
 'en',
 'en',
 'en',
 '-1',
 'en',
 'en',
 'en',
 '-1',
 'en',
 'en',
 'en',
 'en',
 '-1',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 '-1',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 '-1',
 'en',
 'en',
 'en',
 'en',
 'en',
 'en',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 'en',
 'en',
 'en',
 'en',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1']

In [13]:
# Summarise the annotations of the escaped-newline fixture by identifiers,
# comments and strings.
get_numerical_data_from_code(
    new_code_str,
    annotate_code(new_code_str),
)

{'lang_identifiers': ['en'],
 'lang_max_identifiers': [14],
 'lang_freq_identifiers': [1],
 'lang_comments': ['en'],
 'lang_max_comments': [92],
 'lang_freq_comments': [1],
 'lang_strings': ['en'],
 'lang_max_strings': [13],
 'lang_freq_strings': [1]}