Replace file detection with identify

asottile · Jan 21, 2018 · c5290f2 · c5290f2
1 parent 6d3c4a0
commit c5290f2
Show file tree

Hide file tree

Showing 8 changed files with 50 additions and 94 deletions.
diff --git a/git_code_debt/metrics/common.py b/git_code_debt/metrics/common.py
@@ -1,56 +1,12 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
 
-
-PYTHON = 'Python'
-YAML = 'Yaml'
-TEMPLATE = 'Template'
-CSS = 'Css'
-MAKO_TEMPLATE = 'Mako_Template'
-JAVASCRIPT = 'Javascript'
-JAVA = 'Java'
-ILLUSTRATOR = 'Illustrator'
-HTML = 'Html'
-CCPP = 'C_C++'
-TEXT = 'Text'
-SQL = 'SQL'
-
-
-# Maps a set of file extensions to a nice name.
-# Updating this will cause that file type to be tracked for LinesOfCode metric.
-FILE_TYPE_MAP = {
-    b'.py': PYTHON,
-
-    b'.yaml': YAML,
-    b'.yml': YAML,
-
-    b'.css': CSS,
-    b'.scss': CSS,
-
-    b'.tmpl': TEMPLATE,
-
-    b'.mako': MAKO_TEMPLATE,
-
-    b'.js': JAVASCRIPT,
-
-    b'.java': JAVA,
-
-    b'.ai': ILLUSTRATOR,
-
-    b'.htm': HTML,
-    b'.html': HTML,
-
-    b'.h': CCPP,
-    b'.c': CCPP,
-    b'.cpp': CCPP,
-
-    b'.md': TEXT,
-    b'.rst': TEXT,
-    b'.csv': TEXT,
-    b'.log': TEXT,
-    b'.json': TEXT,
-    b'.xml': TEXT,
-    b'.txt': TEXT,
-
-    b'.sql': SQL,
-}
+from identify import identify
+
+UNKNOWN = 'unknown'
+IGNORED_TAGS = frozenset((
+    identify.DIRECTORY, identify.SYMLINK, identify.FILE,
+    identify.EXECUTABLE, identify.NON_EXECUTABLE,
+    identify.TEXT, identify.BINARY,
+))
+ALL_TAGS = frozenset((identify.ALL_TAGS - IGNORED_TAGS) | {UNKNOWN})
diff --git a/git_code_debt/metrics/curse.py b/git_code_debt/metrics/curse.py
@@ -3,9 +3,12 @@
 
 import collections
 
+from identify import identify
+
 from git_code_debt.metric import Metric
 from git_code_debt.metrics.base import DiffParserBase
-from git_code_debt.metrics.common import FILE_TYPE_MAP
+from git_code_debt.metrics.common import ALL_TAGS
+from git_code_debt.metrics.common import UNKNOWN
 from git_code_debt.metrics.curse_words import word_list
 
 
@@ -34,20 +37,19 @@ def get_metrics_from_stat(self, _, file_diff_stats):
             total_curses = total_curses + curses_changed
 
-            # Track by file extension -> type mapping
+            # Track by each tag identify derives from the filename
-            file_type = FILE_TYPE_MAP.get(file_diff_stat.extension, 'unknown')
-            curses_by_file_type[file_type] += curses_changed
+            filename = file_diff_stat.filename.decode('UTF-8')
+            tags = identify.tags_from_filename(filename) or {UNKNOWN}
+
+            for tag in tags:
+                curses_by_file_type[tag] += curses_changed
 
-        # Yield overall metric and one per type of expected mapping types
+        # Yield the overall metric, then one metric per known tag
         yield Metric('TotalCurseWords', total_curses)
-        for file_type in set(FILE_TYPE_MAP.values()) | {'unknown'}:
-            curses_changed = curses_by_file_type.get(file_type, 0)
-            yield Metric(
-                'TotalCurseWords_{}'.format(file_type),
-                curses_changed,
-            )
+        for tag in ALL_TAGS:
+            curses_changed = curses_by_file_type[tag]
+            yield Metric('TotalCurseWords_{}'.format(tag), curses_changed)
 
     def get_possible_metric_ids(self):
         return ['TotalCurseWords'] + [
-            'TotalCurseWords_{}'.format(file_type)
-            for file_type in set(FILE_TYPE_MAP.values()) | {'unknown'}
+            'TotalCurseWords_{}'.format(tag) for tag in ALL_TAGS
         ]
diff --git a/git_code_debt/metrics/lines.py b/git_code_debt/metrics/lines.py
@@ -3,9 +3,12 @@
 
 import collections
 
+from identify import identify
+
 from git_code_debt.metric import Metric
 from git_code_debt.metrics.base import DiffParserBase
-from git_code_debt.metrics.common import FILE_TYPE_MAP
+from git_code_debt.metrics.common import ALL_TAGS
+from git_code_debt.metrics.common import UNKNOWN
 
 
 class LinesOfCodeParser(DiffParserBase):
@@ -24,21 +27,19 @@ def get_metrics_from_stat(self, _, file_diff_stats):
             # Track total overall
             total_lines += lines_changed
 
-            # Track by file extension -> type mapping
-            file_type = FILE_TYPE_MAP.get(file_diff_stat.extension, 'unknown')
-            lines_by_file_type[file_type] += lines_changed
+            filename = file_diff_stat.filename.decode('UTF-8')
+            tags = identify.tags_from_filename(filename) or {UNKNOWN}
+
+            for tag in tags:
+                lines_by_file_type[tag] += lines_changed
 
-        # Yield overall metric and one per type of expected mapping types
+        # Yield the overall metric, then one metric per known tag
         yield Metric('TotalLinesOfCode', total_lines)
-        for file_type in set(FILE_TYPE_MAP.values()) | {'unknown'}:
-            lines_changed = lines_by_file_type.get(file_type, 0)
-            yield Metric(
-                'TotalLinesOfCode_{}'.format(file_type),
-                lines_changed,
-            )
+        for tag in ALL_TAGS:
+            lines_changed = lines_by_file_type[tag]
+            yield Metric('TotalLinesOfCode_{}'.format(tag), lines_changed)
 
     def get_possible_metric_ids(self):
         return ['TotalLinesOfCode'] + [
-            'TotalLinesOfCode_{}'.format(file_type)
-            for file_type in set(FILE_TYPE_MAP.values()) | {'unknown'}
+            'TotalLinesOfCode_{}'.format(tag) for tag in ALL_TAGS
         ]
diff --git a/metric_config.yaml b/metric_config.yaml
@@ -14,13 +14,9 @@
 # NOTE: metrics and metric_expressions may be omitted
 
 Groups:
-    - Cheetah:
-        metrics: ['TotalLinesOfCode_Template']
-        metric_expressions:
-            - ^.*Cheetah.*$
     - Python:
         metric_expressions:
-            - ^.*Python.*$
+            - (?i)^.*Python.*$
     - CurseWords:
         metric_expressions:
             - ^TotalCurseWords.*$
@@ -50,8 +46,8 @@ CommitLinks:
 # These denote the metrics to show in the widget.
 WidgetMetrics:
     TotalLinesOfCode: {}
-    TotalLinesOfCode_Css: {}
-    TotalLinesOfCode_Python: {}
-    TotalLinesOfCode_Javascript: {}
-    TotalLinesOfCode_Text: {}
-    TotalLinesOfCode_Yaml: {}
+    TotalLinesOfCode_css: {}
+    TotalLinesOfCode_python: {}
+    TotalLinesOfCode_javascript: {}
+    TotalLinesOfCode_plain-text: {}
+    TotalLinesOfCode_yaml: {}
diff --git a/setup.py b/setup.py
@@ -35,6 +35,7 @@
     },
     install_requires=[
         'flask',
+        'identify',
         'jsonschema',
         'mako',
         'pyyaml',

diff --git a/tests/metrics/curse_test.py b/tests/metrics/curse_test.py
@@ -11,8 +11,8 @@ def test_curse_words_parser():
     parser = CurseWordsParser()
     input_stats = [
         FileDiffStat(
-            b'templates/foo.tmpl',
-            [b'#man seriously, fuck cheetah'],
+            b'some/file.rb',
+            [b'#man seriously, fuck ruby'],
             [],
             None,
         ),
@@ -24,5 +24,5 @@ def test_curse_words_parser():
         ),
     ]
     metrics = list(parser.get_metrics_from_stat(Commit.blank, input_stats))
-    assert Metric('TotalCurseWords_Template', 1) in metrics
-    assert Metric('TotalCurseWords_Python', 0) in metrics
+    assert Metric('TotalCurseWords_ruby', 1) in metrics
+    assert Metric('TotalCurseWords_python', 0) in metrics
diff --git a/tests/metrics/lines_test.py b/tests/metrics/lines_test.py
@@ -17,8 +17,8 @@ def test_lines_of_code_parser():
 
     expected_value = {
         'TotalLinesOfCode': 3,
-        'TotalLinesOfCode_Python': 1,
-        'TotalLinesOfCode_Yaml': 2,
+        'TotalLinesOfCode_python': 1,
+        'TotalLinesOfCode_yaml': 2,
     }
     for metric in metrics:
         assert metric.value == expected_value.get(metric.name, 0)
diff --git a/tests/server/servlets/widget_test.py b/tests/server/servlets/widget_test.py
@@ -47,11 +47,11 @@ def test_widget_data(server):
 
 def test_widget_data_multiple_values(server):
     with metrics_enabled(
-        {'TotalLinesOfCode': {}, 'TotalLinesOfCode_Text': {}},
+        {'TotalLinesOfCode': {}, 'TotalLinesOfCode_plain-text': {}},
     ):
         response = server.client.post(
             flask.url_for('widget.data'),
             data={'diff': file_diff_stat_test.SAMPLE_OUTPUT},
         )
     response_pq = pyquery.PyQuery(response.json['metrics'])
-    assert 'TotalLinesOfCode_Text' in response_pq.text()
+    assert 'TotalLinesOfCode_plain-text' in response_pq.text()