# Select File Extensions

In [1]:
import pandas as pd
# Show every row when a DataFrame is displayed (no truncation).
pd.options.display.max_rows = None

Link to Gist: https://gist.github.com/ppisarczyk/43962d06686722d26d176fad46879d41

In [2]:
# Raw gist URL; it embeds a revision hash, so the same file version is fetched on every run.
gist_url = (
    "https://gist.githubusercontent.com/ppisarczyk/43962d06686722d26d176fad46879d41"
    "/raw/211547723b4621a622fc56978d74aa416cbd1729/Programming_Languages_Extensions.json"
)
df = pd.read_json(gist_url)

There are 396 programming languages:

In [3]:
# Number of rows (one per language) in the loaded list.
df.shape[0]

396

## Look at Extension Collisions

In [4]:
# One row per extension, listing every language that claims it.
df_coll = (
    df.explode("extensions")
    .groupby("extensions")["name"]
    .agg(list)
    .reset_index()
)
# n = number of languages sharing the extension.
df_coll["n"] = df_coll["name"].map(len)
# Persist only the extensions claimed by more than one language.
df_coll.loc[df_coll["n"].gt(1)].to_json("extension_clashes.json", orient="records")

In [5]:
# How many extensions are claimed by more than one language.
int(df_coll["n"].gt(1).sum())

65

In [6]:
# Preview the first few clashing extensions.
df_coll.loc[df_coll["n"].gt(1)].head()

Unnamed: 0,extensions,name,n
49,.asc,"[AGS Script, AsciiDoc, Public Key]",3
72,.b,"[Brainfuck, Limbo]",2
77,.bb,"[BitBake, BlitzBasic]",2
80,.bf,"[Brainfuck, HyPhy]",2
87,.brd,"[Eagle, KiCad]",2


## Resolve conflicts

In [7]:
# Languages removed from the list entirely.
langs_to_drop = ["HyPhy"]

# Extensions removed for every language: first the generic ones,
# then the `.sublime*` ones.
exts_to_drop = [
    ".asc", ".b", ".cgi", ".ch", ".cls", ".d", ".es", ".fcgi", ".fr", ".gml",
    ".gs", ".inc", ".l", ".m", ".mod", ".moo", ".ms", ".n", ".ncl", ".nl",
    ".pluginspec", ".pp", ".pro", ".sc", ".tst", ".v",
] + [
    ".sublime-build",
    ".sublime-commands",
    ".sublime-completions",
    ".sublime-keymap",
    ".sublime-macro",
    ".sublime-menu",
    ".sublime-mousemap",
    ".sublime-project",
    ".sublime-settings",
    ".sublime-theme",
    ".sublime-workspace",
    ".sublime_metrics",
    ".sublime_session",
]

# (extension, language) pairs: for each listed extension, only the row of the
# named language is kept further down; rows of other languages are dropped.
exts_to_lang_map = [
    (".bb", "BitBake"),
    (".brd", "Eagle"),
    (".cake", "C#"),
    (".cl", "OpenCL"),
    (".cp", "C++"),
    (".cs", "C#"),
    (".ecl", "ECL"),
    (".f", "FORTRAN"),
    (".for", "FORTRAN"),
    (".frag", "GLSL"),
    (".fs", "F#"),
    (".fx", "Flux"),
    (".g", "G-code"),
    (".gd", "GDScript"),
    (".h", "C"),
    (".hh", "C++"),
    (".j", "Jasmin"),
    (".lisp", "Common Lisp"),
    (".ls", "LiveScript"),
    (".lsp", "Common Lisp"),
    (".mm", "Objective-C++"),
    (".nb", "Mathematica"),
    (".php", "PHP"),
    (".pm", "Perl"),
    (".pl", "Perl"),
    (".pod", "Pod"),
    (".r", "R"),
    (".rpy", "Ren'Py"),
    (".rs", "Rust"),
    (".sch", "Eagle"),
    (".sls", "SaltStack"),
    (".sql", "SQL"),
    (".st", "Smalltalk"),
    (".t", "Perl"),
    (".ts", "TypeScript"),
    (".tsx", "TypeScript"),
    (".vhost", "ApacheConf"),
]

# Language renames (old name -> new name); the three SQL dialects all become "SQL".
langs_to_map = {
    "M4Sugar": "M4",
    **dict.fromkeys(("PLSQL", "PLpgSQL", "SQLPL"), "SQL"),
}

# Extra (language -> extensions) entries appended to the downloaded list.
langs_to_add = {
    "Maple": [".mpl"],
    "Octave": [".oct"],
    "WebAssembly": [".wat"],
    "Solidity": [".sol"],
    "Csound": [".csd"],
    "Zig": [".zig"],
    "C++": [".C", ".H"],
    "Dockerfile": ["Dockerfile"],
    "Makefile": ["Makefile"],
}

## Clean list

Add languages and extensions that are missing from the original list:

In [8]:
# Build a small frame from `langs_to_add`: one row per language, with its
# extensions kept as a list (same layout as the "extensions" column of `df`).
df_add = pd.DataFrame(
    {
        "name": list(langs_to_add.keys()),
        "extensions": list(langs_to_add.values()),
    }
)
df_add["type"] = "programming"

# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported replacement. As with the old `append` call,
# the original row labels are kept (no `ignore_index`).
df = pd.concat([df, df_add])

Make each extension (~1000) a row:

In [9]:
# Expand the list-valued "extensions" column: one row per (language, extension).
df_ext = df.explode(column="extensions")
df_ext.shape[0]

1018

Remove languages without an extension:

In [10]:
# Keep only the rows that actually carry an extension.
has_ext = df_ext["extensions"].notna()
df_ext = df_ext.loc[has_ext]
# Sanity check: no rows with a missing extension may remain (empty frame expected).
df_ext.loc[df_ext["extensions"].isna()]

Unnamed: 0,name,type,extensions


In [11]:
# Preview the first five (language, extension) rows.
df_ext.iloc[:5]

Unnamed: 0,name,type,extensions
0,ABAP,programming,.abap
1,AGS Script,programming,.asc
1,AGS Script,programming,.ash
2,AMPL,programming,.ampl
2,AMPL,programming,.mod


1. Rename some languages
2. Remove some extensions
3. Remove some languages

In [12]:
# 1. Rename languages according to `langs_to_map`.
df_ext = df_ext.assign(name=df_ext["name"].replace(langs_to_map))

# 2. + 3. Discard rows whose extension or (renamed) language is on a drop list.
unwanted_ext = df_ext["extensions"].isin(exts_to_drop)
unwanted_lang = df_ext["name"].isin(langs_to_drop)
df_ext = df_ext[~(unwanted_ext | unwanted_lang)]


For each extension listed in the mapping, keep only the row of its mapped language:

In [13]:
# Preferred language per ambiguous extension. The dict lookup below is only
# equivalent to checking every (extension, language) pair if each extension
# occurs once in `exts_to_lang_map`, so verify that first.
ext_to_lang = dict(exts_to_lang_map)
assert len(ext_to_lang) == len(exts_to_lang_map), "duplicate extension in exts_to_lang_map"

# Vectorised replacement for running a row-wise `DataFrame.apply` once per
# mapping entry: a row is flagged when its extension has a preferred language
# and the row's language is a different one. Extensions that are not in the
# mapping map to NaN and are therefore never flagged.
preferred_lang = df_ext["extensions"].map(ext_to_lang)
df_ext["drop"] = preferred_lang.notna() & (df_ext["name"] != preferred_lang)

This drops 45 extension–language pairs:

In [14]:
# Number of rows flagged for removal.
int(df_ext["drop"].sum())

45

In [15]:
# Keep only the rows that were not flagged.
flagged = df_ext["drop"]
df_ext = df_ext.loc[~flagged]

After the language name mapping there can be duplicates:

In [16]:
# Keep the first occurrence of every (language, extension) pair.
repeated = df_ext.duplicated(subset=["name", "extensions"])
df_ext = df_ext[~repeated]

Check if an extension appears more than once:

In [17]:
from collections import Counter

# If the most frequent extensions all have a count of 1, every extension is unique.
ext_counts = Counter(df_ext["extensions"])
ext_counts.most_common(5)

[('.abap', 1), ('.ash', 1), ('.ampl', 1), ('.g4', 1), ('.apib', 1)]

Number of extensions:

In [18]:
# Remaining (language, extension) rows.
df_ext.shape[0]

870

Merge all extensions such that there is one row per language:

In [19]:
# Collapse back to one row per language, with its extensions gathered in a list.
df_result = (
    df_ext.groupby("name")["extensions"]
    .agg(list)
    .reset_index()
)

There are in total 370 languages:

In [20]:
# Number of languages in the final list (one row each).
df_result.shape[0]

370

In [21]:
# Preview the first five languages with their extension lists.
df_result.iloc[:5]

Unnamed: 0,name,extensions
0,ABAP,[.abap]
1,AGS Script,[.ash]
2,AMPL,[.ampl]
3,ANTLR,[.g4]
4,API Blueprint,[.apib]


## Save

Save list as JSON:

In [22]:
import json

# Language name -> list of its extensions.
result_dict = dict(zip(df_result["name"], df_result["extensions"]))

with open("programming-languages-to-file-extensions.json", "w") as fh:
    json.dump(result_dict, fp=fh)

## Compare list to v1.0

In [23]:
# The 30 languages of the v1.0 list with their extensions, used below as the
# baseline for comparing against the newly generated list.
langs_30 = {
    "Assembly": [".asm"],
    "Batchfile": [".bat", ".cmd"],
    "C": [".c", ".h"],
    "C#": [".cs"],
    "C++": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
    "CMake": [".cmake"],
    "CSS": [".css"],
    "Dockerfile": [".dockerfile", "Dockerfile"],
    "FORTRAN": [".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp"],
    "Go": [".go"],
    "Haskell": [".hs"],
    "HTML": [".html"],
    "Java": [".java"],
    "JavaScript": [".js"],
    "Julia": [".jl"],
    "Lua": [".lua"],
    "Makefile": ["Makefile"],
    "Markdown": [".md", ".markdown"],
    "PHP": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
    "Perl": [".pl", ".pm", ".pod", ".perl"],
    "PowerShell": [".ps1", ".psd1", ".psm1"],
    "Python": [".py"],
    "Ruby": [".rb"],
    "Rust": [".rs"],
    "SQL": [".sql"],
    "Scala": [".scala"],
    "Shell": [".sh", ".bash", ".command", ".zsh"],
    "TypeScript": [".ts", ".tsx"],
    "TeX": [".tex"],
    "Visual Basic": [".vb"],
}

In [24]:
# Per v1.0 language: extensions present in the new list but absent from v1.0.
for lang_name, old_exts in langs_30.items():
    only_in_new = [ext for ext in result_dict[lang_name] if ext not in old_exts]
    print(lang_name, only_in_new)

Assembly ['.a51', '.nasm']
Batchfile []
C ['.cats', '.idc', '.w']
C# ['.cake', '.cshtml', '.csx']
C++ ['.cp', '.cxx', '.hxx', '.inl', '.ipp', '.tcc', '.tpp']
CMake ['.cmake.in']
CSS []
Dockerfile []
FORTRAN []
Go []
Haskell ['.hsc']
HTML ['.htm', '.html.hl', '.xht', '.xhtml']
Java []
JavaScript ['._js', '.bones', '.es6', '.jake', '.jsb', '.jscad', '.jsfl', '.jsm', '.jss', '.njs', '.pac', '.sjs', '.ssjs', '.xsjs', '.xsjslib']
Julia []
Lua ['.nse', '.pd_lua', '.rbxs', '.wlua']
Makefile ['.mak', '.mk', '.mkfile']
Markdown ['.mkd', '.mkdn', '.mkdown', '.ron']
PHP ['.aw', '.ctp']
Perl ['.al', '.ph', '.plx', '.psgi', '.t']
PowerShell []
Python ['.bzl', '.gyp', '.lmi', '.pyde', '.pyp', '.pyt', '.pyw', '.tac', '.wsgi', '.xpy']
Ruby ['.builder', '.gemspec', '.god', '.irbrc', '.jbuilder', '.mspec', '.podspec', '.rabl', '.rake', '.rbuild', '.rbw', '.rbx', '.ru', '.ruby', '.thor', '.watchr']
Rust ['.rs.in']
SQL ['.pls', '.pck', '.pkb', '.pks', '.plb', '.plsql', '.cql', '.ddl', '.prc', '.tab', '.ud

In [25]:
# Per v1.0 language: extensions that v1.0 had but the new list no longer contains.
for lang_name, old_exts in langs_30.items():
    new_exts = result_dict[lang_name]
    print(lang_name, [ext for ext in old_exts if ext not in new_exts])

Assembly []
Batchfile []
C []
C# []
C++ []
CMake []
CSS []
Dockerfile []
FORTRAN []
Go []
Haskell []
HTML []
Java []
JavaScript []
Julia []
Lua []
Makefile []
Markdown []
PHP []
Perl ['.pod']
PowerShell []
Python []
Ruby []
Rust []
SQL []
Scala []
Shell []
TypeScript []
TeX []
Visual Basic []
