1. Easy

Импортируем библиотеки:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import collections
import warnings
import string
import re

from git import Repo

import os
from os import walk

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.stem import SnowballStemmer

from sklearn.exceptions import DataConversionWarning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

Будем собирать датасет через GitHub. Для начала возьмем несколько ссылок на репозитории, в каждом из которых лежат в основном файлы на одном из языков: C++, Python, JavaScript:

In [5]:
# Repositories to clone, grouped by language; the key is also used as the
# sub-directory name under cloned_repos/ when cloning.
links = {
    "C++":[
        "https://github.com/adah1972/geek_time_cpp",
        "https://github.com/AlohaWorld/CppKid",
        "https://github.com/leovandriel/caffe2_cpp_tutorial",
        "https://github.com/srcmake/cpp-stl-algorithms",
        "https://github.com/JamesRaynard/Multithreading-Cpp-Course",    
    ],
    "Python":[
        "https://github.com/cosmicpython/code",
        "https://github.com/fluentpython/example-code-2e",
        "https://github.com/hjwp/book-example",
        "https://github.com/trekhleb/learn-python",
        "https://github.com/davidbombal/pythonvideos",
        "https://github.com/sobolevn/python-code-disasters",
        "https://github.com/adaptives/python-examples",
    ],
    "JavaScript":[
        "https://github.com/shama/letswritecode",
        "https://github.com/mdn/js-examples",
        "https://github.com/mdn/dom-examples",
        "https://github.com/bahmutov/javascript-journey",
        "https://github.com/davidflanagan/jstdg7",
        "https://github.com/mongodb-developer/mern-stack-example",
    ]
}

# Maps a file extension (text after the last dot) to the language label that
# creating_dataset assigns to every chunk read from such a file.
file_endings = {
    "cpp": "C++",
    "hpp": "C++",
    "py": "Python",
    "js": "JavaScript"
}

Функции клонирования репозиториев, сборки датасета и предобработки текста (стемминг и лемматизация):

In [3]:
def preprocess_text(text):
    """Strip URLs and punctuation from *text* and lower-case it.

    The text is split on whitespace. In every word each character of
    ``string.punctuation`` is deleted, except the dot, which becomes a space
    so that dotted names such as ``os.path`` fall apart into separate words
    instead of fusing together. The processed words are joined with single
    spaces; a word consisting only of punctuation contributes an empty
    string, so the result may contain consecutive spaces.
    """
    without_urls = re.sub(r'https?://[^\s/$.?#].[^\s]*', '', text)
    # Single translation table: "." -> " ", every other punctuation mark -> removed.
    table = str.maketrans(".", " ", string.punctuation.replace(".", ""))
    return " ".join(
        token.translate(table).lower() for token in without_urls.split()
    )

def applying_lemmatization(text):
    """Return *text* cleaned by ``preprocess_text`` with every word lemmatized.

    Words are lemmatized one by one with nltk's ``WordNetLemmatizer`` and
    joined back with single spaces.
    """
    cleaned_words = preprocess_text(text).split()
    wordnet = WordNetLemmatizer()
    return " ".join(map(wordnet.lemmatize, cleaned_words))

def applying_stemming(text):
    """Return *text* cleaned by ``preprocess_text`` with every word stemmed.

    Words are stemmed one by one with nltk's English ``SnowballStemmer`` and
    joined back with single spaces.
    """
    cleaned_words = preprocess_text(text).split()
    snowball = SnowballStemmer('english')
    return " ".join(map(snowball.stem, cleaned_words))

def clone(links, dest_root="cloned_repos"):
    """Clone every repository in *links* into ``<dest_root>/<language>/<repo name>``.

    Args:
        links: mapping of language label -> iterable of repository URLs.
        dest_root: directory under which the per-language folders are created.

    Repositories whose destination directory already exists and is not empty
    are skipped, so the function can simply be called again after a failed
    run (for example a dropped connection) and will fetch only what is still
    missing. Errors raised by ``Repo.clone_from`` are not caught.
    """
    for lang, repo_links in links.items():
        for repo_link in repo_links:
            # rstrip("/") keeps a trailing slash from producing an empty name.
            repo_name = repo_link.rstrip("/").split("/")[-1]
            destination = os.path.join(dest_root, lang, repo_name)
            # git refuses to clone into a non-empty directory; treat such a
            # directory as an already finished clone instead of failing.
            if os.path.isdir(destination) and os.listdir(destination):
                continue
            Repo.clone_from(repo_link, destination)

def creating_dataset(path, files, chunk, data, endings=None):
    """Append fixed-size chunks of the recognised source files to *data*.

    Args:
        path: directory that contains *files*.
        files: file names (not paths) inside *path*.
        chunk: maximum number of characters per sample; must be positive.
        data: list extended in place with ``{"language": ..., "code": ...}``
            dicts, one per chunk.
        endings: mapping of file extension (without the dot) to language
            label. Defaults to the module-level ``file_endings``.

    Raises:
        ValueError: if *chunk* is not positive.

    Entries that are not regular files, and files whose extension is not in
    *endings*, are skipped. Files are decoded as UTF-8 with undecodable bytes
    dropped.
    """
    if chunk <= 0:
        raise ValueError("chunk must be a positive integer")
    if endings is None:
        endings = file_endings
    for file in files:
        full_path = os.path.join(path, file)
        if not os.path.isfile(full_path):
            continue
        # splitext gives "" for names without an extension, so a file that is
        # literally called "py" or "c" is not mistaken for source code.
        extension = os.path.splitext(file)[1][1:]
        lang = endings.get(extension)
        if lang is None:
            continue
        with open(full_path, "rb") as f:
            code = f.read().decode('utf-8', errors='ignore')
        # The file is already closed here; chunking needs only the text.
        for start in range(0, len(code), chunk):
            data.append({"language": lang, "code": code[start:start + chunk]})


Клонируем репозитории:

In [6]:
# Clone every repository listed in `links` into cloned_repos/<language>/<repo name>.
clone(links)

GitCommandError: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/mdn/dom-examples cloned_repos/JavaScript/dom-examples
  stderr: 'Cloning into 'cloned_repos/JavaScript/dom-examples'...
POST git-upload-pack (175 bytes)
POST git-upload-pack (217 bytes)
error: RPC failed; curl 92 HTTP/2 stream 0 was not closed cleanly: CANCEL (err 8)
error: 6420 bytes of body are still expected
fetch-pack: unexpected disconnect while reading sideband packet
fatal: early EOF
fatal: fetch-pack: invalid index-pack output
'

Создаем датасет и преобразуем текст: удаляем ссылки и пунктуацию, но точки заменяем пробелами — иначе слова, разделенные точкой, склеятся (например, nltk.stem.porter превратится в nltkstemporter). Затем выполняем стемминг и лемматизацию:

In [None]:
# Maximum number of characters per sample when a source file is split up.
chunk = 4096
data = []

# os.walk yields (directory, sub-directory names, file names); only the file
# names are handed on, and directories without files are skipped.
for directory, _subdirs, file_names in walk('cloned_repos'):
    if not file_names:
        continue
    creating_dataset(path=directory, files=file_names, chunk=chunk, data=data)

columns = ["language", "code"]
df = pd.DataFrame(data, columns=columns)

# Two alternative normalisations of the same raw chunks.
df["S_code"] = df["code"].apply(applying_stemming)
df["L_code"] = df["code"].apply(applying_lemmatization)
df = df[["language", "code", "L_code", "S_code"]]

In [None]:
# Show the assembled dataset: raw chunks plus their lemmatized and stemmed versions.
print(df)

In [None]:
# Number of distinct code chunks per language, to check how balanced the classes are.
print(df.groupby('language')['code'].nunique())

Будем использовать TF-IDF. Обучим модели и сравним результаты по метрике F1 (поскольку классы несбалансированы):

In [None]:
# Silence ConvergenceWarning (it is a UserWarning subclass).
warnings.filterwarnings("ignore", category=UserWarning)
# A fixed seed makes the split reproducible, so scores from different text
# normalisations can be compared on the same train/test partition.
x_train, x_test, y_train, y_test = train_test_split(
    df["L_code"], df["language"], train_size=0.8, random_state=42
)
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
# The test set is only transformed: the vocabulary and IDF weights come from
# the training data alone.
x_test = vectorizer.transform(x_test)
model = LogisticRegression()
model.fit(x_train, y_train)
prediction = model.predict(x_test)
# f1_score expects (y_true, y_pred); with average="weighted" the class weights
# are taken from the first argument, so the order matters.
print("lemmatization:", f1_score(y_test, prediction, average="weighted"))

In [None]:
# A fixed seed makes the split reproducible, so scores from different text
# normalisations can be compared on the same train/test partition.
x_train, x_test, y_train, y_test = train_test_split(
    df["S_code"], df["language"], train_size=0.8, random_state=42
)
# Fresh estimators keep this cell independent of which cell was run before it.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)
model = LogisticRegression()
model.fit(x_train, y_train)
prediction = model.predict(x_test)
# f1_score expects (y_true, y_pred); with average="weighted" the class weights
# are taken from the first argument, so the order matters.
print("stemming", f1_score(y_test, prediction, average="weighted"))

Результаты получились хорошими, лемматизация показала себя лучше стемминга. Возможно, результаты такие потому, что эти 3 языка сильно отличаются друг от друга. Также, скорее всего, повлияла правильная обработка точки (если удалять ее вместе со всей пунктуацией и не ставить вместо нее пробел, то слова в импортах будут склеиваться).

2. Medium

Делаем то же самое, что и в Easy; добавим 7 языков:

In [None]:
# Repositories to clone, grouped by language; the key is also used as the
# sub-directory name under cloned_repos/ when cloning.
links = {
    "C++": [
        "https://github.com/adah1972/geek_time_cpp",
        "https://github.com/AlohaWorld/CppKid",
        "https://github.com/leovandriel/caffe2_cpp_tutorial",
        "https://github.com/srcmake/cpp-stl-algorithms",
        "https://github.com/JamesRaynard/Multithreading-Cpp-Course",
    ],
    "Python": [
        "https://github.com/cosmicpython/code",
        "https://github.com/fluentpython/example-code-2e",
        "https://github.com/hjwp/book-example",
        "https://github.com/trekhleb/learn-python",
        "https://github.com/davidbombal/pythonvideos",
        "https://github.com/sobolevn/python-code-disasters",
        "https://github.com/adaptives/python-examples",
    ],
    "JavaScript": [
        "https://github.com/shama/letswritecode",
        "https://github.com/jhu-ep-coursera/fullstack-course4",
        "https://github.com/mdn/js-examples",
        "https://github.com/mdn/dom-examples",
        "https://github.com/bahmutov/javascript-journey",
        "https://github.com/davidflanagan/jstdg7",
        "https://github.com/mongodb-developer/mern-stack-example",
    ],
    "Java": [
        "https://github.com/BruceEckel/OnJava8-Examples",
        "https://github.com/marhan/effective-java-examples",
        "https://github.com/vladmihalcea/high-performance-java-persistence",
        "https://github.com/attacomsian/code-examples",
        "https://github.com/mark-watson/Java-AI-Book-Code",
        "https://github.com/afsalashyana/JavaFX-Tutorial-Codes",
    ],
    "Yaml": [
        "https://github.com/Tej-Singh-Rana/k8s-test",
        "https://github.com/d-led/gocd_docker_compose_example",
        "https://github.com/redhat-nfvpe/vdpa-deployment",
        "https://github.com/redhat-cop/aap_configuration_template",
        "https://github.com/89luca89/terrible",
        "https://github.com/starcraft66/infrastructure",
        "https://github.com/fititnt/infrastructure-as-code-ad-hoc-ansible",
    ],
    "Bash": [
        "https://github.com/vossenjp/bashcookbook-examples",
        "https://github.com/docusign/code-examples-bash",
        "https://github.com/yafraorg/yafra",
        "https://github.com/particleflux/kcov-bats-circleci-codeclimate",
        "https://github.com/antrosgeor/Bash-Code-Example",
        "https://github.com/jhomer-hscl/example-bash-codespace",
        "https://github.com/kevendi/codecademy_bash_examples",
    ],
    "Markdown": [
        "https://github.com/bananananacat/MBTI_Analysis",
        "https://github.com/mjbvz/vscode-fenced-code-block-grammar-injection-example",
        "https://github.com/anko/txm",
        "https://github.com/adambard/learnxinyminutes-docs",
        "https://github.com/microsoft/vscode-docs",
        "https://github.com/SAP/styleguides",
        "https://github.com/Your-First-Open-Source-Project/start-here",
    ],
    "C": [
        "https://github.com/tsnsoft/CodeLite_wxWidgets_Div2_demo",
        "https://github.com/tsnsoft/CodeBlocks_blank_prject",
        "https://github.com/jgamblin/Mirai-Source-Code",
        # The comma after the next entry was missing, which silently glued it
        # to the following URL through implicit string concatenation.
        "https://github.com/yanfeizhang/coder-kung-fu",
        "https://github.com/tsnsoft/CodeLite_Table_demo",
        "https://github.com/tsnsoft/CodeLite_wxWidgets_demo",
        "https://github.com/tsnsoft/wxwidgets_demo-linux",
    ],
    "Kotlin": [
        "https://github.com/funfunStory/fp-kotlin-example",
        "https://github.com/Foso/Jetpack-Compose-Playground",
        "https://github.com/aws-samples/lambda-kotlin-groovy-example",
        "https://github.com/java-to-kotlin/code",
        "https://github.com/android/codelab-android-compose",
        "https://github.com/loopeer/code-reader",
        "https://github.com/android/codelab-android-paging",
    ],
    "Haskell": [
        "https://github.com/bravit/hid-examples",
        "https://github.com/simonmar/parconc-examples",
        "https://github.com/mark-watson/haskell_tutorial_cookbook_examples",
        "https://github.com/palf/haskell-sdl2-examples",
        "https://github.com/reflex-frp/reflex-examples",
        "https://github.com/thma/WhyHaskellMatters",
        "https://github.com/haskell/stylish-haskell",
    ],
}

# Maps a file extension (text after the last dot) to the language label that
# creating_dataset assigns to every chunk read from such a file.
file_endings = {
    "cpp": "C++",
    "hpp": "C++",
    "py": "Python",
    "js": "JavaScript",
    "java": "Java",
    "yaml": "Yaml",
    # ".yml" is the other common YAML extension; without it those files are
    # silently dropped from the dataset.
    "yml": "Yaml",
    "sh": "Bash",
    "md": "Markdown",
    "h": "C",
    "c": "C",
    "kt": "Kotlin",
    "hs": "Haskell",
}

Клонируем репозитории, которые не были склонированы в Easy:

In [None]:
# Select repositories by what is missing on disk rather than by language:
# this also picks up repositories added to the C++/Python/JavaScript lists
# since the first run and any whose earlier clone failed.
# The path below mirrors the destination that clone() builds.
new_links = {
    lang: [
        repo_link
        for repo_link in repo_links
        if not os.path.isdir(f"cloned_repos/{lang}/{repo_link.split('/')[-1]}")
    ]
    for lang, repo_links in links.items()
}
clone(new_links)

Далее все аналогично 1 заданию:

In [None]:
# Maximum number of characters per sample when a source file is split up.
chunk = 4096
data = []

# os.walk yields (directory, sub-directory names, file names); only the file
# names are handed on, and directories without files are skipped.
for directory, _subdirs, file_names in walk('cloned_repos'):
    if not file_names:
        continue
    creating_dataset(path=directory, files=file_names, chunk=chunk, data=data)

columns = ["language", "code"]
df = pd.DataFrame(data, columns=columns)

# Two alternative normalisations of the same raw chunks.
df["S_code"] = df["code"].apply(applying_stemming)
df["L_code"] = df["code"].apply(applying_lemmatization)
df = df[["language", "code", "L_code", "S_code"]]
print(df)

In [None]:
# Number of distinct code chunks per language, to check how balanced the classes are.
print(df.groupby('language')['code'].nunique())

In [None]:
# Silence ConvergenceWarning (it is a UserWarning subclass).
warnings.filterwarnings("ignore", category=UserWarning)
# A fixed seed makes the split reproducible, so scores from different text
# normalisations can be compared on the same train/test partition.
x_train, x_test, y_train, y_test = train_test_split(
    df["L_code"], df["language"], train_size=0.8, random_state=42
)
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
# The test set is only transformed: the vocabulary and IDF weights come from
# the training data alone.
x_test = vectorizer.transform(x_test)
model = LogisticRegression()
model.fit(x_train, y_train)
prediction = model.predict(x_test)
# f1_score expects (y_true, y_pred); with average="weighted" the class weights
# are taken from the first argument, so the order matters.
print("lemmatization:", f1_score(y_test, prediction, average="weighted"))

In [None]:
# A fixed seed makes the split reproducible, so scores from different text
# normalisations can be compared on the same train/test partition.
x_train, x_test, y_train, y_test = train_test_split(
    df["S_code"], df["language"], train_size=0.8, random_state=42
)
# Fresh estimators keep this cell independent of which cell was run before it.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)
model = LogisticRegression()
model.fit(x_train, y_train)
prediction = model.predict(x_test)
# f1_score expects (y_true, y_pred); with average="weighted" the class weights
# are taken from the first argument, so the order matters.
print("stemming", f1_score(y_test, prediction, average="weighted"))

Результаты: