# AI4Code: Identifying the language of notebooks from their markdown cells

In [None]:
import re
import fasttext
from nltk.stem import WordNetLemmatizer

In [None]:
# Number of leading markdown cells per notebook whose text is joined together
# and fed to the language detector.
NUM_PREDICTION_MARKDOWNS = 5 # median of count markdown cells // 2
# Directory into which the resulting notebooks_lang.csv file is written.
PROCESSING_DATA_PATH = '../../data/preprocessed/'

## Text preprocessing function

In [None]:
# NOTE: despite its name this is a WordNet lemmatizer, not a stemmer.
# No active code in the visible part of this file calls it.
stemmer = WordNetLemmatizer()


def preprocess_text(document):
    """Normalise raw markdown text before language identification.

    The cleaning steps are, in order: strip HTML tags, blank out every
    non-word character (including newlines), drop isolated single ASCII
    letters, drop digits, collapse whitespace and lowercase the result.
    No lemmatization is applied.

    Args:
        document: The text to clean. Any object is accepted; it is converted
            with ``str()`` first.

    Returns:
        The cleaned, lowercased, single-line string without leading or
        trailing whitespace.
    """
    text = str(document)

    # Strip HTML tags first: once non-word characters are blanked out the
    # angle brackets are gone and tags can no longer be recognised. A space
    # is substituted so that words separated only by a tag are not fused.
    text = re.sub(r"<.*?>", " ", text)

    # Replace every non-word character (punctuation, newlines, ...) by a space.
    text = re.sub(r"\W", " ", text)

    # Drop isolated single ASCII letters inside the text. The lookahead leaves
    # the following whitespace unconsumed so that runs of consecutive single
    # letters ("a b c") are removed completely.
    text = re.sub(r"\s+[a-zA-Z](?=\s)", "", text)

    # Drop a single letter at the very start of the text. This also covers the
    # "b" prefix left behind when a bytes object was passed through str().
    text = re.sub(r"^[a-zA-Z]\s+", "", text)

    # Drop digits before collapsing whitespace so that a removed number does
    # not leave a double space behind.
    text = re.sub(r"[0-9]+", "", text)

    # Collapse whitespace runs and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()

    return text.lower()


In [None]:
class LanguageIdentification:
    """Thin wrapper around a pretrained fastText language-identification model."""

    # Location of the pretrained model used when no explicit path is given.
    DEFAULT_MODEL_PATH = "../../data/pretrained_models/lid.176.bin"

    def __init__(self, model_path=None):
        """Load the fastText model from disk.

        Args:
            model_path: Path to the fastText model file. When ``None`` (the
                default) ``DEFAULT_MODEL_PATH`` is used.
        """
        if model_path is None:
            model_path = self.DEFAULT_MODEL_PATH
        self.model = fasttext.load_model(model_path)

    def predict_lang(self, text, k=1):
        """Predict the most likely language(s) of ``text``.

        The text is cleaned with ``preprocess_text`` before prediction, which
        among other things removes the newlines the text contains.

        Args:
            text: Raw text to classify.
            k: Number of top-ranked languages to return (default 1).

        Returns:
            The unmodified result of ``self.model.predict``; per the fastText
            Python API this is a ``(labels, probabilities)`` pair.
        """
        text = preprocess_text(text)
        return self.model.predict(text, k=k)


# Shared detector instance; constructing it loads the fastText model from disk.
language_ident = LanguageIdentification()

In [None]:
# Keep only the columns of interest, restricted to markdown cells.
# NOTE(review): assumes `train` is a DataFrame with these columns that is
# defined elsewhere in the notebook session — confirm.
lang_df = train[['id', 'cell', 'cell_type', 'source']]
lang_df = lang_df[lang_df.cell_type == 'markdown']

# Notebook ids in order of first appearance; notebook_lang is filled in the
# same order so the two sequences stay aligned.
notebooks_ids = lang_df.id.unique()

# Group the markdown sources once instead of re-filtering the whole frame
# for every notebook.
markdown_by_notebook = lang_df.groupby('id', sort=False)['source']

notebook_lang = []
for notebook_id in tqdm(notebooks_ids):
    # Only the first NUM_PREDICTION_MARKDOWNS markdown cells are used.
    markdown_source = markdown_by_notebook.get_group(notebook_id).head(NUM_PREDICTION_MARKDOWNS).to_list()
    markdown_source = " ".join(markdown_source)
    notebook_lang.append(language_ident.predict_lang(markdown_source))

In [None]:
# Pair every notebook id with its raw prediction and persist the table as CSV.
notebooks_lang_df = pd.DataFrame(
    data={
        'notebook_id': notebooks_ids,
        'notebook_lang': notebook_lang,
    },
)
notebooks_lang_df.to_csv(
    path_or_buf=os.path.join(PROCESSING_DATA_PATH, 'notebooks_lang.csv'),
    index=False,
)

In [None]:
# Preview the first rows of the per-notebook language table.
notebooks_lang_df.head()

In [None]:
# Show the second row (positional index 1) in full.
notebooks_lang_df.iloc[1]