In [1]:
import pandas as pd
import os
import re
import pymorphy2

In [3]:
def special_symbols(df):
    """Flag tokens that contain at least one non-word character.

    Args:
        df: Mapping-like object (e.g. a pandas DataFrame) whose ``"token"``
            entry is an iterable of strings.

    Returns:
        list[int]: One value per token, ``1`` when the token contains a
        character from the regex non-word class and ``0`` otherwise.
    """
    # Raw string: "\W" inside a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). Compiled once instead of per token.
    non_word = re.compile(r'\W')
    # A single hit is enough, so search() replaces collecting every match.
    return [1 if non_word.search(token) else 0 for token in df["token"]]

In [4]:
def letters_numbers_combination(df):
    """Encode whether each token contains letters, digits, both or neither.

    Args:
        df: Mapping-like object (e.g. a pandas DataFrame) whose ``"token"``
            entry is an iterable of strings.

    Returns:
        list[int]: One code per token:
            0 - neither letters nor digits,
            1 - letters only (Cyrillic or Latin),
            2 - digits only,
            3 - both letters and digits.
    """
    letter = re.compile('[а-яА-ЯёЁa-zA-Z]')
    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).
    digit = re.compile(r'\d')
    feature = []
    for token in df["token"]:
        has_letters = letter.search(token) is not None
        has_digits = digit.search(token) is not None
        if has_letters and has_digits:
            val = 3
        elif has_digits:
            val = 2
        elif has_letters:
            val = 1
        else:
            val = 0
        feature.append(val)
    return feature

In [None]:
def vowels_consonants_combination(df):
    """Encode whether each token contains Russian vowels, consonants or both.

    Args:
        df: Mapping-like object (e.g. a pandas DataFrame) whose ``"token"``
            entry is an iterable of strings.

    Returns:
        list[int]: One code per token:
            0 - neither Russian vowels nor Russian consonants,
            1 - vowels only,
            2 - consonants only,
            3 - both vowels and consonants.
    """
    # The letter lists must be character classes ([...]). Without the
    # brackets the pattern only matches the entire letter sequence verbatim,
    # which makes the feature 0 for practically every token.
    # IGNORECASE so that capital letters are classified as well.
    vowel = re.compile('[ауоыиэяюёе]', re.IGNORECASE)
    consonant = re.compile('[бвгджйзклмнпрстфхцчшщ]', re.IGNORECASE)
    feature = []
    for token in df["token"]:
        has_vowels = vowel.search(token) is not None
        has_consonants = consonant.search(token) is not None
        if has_vowels and has_consonants:
            val = 3
        elif has_consonants:
            val = 2
        elif has_vowels:
            val = 1
        else:
            val = 0
        feature.append(val)
    return feature

In [5]:
def get_token_length(df):
    """Return the length of every token.

    Args:
        df: Mapping-like object (e.g. a pandas DataFrame) whose ``'token'``
            entry is an iterable of sized values.

    Returns:
        list[int]: ``len(token)`` for each token, in input order.
    """
    return [len(tok) for tok in df['token']]

In [6]:
def upper_letters_rate(df):
    """Compute the share of capital letters in every token.

    Args:
        df: Mapping-like object (e.g. a pandas DataFrame) whose ``'token'``
            entry is an iterable of strings.

    Returns:
        list: One value per token: the number of capital Russian/English
        letters divided by the token length, or the integer ``0`` when the
        token has no capital letters (which also covers empty tokens).
    """
    cyrillic_caps = 'аоиеёэыуюябвгджзйклмнпрстфхцчшщьъ'.upper()
    latin_caps = 'aeioubcdfghjklmnpqrstvwxyz'.upper()
    capital = re.compile('[{0}{1}]'.format(cyrillic_caps, latin_caps))

    rates = []
    for tok in df['token']:
        n_caps = len(capital.findall(tok))
        if n_caps:
            rates.append(n_caps / len(tok))
        else:
            rates.append(0)
    return rates

In [7]:
def upper_letters_inside(df):
    """Flag tokens that have an uppercase character after the first position.

    Args:
        df: Mapping-like object (e.g. a pandas DataFrame) whose ``'token'``
            entry is an iterable of strings.

    Returns:
        list[int]: One value per token, ``1`` when any character other than
        the first one satisfies ``str.isupper()`` and ``0`` otherwise
        (including empty and single-character tokens).
    """
    # The first character is skipped on purpose: only capitals *inside* the
    # token count. The letter tables the previous version built here were
    # never used, so they are dropped.
    return [
        int(any(ch.isupper() for ch in token[1:]))
        for token in df['token']
    ]

In [8]:
def create_array_from_file(file):
    """Collect the first whitespace-delimited field of every line.

    Args:
        file: Iterable of text lines (e.g. an open text file).

    Returns:
        list[str]: For each line, the text before the first run of
        whitespace. A line that starts with whitespace (or is blank)
        contributes an empty string.
    """
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).
    whitespace = re.compile(r'\s+')
    return [whitespace.split(line)[0] for line in file]

In [9]:
def is_in_dictionary(df):
    """Flag tokens whose normal form is listed in the dictionary file.

    The dictionary is read from ``../dictionaries/dictionary.txt``; its
    entries are the values returned by ``create_array_from_file`` for that
    file. Each token is normalised with pymorphy2 (normal form of the first
    parse returned by ``MorphAnalyzer.parse``) before the lookup.

    Args:
        df: Mapping-like object (e.g. a pandas DataFrame) whose ``'token'``
            entry is an iterable of strings.

    Returns:
        list[int]: One value per token, ``1`` when the normal form is a
        dictionary entry and ``0`` otherwise.
    """
    # `with` closes the dictionary file even if reading fails; the previous
    # version left the handle open.
    with open("../dictionaries/dictionary.txt", 'r', encoding='utf-8') as rus_dict:
        # A set gives exact membership without requiring the file to be
        # sorted in Python's string order (the old hand-written binary search
        # silently depended on that) and also copes with an empty dictionary.
        known_words = set(create_array_from_file(rus_dict))

    morph = pymorphy2.MorphAnalyzer()
    feature = []
    for token in df['token']:
        norm = morph.parse(token)[0].normal_form
        feature.append(int(norm in known_words))
    return feature

In [None]:
def process_dataframe(filename):
    """Add all token features to an Excel dataset and save it as CSV.

    Reads ``../dataset/ready_data/<filename>`` with ``pd.read_excel``, adds
    one column per feature function, and writes the result next to the input
    as ``<name without extension>_with_features.csv``.

    Args:
        filename: Name of the Excel file inside ``../dataset/ready_data/``.

    Returns:
        None. The result is written to disk.
    """
    path = "../dataset/ready_data/"
    df = pd.read_excel(path+filename)
    df['special_symbols'] = special_symbols(df)
    df['letters_numbers_combination'] = letters_numbers_combination(df)
    df['vowels_consonants_combination'] = vowels_consonants_combination(df)
    df['get_token_length'] = get_token_length(df)
    df['upper_letters_rate'] = upper_letters_rate(df)
    df['upper_letters_inside'] = upper_letters_inside(df)
    df['is_in_dictionary'] = is_in_dictionary(df)
    # splitext()[0] keeps the whole file name without its extension. Indexing
    # the basename string directly kept only its first character, so inputs
    # sharing an initial letter overwrote each other's output.
    stem = os.path.splitext(os.path.basename(filename))[0]
    new_filename = stem+"_with_features.csv"
    df.to_csv(path+new_filename)

In [None]:
# Generate the feature CSV for each prepared dataset, in this order.
for dataset_file in ("merges_data.xlsx", "6_without_stopwords.xlsx"):
    process_dataframe(dataset_file)