In [14]:
import os

import pandas as pd
PATH = 'epir_train/'

# All four source tables are read the same way: the CSV's 'Unnamed: 0' column
# is used as the DataFrame index.
articles, life_situations, news, services = (
    pd.read_csv(PATH + csv_name, index_col='Unnamed: 0')
    for csv_name in ('articles.csv', 'life_situations.csv', 'news.csv', 'services.csv')
)

In [15]:
def clean_df(df, min_words=10, non_text_cols=('id', 'sys_lang', 'subid', 'URL')):
    """Drop rows whose text columns are *all* shorter than ``min_words`` words.

    Every column not listed in ``non_text_cols`` is treated as text: it is cast
    to ``str`` and split on whitespace to count words. A row is removed only
    when every one of its text columns has fewer than ``min_words`` words.
    The rows about to be removed are printed for inspection.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    min_words : int, default 10
        Minimum number of whitespace-separated words at least one text column
        must reach for the row to be kept.
    non_text_cols : iterable of str, default ('id', 'sys_lang', 'subid', 'URL')
        Columns that are ignored when counting words and left uncast.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows. Its text columns are ``str``;
        missing values therefore appear in their string form (e.g. ``'nan'``).
    """
    # NOTE(review): the default excludes 'URL' but not lowercase 'url', so a
    # lowercase 'url' column is cast to str and counted like any other text
    # column - confirm whether that is intended.
    excluded = set(non_text_cols)
    text_cols = [col for col in df.columns if col not in excluded]

    if not text_cols:
        # With zero text columns `.all(axis=1)` is vacuously True for every
        # row, which would delete the whole frame; there is nothing to judge.
        return df.copy()

    # Work on a copy so the caller's frame keeps its original values/dtypes.
    cleaned = df.copy()
    cleaned[text_cols] = cleaned[text_cols].astype(str)

    too_short = cleaned[text_cols].apply(
        lambda col: col.str.split().str.len() < min_words
    ).all(axis=1)

    print("Rows that will be deleted:")
    print(cleaned[too_short])

    # Filter with the boolean mask instead of dropping by label, so duplicate
    # index labels cannot take rows that did not match along with them.
    return cleaned[~too_short]

In [16]:
# run clean_df over every loaded table, rebinding each name to its cleaned frame
articles, life_situations, news, services = map(
    clean_df, (articles, life_situations, news, services)
)

Rows that will be deleted:
            id sys_lang                           projects  \
2892     81739       kk                               test   
2900    103237       qq                      almaty-almaly   
2906    113705       kk                             atyrau   
2908     27055       qq                                mti   
2921      7844       ru                              enbek   
...        ...      ...                                ...   
102834   17016       kk                        vko-oskemen   
102852  113731       kk                             atyrau   
102853   54996       qq                        vko-kurchum   
102862   38778       qq  kostanai-altynsarin-audany-akimat   
102878  124529       kk                  karaganda-nurinsk   

                                                    title  \
2892                                                 тест   
2900    Áleýmettik meditsinalyq saqtandyrý máseleleri ...   
2906                                         

In [17]:
# show the column index of each cleaned dataframe, one heading per table
print("Articles columns:", articles.columns, sep="\n")
print("Life situations columns:", life_situations.columns, sep="\n")
print("News columns:", news.columns, sep="\n")
print("Services columns:", services.columns, sep="\n")

Articles columns:
Index(['id', 'sys_lang', 'projects', 'title', 'content', 'url'], dtype='object')
Life situations columns:
Index(['id', 'sys_lang', 'intro', 'title_main', 'title_sub', 'subid',
       'instruction', 'URL'],
      dtype='object')
News columns:
Index(['id', 'sys_lang', 'projects', 'title', 'body', 'short_description',
       'url'],
      dtype='object')
Services columns:
Index(['id', 'sys_lang', 'additional_info', 'description', 'full_title',
       'short_title', 'title', 'result_description', 'url'],
      dtype='object')


In [18]:
import re

def select_translation(group, big_text_column):
    """Pick a single row (one translation) out of a group of translations.

    Preference order, based on the ``sys_lang`` column:

    1. English (``'en'``) - if several rows exist, the one whose
       ``big_text_column`` contains the most ASCII Latin letters wins
       (first such row on ties).
    2. Russian (``'ru'``) - first row.
    3. Kazakh (``'kk'``, with ``'kz'`` also accepted) - first row.
    4. Otherwise the first row of the group.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows that are translations of the same item; must contain
        ``sys_lang`` and ``big_text_column``.
    big_text_column : str
        Column whose text is used to rank multiple English rows.

    Returns
    -------
    pandas.Series
        The selected row.
    """
    languages = group['sys_lang']

    # Try to find English translations
    en_translations = group[languages == 'en']
    if not en_translations.empty:
        # Non-string cells (e.g. missing values) count as zero Latin letters
        # instead of making re.findall raise a TypeError.
        latin_counts = en_translations[big_text_column].apply(
            lambda text: len(re.findall('[a-zA-Z]', text)) if isinstance(text, str) else 0
        )
        # Select by position, not by label: with duplicated index labels
        # `.loc[label]` would return several rows instead of one.
        return en_translations.iloc[latin_counts.to_numpy().argmax()]

    # If no English translations, try to find Russian translations
    ru_translations = group[languages == 'ru']
    if not ru_translations.empty:
        return ru_translations.iloc[0]

    # If no Russian translations, try to find Kazakh translations.
    # The data tags Kazakh rows as 'kk'; 'kz' is still accepted so inputs
    # using that code keep working.
    kk_translations = group[languages.isin(['kk', 'kz'])]
    if not kk_translations.empty:
        return kk_translations.iloc[0]

    # If no English, Russian or Kazakh translations, select one of the available translations
    return group.iloc[0]

def select_translations(df, big_text_column, life_situations=False):
    """Reduce ``df`` to one translation per item using ``select_translation``.

    Rows are grouped by ``id`` (or by ``id`` and ``subid`` when
    ``life_situations`` is true) and ``select_translation`` picks one row
    from each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``id``, ``sys_lang`` and ``big_text_column``
        (plus ``subid`` when ``life_situations`` is true).
    big_text_column : str
        Column forwarded to ``select_translation`` for ranking English rows.
    life_situations : bool, default False
        Group by ``['id', 'subid']`` instead of ``'id'``.

    Returns
    -------
    pandas.DataFrame
        One row per group, in sorted group-key order, with the same columns
        as ``df`` and a fresh 0..n-1 index.
    """
    group_keys = ['id', 'subid'] if life_situations else 'id'

    # Iterate over the groups explicitly rather than using `groupby.apply`:
    # applying a function to the grouping columns is deprecated in recent
    # pandas, and without them the selected rows would lose `id`/`subid`.
    # Iteration always yields the complete sub-frame of each group.
    chosen_rows = [
        select_translation(group, big_text_column)
        for _, group in df.groupby(group_keys)
    ]

    # Passing `columns` keeps the original column layout even when no group
    # (and therefore no row) was produced.
    return pd.DataFrame(chosen_rows, columns=df.columns).reset_index(drop=True)

In [9]:
# one translation per article id; English candidates are ranked on 'content'
selected_articles = select_translations(df=articles, big_text_column='content')
selected_articles

Unnamed: 0,id,sys_lang,projects,title,content,url
0,1,en,mam,О защите детей от информации,2 июля 2018 года Главой государства подписан З...,https://www.gov.kz/memleket/entities/mam/press...
1,2,ru,mam,Положение о министерстве информации и комуникц...,Положение о министерстве Министерство информац...,https://www.gov.kz/memleket/entities/mam/press...
2,13,kk,mam,Қазақстан Республикасы Ақпарат және коммуникац...,Қазақстан Республикасы Ақпарат және коммуникац...,https://www.gov.kz/memleket/entities/mam/press...
3,17,ru,mam,Контакты Департамента Цифровизации,Департамент цифровизации 785 Директор Бажаева ...,https://www.gov.kz/memleket/entities/mam/press...
4,20,kk,mam,Халықаралық ынтымақтастық департаментінің байл...,Халықаралық ынтымақтастық департаменті 757 Дир...,https://www.gov.kz/memleket/entities/mam/press...
...,...,...,...,...,...,...
68128,136238,ru,anticorruption,ГРАФИК проведения собеседования,"№ Должность Фамилия, имя, отчество (при его на...",https://www.gov.kz/memleket/entities/anticorru...
68129,136239,qq,almobl-uigur,Dúniezhúzilik Banktiń ókili Himanshi Dzhein t...,"2023 zhylǵy 28 qyrkúiekte ""Mindetti zhinaqtaýs...",https://www.gov.kz/memleket/entities/almobl-ui...
68130,136241,qq,departament-kkbtu-turkestan,Qyzylshanyń aldyn alý. Vaktsinatsiia infektsii...,Qyzylsha aýrýy - kóbine balalar arasynda zhii ...,https://www.gov.kz/memleket/entities/departame...
68131,136243,ru,almobl-uigur,За 8 месяцев чистый инвестиционный доход вклад...,"Чистый инвестиционный доход, начисленный на сч...",https://www.gov.kz/memleket/entities/almobl-ui...


In [10]:
# one translation per news id; English candidates are ranked on 'body'
selected_news = select_translations(df=news, big_text_column='body')
selected_news

Unnamed: 0,id,sys_lang,projects,title,body,short_description,url
0,36,ru,mam,Телеграммы и письма по случаю национального пр...,Председатель Китайской Народной Республики Си ...,,https://www.gov.kz/memleket/entities/mam/press...
1,271,kk,mam,Astana Hub алғашқы акселераттау бағдарламасын ...,Astana Hub IT-стартаптары халықаралық технопар...,,https://www.gov.kz/memleket/entities/mam/press...
2,281,en,mam,Abayev attended the North Kazakhstan Akim’s re...,Minister of Information and Communications att...,,https://www.gov.kz/memleket/entities/mam/press...
3,285,en,mam,Hearing of reports on implementation of the Na...,"Headed by Dauren Abayev, Minister of Informati...",,https://www.gov.kz/memleket/entities/mam/press...
4,287,en,mam,The Journalists Contest: “European Union – Kaz...,To celebrate the 25th anniversary of diplomati...,,https://www.gov.kz/memleket/entities/mam/press...
...,...,...,...,...,...,...,...
90618,631457,ru,karaganda,«Взгляд сквозь время»: В Темиртауском музее пр...,В Темиртауском городском историко-краеведческо...,,https://www.gov.kz/memleket/entities/karaganda...
90619,631460,qq,almaty,Aldaǵy bes zhylda Almaty aglomeratsiiasynda ne...,"Qolzhetimdi turǵyn úi, zhańa áleýmettik nysand...",,https://www.gov.kz/memleket/entities/almaty/pr...
90620,631463,kk,almaty,Алматыдағы Құлжа трактіндегі жаңа жолайрығы ав...,Алматыда Бухтарминская көшесі мен Құлжа тракті...,,https://www.gov.kz/memleket/entities/almaty/pr...
90621,631485,qq,pavlodar-din,Q.K. Toqaev mektepke hidzhab kiip barý máseles...,Qasym-Zhomart Toqaev Respýblikalyq pedagogtar ...,,https://www.gov.kz/memleket/entities/pavlodar-...


In [19]:
# one translation per (id, subid) pair; English candidates are ranked on 'instruction'
selected_life_situations = select_translations(
    df=life_situations,
    big_text_column='instruction',
    life_situations=True,
)
selected_life_situations

Unnamed: 0,id,sys_lang,intro,title_main,title_sub,subid,instruction,URL
0,1,en,"During the preparation for childbirth, you sho...",You are expecting a child,How to register at a policlinic,1,You can register at a policlinic of your place...,https://beta2.egov.kz/situations/1/1?lang=en
1,1,en,"During the preparation for childbirth, you sho...",You are expecting a child,How to register for a doctor's appointment,2,After you have been registered at a policlinic...,https://beta2.egov.kz/situations/1/2?lang=en
2,1,en,"During the preparation for childbirth, you sho...",You are expecting a child,Zhúktilik kezinde dekret demalysyna shyǵý zhán...,3,Dekrettik demalys Zhumys isteitin áielderge bo...,https://beta2.egov.kz/situations/1/3?lang=en
3,1,en,"During the preparation for childbirth, you sho...",You are expecting a child,How to put a child in the queue of kindergarten,5,After the registration of the birth of a child...,https://beta2.egov.kz/situations/1/5?lang=en
4,1,en,"During the preparation for childbirth, you sho...",You are expecting a child,Tólemder men zhárdemaqylardy qalai zhasaýǵa bo...,6,"Sonymen, eń qiyn qadamdar artta qaldy. Endi si...",https://beta2.egov.kz/situations/1/6?lang=en
...,...,...,...,...,...,...,...,...
437,318,ru,При сокращении численности или штата работнико...,Что нужно знать работодателю при сокращении шт...,Соблюдение запретов при сокращении,818,"Для начала необходимо выяснить, попадает ли ра...",https://beta2.egov.kz/situations/318/818?lang=ru
438,318,ru,При сокращении численности или штата работнико...,Что нужно знать работодателю при сокращении шт...,Направление уведомлений,819,Необходимо уведомить самого работника не менее...,https://beta2.egov.kz/situations/318/819?lang=ru
439,318,ru,При сокращении численности или штата работнико...,Что нужно знать работодателю при сокращении шт...,Выплата компенсации в связи с потерей работы,820,"Если все вышеперечисленные условия соблюдены, ...",https://beta2.egov.kz/situations/318/820?lang=ru
440,319,ru,Знакомые или зарубежные организации предлагают...,Как не стать жертвой торговли людьми,Human trafficking forms,821,Human trafficking is not a myth. Monstrous typ...,https://beta2.egov.kz/situations/319/821?lang=ru


In [12]:
# one translation per service id; English candidates are ranked on 'result_description'
selected_services = select_translations(df=services, big_text_column='result_description')
selected_services

Unnamed: 0,id,sys_lang,additional_info,description,full_title,short_title,title,result_description,url
0,2999,en,Service standard,Who is civil officer? It is a citizen of the R...,Enrollment to candidate pool of administrative...,Enrollment to candidate pool of administrative...,Enrollment to candidate pool of administrative...,Extract from the order on enrollment to candid...,https://beta2.egov.kz/services/2999?lang=en
1,3000,en,Rules for providing state services,Dear citizens of the Republic of Kazakhstan! I...,Assignment of state social allowance for disab...,Assignment of state social allowance for disab...,Assignment of state social allowance for disab...,Notice of appointment (refusal in appointment)...,https://beta2.egov.kz/services/3000?lang=en
2,3002,en,Service standard,Information is being updated. What is pension ...,Issuance of information on receipt and flow of...,Issuance of abstract on pension contributions,Certificate of pension contributions,Obtaining a certificate on pension contributio...,https://beta2.egov.kz/services/3002?lang=en
3,3003,en,,Information is being updated. The service is a...,Assignment of state social allowance for disab...,Obtaining an information on the assignment of ...,Obtaining an information on the assignment of ...,Certificate with informational content on assi...,https://beta2.egov.kz/services/3003?lang=en
4,3004,en,Rules for providing State service,Information is being updated. The public servi...,Issuance of documents on advanced training and...,Issuance of documents on advanced training and...,Issuance of documents on advanced training and...,On completion of the qualification: certificat...,https://beta2.egov.kz/services/3004?lang=en
...,...,...,...,...,...,...,...,...,...
882,4553,ru,,,Выдача архивных справок и/или копий архивных д...,Выдача архивных справок и/или копий архивных д...,Выдача архивных справок и/или копий архивных д...,,https://beta2.egov.kz/services/4553?lang=ru
883,4561,ru,,,Корректировка ошибочных сведений объектов недв...,Корректировка ошибочных сведений объектов недв...,Корректировка ошибочных сведений объектов недв...,,https://beta2.egov.kz/services/4561?lang=ru
884,4562,ru,,,Выдача свидетельства на переоборудование автот...,Выдача свидетельства на переоборудование автот...,Выдача свидетельства на переоборудование автот...,,https://beta2.egov.kz/services/4562?lang=ru
885,4565,ru,,,Предоставление гарантий по кредитам в рамках Г...,Предоставление гарантий по кредитам в рамках Г...,Предоставление гарантий по кредитам в рамках Г...,,https://beta2.egov.kz/services/4565?lang=ru


In [20]:
# save the dataframes in data folder
# to_csv does not create missing directories, so make sure 'data' exists first
os.makedirs('data', exist_ok=True)
selected_articles.to_csv('data/articles.csv')
selected_news.to_csv('data/news.csv')
selected_life_situations.to_csv('data/life_situations.csv')
selected_services.to_csv('data/services.csv')