# Initialization

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Seed handed to train_test_split so that the train/test split is reproducible.
RANDOM_STATE = 0
TEST_SIZE = 0.15 # 15% of the rows are held out as the test set

# Language detection dataset

In [3]:
# Destination CSV files for the train and test splits written in the "Saving" section.
train_language_detection_dataset_path = "data/processed/train_language_detection_dataset.csv"
test_language_detection_dataset_path = "data/processed/test_language_detection_dataset.csv"

## Reading

In [4]:
# Load the raw sentence/language table and preview its first rows.
raw_language_detection_dataset_path = "data/raw/language_detection_dataset.csv"
language_detection_df = pd.read_csv(raw_language_detection_dataset_path)
language_detection_df.head()

Unnamed: 0,id,lan_code,sentence
0,243,rus,Один раз в жизни я делаю хорошее дело... И оно...
1,1276,eng,Let's try something.
2,1277,eng,I have to go to sleep.
3,1280,eng,Today is June 18th and it is Muiriel's birthday!
4,1282,eng,Muiriel is 20 now.


In [5]:
# Non-null value counts of every remaining column, grouped by language code.
rows_per_language = language_detection_df.groupby("lan_code").count()
rows_per_language

Unnamed: 0_level_0,id,sentence
lan_code,Unnamed: 1_level_1,Unnamed: 2_level_1
eng,1588752,1588752
rus,911848,911848
ukr,178588,178588


As we can see, the classes are strongly imbalanced, which we will need to address. For now we only split the dataset into train and test sets; we'll deal with the class imbalance later, when we compare different models and approaches.

## Splitting

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Keep features and target as single-column DataFrames (hence the list selectors).
feature_columns = ["sentence"]
target_columns = ["lan_code"]
X = language_detection_df[feature_columns]
y = language_detection_df[target_columns]

In [8]:
# Shuffled, seeded split that is stratified on the language label, so both
# parts keep the same language proportions as the full dataset.
split_options = {
    "test_size": TEST_SIZE,
    "shuffle": True,
    "random_state": RANDOM_STATE,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Saving

In [9]:
# Put the training sentences and their labels back side by side in one frame.
train_parts = [X_train, y_train]
train_language_detection_df = pd.concat(train_parts, axis="columns")
train_language_detection_df

Unnamed: 0,sentence,lan_code
913787,Откуда ты о них знаешь?,rus
226629,Tom persuaded his mother to lend him the car f...,eng
346322,Let's mop the floor.,eng
2124468,Tom leaves his dog in the house when he's at w...,eng
2575375,Они с ним любезны.,rus
...,...,...
2210091,I can't remember exactly who took what.,eng
1981062,"""Молите бога, чтобы он об этом не узнал.""",rus
535985,Я не хочу выходить замуж за Тома.,rus
1016846,"""Я хочу знати, що трапилося з вашою машиною.""",ukr


In [10]:
# Put the held-out sentences and their labels back side by side in one frame.
test_parts = [X_test, y_test]
test_language_detection_df = pd.concat(test_parts, axis="columns")
test_language_detection_df

Unnamed: 0,sentence,lan_code
245883,Tom couldn't believe how ugly Mary became as s...,eng
458443,I'm ready to go now.,eng
2409635,Что у тебя на этой грядке?,rus
1998481,"""Он говорит, что хочет есть.""",rus
351962,Я решил туда пойти.,rus
...,...,...
82015,It won't stop bleeding.,eng
375184,"""Let me talk to Tom alone, OK?""",eng
2357958,There is a new round of negotiations.,eng
112902,He gave the same answer as before.,eng


In [11]:
# Create the output folder if it is missing, so the cell also runs on a fresh checkout.
Path(train_language_detection_dataset_path).parent.mkdir(parents=True, exist_ok=True)
# Pass the path to pandas rather than a text-mode handle opened with "+w".
# The pandas docs require file objects to be opened with newline=""; without
# it, rows can end in "\r\r\n" on Windows. Read access was never needed either.
# The file is still written as UTF-8 and without the index column.
train_language_detection_df.to_csv(
    train_language_detection_dataset_path, index=False, encoding="utf-8"
)

In [12]:
# Create the output folder if it is missing, so the cell also runs on a fresh checkout.
Path(test_language_detection_dataset_path).parent.mkdir(parents=True, exist_ok=True)
# Pass the path to pandas rather than a text-mode handle opened with "+w".
# The pandas docs require file objects to be opened with newline=""; without
# it, rows can end in "\r\r\n" on Windows. Read access was never needed either.
# The file is still written as UTF-8 and without the index column.
test_language_detection_df.to_csv(
    test_language_detection_dataset_path, index=False, encoding="utf-8"
)