# Initialization

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seed reused by every train/test split in this notebook.
RANDOM_STATE = 0

# Hold-out sizes handed to train_test_split as `test_size`.
LANGUAGE_DETECTION_TEST_SIZE = 0.15  # float -> 15% of the rows
UKRAINIAN_SPAM_TEST_SIZE = 15_000  # the integer sizes below are absolute row counts
ENGLISH_SPAM_TEST_SIZE = 10_000
RUSSIAN_SPAM_TEST_SIZE = 200_000

# Language detection dataset

In [3]:
# Destination CSV files for the language-detection train/test splits.
train_language_detection_dataset_path = (
    "data/processed/train_language_detection_dataset.csv"
)
test_language_detection_dataset_path = (
    "data/processed/test_language_detection_dataset.csv"
)

## Reading

In [4]:
# Load the raw language-detection corpus, then preview its first five rows.
language_detection_df = pd.read_csv(
    filepath_or_buffer="data/raw/language_detection_dataset.csv"
)
language_detection_df.head(n=5)

Unnamed: 0,id,lan_code,sentence
0,243,rus,–û–¥–∏–Ω —Ä–∞–∑ –≤ –∂–∏–∑–Ω–∏ —è –¥–µ–ª–∞—é —Ö–æ—Ä–æ—à–µ–µ –¥–µ–ª–æ... –ò –æ–Ω–æ...
1,1276,eng,Let's try something.
2,1277,eng,I have to go to sleep.
3,1280,eng,Today is June 18th and it is Muiriel's birthday!
4,1282,eng,Muiriel is 20 now.


In [5]:
# Non-null value count of every remaining column, per language code.
language_detection_df.groupby(by="lan_code").count()

Unnamed: 0_level_0,id,sentence
lan_code,Unnamed: 1_level_1,Unnamed: 2_level_1
eng,1588752,1588752
rus,911848,911848
ukr,178588,178588


As we can see, the classes are strongly imbalanced, which we will need to address. We split the dataset into train and test sets now and deal with the class imbalance later, when we train and compare different models and approaches.

## Splitting

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Features and target are both kept as single-column DataFrames (double brackets).
X, y = (
    language_detection_df[["sentence"]],
    language_detection_df[["lan_code"]],
)

In [None]:
# Seeded, shuffled hold-out split; stratifying on y keeps the label
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=LANGUAGE_DETECTION_TEST_SIZE,
    stratify=y,
    shuffle=True,
    random_state=RANDOM_STATE,
)

## Saving

In [9]:
# Put the training sentences and their labels back side by side, then display the frame.
train_language_detection_df = pd.concat(objs=[X_train, y_train], axis="columns")
train_language_detection_df

Unnamed: 0,sentence,lan_code
913787,–û—Ç–∫—É–¥–∞ —Ç—ã –æ –Ω–∏—Ö –∑–Ω–∞–µ—à—å?,rus
226629,Tom persuaded his mother to lend him the car f...,eng
346322,Let's mop the floor.,eng
2124468,Tom leaves his dog in the house when he's at w...,eng
2575375,–û–Ω–∏ —Å –Ω–∏–º –ª—é–±–µ–∑–Ω—ã.,rus
...,...,...
2210091,I can't remember exactly who took what.,eng
1981062,"""–ú–æ–ª–∏—Ç–µ –±–æ–≥–∞, —á—Ç–æ–±—ã –æ–Ω –æ–± —ç—Ç–æ–º –Ω–µ —É–∑–Ω–∞–ª.""",rus
535985,–Ø –Ω–µ —Ö–æ—á—É –≤—ã—Ö–æ–¥–∏—Ç—å –∑–∞–º—É–∂ –∑–∞ –¢–æ–º–∞.,rus
1016846,"""–Ø —Ö–æ—á—É –∑–Ω–∞—Ç–∏, —â–æ —Ç—Ä–∞–ø–∏–ª–æ—Å—è –∑ –≤–∞—à–æ—é –º–∞—à–∏–Ω–æ—é.""",ukr


In [10]:
# Put the held-out sentences and their labels back side by side, then display the frame.
test_language_detection_df = pd.concat(objs=[X_test, y_test], axis="columns")
test_language_detection_df

Unnamed: 0,sentence,lan_code
245883,Tom couldn't believe how ugly Mary became as s...,eng
458443,I'm ready to go now.,eng
2409635,–ß—Ç–æ —É —Ç–µ–±—è –Ω–∞ —ç—Ç–æ–π –≥—Ä—è–¥–∫–µ?,rus
1998481,"""–û–Ω –≥–æ–≤–æ—Ä–∏—Ç, —á—Ç–æ —Ö–æ—á–µ—Ç –µ—Å—Ç—å.""",rus
351962,–Ø —Ä–µ—à–∏–ª —Ç—É–¥–∞ –ø–æ–π—Ç–∏.,rus
...,...,...
82015,It won't stop bleeding.,eng
375184,"""Let me talk to Tom alone, OK?""",eng
2357958,There is a new round of negotiations.,eng
112902,He gave the same answer as before.,eng


In [11]:
# Write the language-detection training split as a UTF-8 CSV without the index column.
# The path goes straight to pandas so that it manages the file handle itself:
# a text handle opened without newline="" can yield "\r\r\n" line endings on Windows.
train_language_detection_df.to_csv(
    train_language_detection_dataset_path, index=False, encoding="utf-8"
)

In [12]:
# Write the language-detection test split as a UTF-8 CSV without the index column.
# The path goes straight to pandas so that it manages the file handle itself:
# a text handle opened without newline="" can yield "\r\r\n" line endings on Windows.
test_language_detection_df.to_csv(
    test_language_detection_dataset_path, index=False, encoding="utf-8"
)

# Russian spam detection dataset

In [11]:
# Destination CSV files for the Russian spam-detection train/test splits.
train_russian_spam_detection_dataset_path = (
    "data/processed/train_russian_spam_detection_dataset.csv"
)
test_russian_spam_detection_dataset_path = (
    "data/processed/test_russian_spam_detection_dataset.csv"
)

## Reading

In [3]:
# Load the raw Russian spam corpus, then preview its first five rows.
russian_spam_detection_df = pd.read_csv(
    filepath_or_buffer="data/raw/russian_spam_detection_dataset.csv"
)
russian_spam_detection_df.head(n=5)

Unnamed: 0,message,spam
0,–ü—Ä–∏–≤–µ—Ç —á—Ç–æ—Ç–æ –ø–æ–¥–æ–±–Ω–æ–µ –¥–µ–ª–∞–ª –µ—Å–ª–∏ —á–µ—Ä–µ–∑ usb –∏—Å–ø...,0
1,"–Ø —É–∂–µ —Å–∞–º –æ—Ç–∫–ª—é—á–∏–ª, —á–µ—Ä–µ–∑ —Å—Ç–∞—Ä–ª–∞–π–Ω –º–∞—Å—Ç–µ—Ä",0
2,–°—Ç—Ä–∞–Ω—ã –µ—Å —Å–æ–≥–ª–∞—Å–æ–≤–∞–ª–∏ —Å–∞–Ω–∫—Ü–∏–∏ –ø—Ä–æ—Ç–∏–≤ –±–µ–ª–æ—Ä—É—Å—Å–∫...,0
3,"–ö–∏–ø—è—á—É, –Ω–µ –∫–∏–ø—è—á—É—Å—å —è –¥–æ–±—Ä–∞—è –ø–æ–∫–∞",0
4,"Avalonia for visual studio 2019,2017 –Ω–µ —É–¥–∞–µ—Ç—Å...",0


In [4]:
# Non-null message count per value of the spam label.
russian_spam_detection_df.groupby(by="spam").count()

Unnamed: 0_level_0,message
spam,Unnamed: 1_level_1
0,4164473
1,347040


As we can see, the classes are very strongly imbalanced, which we will need to address. We split the dataset into train and test sets now and downsample the ham (0) class later, before fitting the model on the training data.

## Splitting

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Features and target are both kept as single-column DataFrames (double brackets).
X, y = (
    russian_spam_detection_df[["message"]],
    russian_spam_detection_df[["spam"]],
)

In [7]:
# Seeded, shuffled hold-out split; stratifying on y keeps the spam/ham
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=RUSSIAN_SPAM_TEST_SIZE,
    stratify=y,
    shuffle=True,
    random_state=RANDOM_STATE,
)

## Saving

In [8]:
# Put the training messages and their labels back side by side, then display the frame.
train_russian_spam_detection_df = pd.concat(objs=[X_train, y_train], axis="columns")
train_russian_spam_detection_df

Unnamed: 0,message,spam
641692,–°–∞–º—ã–µ —Ç–æ—Ä–≥—É–µ–º—ã–µ –æ–ø—Ü–∏–æ–Ω–Ω—ã–µ –∫–æ–Ω—Ç—Ä–∞–∫—Ç—ã —Å–µ–≥–æ–¥–Ω—è aa...,0
1617410,"–ù–∞–ª–æ–≥ –Ω–∞ –ø—Ä–∏–±—ã–ª—å –æ—Ä–≥–∞–Ω–∏–∑–∞—Ü–∏–π —ç—Ç–æ –ø—Ä—è–º–æ–π –Ω–∞–ª–æ–≥,...",0
393443,–î–∞–≤–∞–π –¥–∏–º–æ–Ω –µ—â–µ 4 —Å—Ç—Ä–∏–º–∞ –ø–æ–º–Ω–∏—Ç—Å—è —Å–∞–º –∫–∞–∫ —Ç–æ —Å...,0
3196877,–í—ã –∂–µ –Ω–µ –¥—É–º–∞–µ—Ç–µ —á—Ç–æ –º–∞—Å—Å–∞ –ª—é–¥–µ–π —ç—Ç–æ –Ω–∞—à –ø–æ—Å–ª–µ...,0
67103,"–•–æ—Ç–∏—Ç–µ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ, —Ö–æ—Ç–∏—Ç–µ –∑–∞—Å—É–Ω—å—Ç–µ –∏—Ö –≤—Å–µ –≤ 1...",0
...,...,...
3938082,–•–∑ —Ç–æ–≥–¥–∞ –ø–æ—á–µ–º—É –Ω–µ —Ä–∞–±–æ—Ç–∞–µ—Ç,0
1911432,"–•–∏—Ç—Ä–æ, —Å–∫–∞–∂–∏",0
4262354,–°—Ç–∞–±–∏–ª—å–Ω–µ–µ —ç—Ç–æ –¥—É–º–∞—Ç—å –ø—Ä–æ –Ω–∞–∑–≤–∞–Ω–∏—è –¥–æ –∏—Å–ø–æ–ª—å–∑–æ...,0
4460475,–ò —É —Ç–µ–±—è –ø–æ–ø–∞–¥–∞–µ—Ç –Ω–∞ –∫–∞–∫–æ–π–Ω–∏–±—É–¥—å devdricard1 –∫...,0


In [9]:
# Put the held-out messages and their labels back side by side, then display the frame.
test_russian_spam_detection_df = pd.concat(objs=[X_test, y_test], axis="columns")
test_russian_spam_detection_df

Unnamed: 0,message,spam
1458199,–¢–∞–∫ –≤—Ä–æ–¥–µ —ç—Ç–æ –µ—â—ë –∫–∞—Ç–∞—Ç—å—Å—è –∏ –∫–∞—Ç–∞—Ç—å—Å—è,0
276276,–¢–∞–∫ –º–æ–∂–Ω–æ –≥–ª—É–±–æ–∫–æ –∫–æ–ø–∞—Ç—å –µ—â–µ —Ç–µ –∫—Ç–æ –ø—Ä–æ–∏–∑–≤–æ–¥–∏—Ç...,0
602641,"–ë–ª–∏–Ω, —á–µ–ª —Ñ–∞–∫—Ç–∏—Ç",0
1449594,–ö–∞–∫ —Å–∏–ª—å–Ω–æ —Ç—ã –ø–æ—Å—Ç–∞—Ä–µ–ª —É–∂–µ —Å—É–ø–µ—Ä —Å—Ç–∞—Ä,0
3936779,"–î–∞ —Ç–∞–∫, —á—Ç–æ —É–∂–µ –ø–æ—á—Ç–∏ –Ω–µ –æ—Å—Ç–∞–ª–æ—Å—å –ø—Ä–∏—á–∏–Ω —Ä–∞–∑–±–∏...",0
...,...,...
202946,–¢–∞–∫ –Ω–∞–ª–µ—Ç –º–æ–∂–Ω–æ –æ—Ç–¥–µ—Ñ—Ñ–∞—Ç—å –ø—Ä–æ—Å—Ç–æ –ø—Ä–æ—Ü–µ–Ω—Ç–æ–º –ø–æ–±...,0
3235431,–ë–∞–Ω—ã –∑–∞ –Ω–µ—Å–ø–æ—Ä—Ç–∏–≤–∫—É –±—É–¥—É –±–æ–π —Å–æ —Å—Ç—Ä–≤ –∏ –º–∞–Ω—Ç–∏–∫–æ—Ä–æ–π,0
27283,"–ï—Å–ª–∏ —É–∂ —Ç–∞–∫ –Ω–∞–¥–æ, –∫–æ–º–ø–∏–ª—è—Ç–æ—Ä –º–æ–∂–µ—Ç –≤ –æ–±—ä–µ–∫—Ç —Å—É...",0
1501603,–≠—Ç–æ –º–µ–Ω—é –Ω–∞—Å—Ç—Ä–∞–∏–≤–∞–µ—Ç—Å—è —á–µ—Ä–µ–∑ popuptheme —É —Ç—É–ª–±–∞—Ä–∞,0


In [12]:
# Write the Russian spam training split as a UTF-8 CSV without the index column.
# The path goes straight to pandas so that it manages the file handle itself:
# a text handle opened without newline="" can yield "\r\r\n" line endings on Windows.
train_russian_spam_detection_df.to_csv(
    train_russian_spam_detection_dataset_path, index=False, encoding="utf-8"
)

In [13]:
# Write the Russian spam test split as a UTF-8 CSV without the index column.
# The path goes straight to pandas so that it manages the file handle itself:
# a text handle opened without newline="" can yield "\r\r\n" line endings on Windows.
test_russian_spam_detection_df.to_csv(
    test_russian_spam_detection_dataset_path, index=False, encoding="utf-8"
)

# English spam detection dataset

In [14]:
# Destination CSV files for the English spam-detection train/test splits.
train_english_spam_detection_dataset_path = (
    "data/processed/train_english_spam_detection_dataset.csv"
)
test_english_spam_detection_dataset_path = (
    "data/processed/test_english_spam_detection_dataset.csv"
)

## Reading

In [15]:
# Load the raw English spam corpus, then preview its first five rows.
english_spam_detection_df = pd.read_csv(
    filepath_or_buffer="data/raw/english_spam_detection_dataset.csv"
)
english_spam_detection_df.head(n=5)

Unnamed: 0,spam,message
0,0,"Funny fact Nobody teaches volcanoes 2 erupt, t..."
1,0,I sent my scores to sophas and i had to do sec...
2,1,We know someone who you know that fancies you....
3,0,Only if you promise your getting out as SOON a...
4,1,Congratulations ur awarded either ÔøΩ500 of CD g...


In [16]:
# Non-null message count per value of the spam label.
english_spam_detection_df.groupby(by="spam").count()

Unnamed: 0_level_0,message
spam,Unnamed: 1_level_1
0,34336
1,20627


As we can see, the class imbalance is only slight, but the number of samples per class is small. We may consider augmenting the training set of this language's dataset later.

## Splitting

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Features and target are both kept as single-column DataFrames (double brackets).
X, y = (
    english_spam_detection_df[["message"]],
    english_spam_detection_df[["spam"]],
)

In [19]:
# Seeded, shuffled hold-out split; stratifying on y keeps the spam/ham
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=ENGLISH_SPAM_TEST_SIZE,
    stratify=y,
    shuffle=True,
    random_state=RANDOM_STATE,
)

## Saving

In [20]:
# Put the training messages and their labels back side by side, then display the frame.
train_english_spam_detection_df = pd.concat(objs=[X_train, y_train], axis="columns")
train_english_spam_detection_df

Unnamed: 0,message,spam
3323,Had your contract mobile 11 Mnths? Latest Moto...,1
34293,the lottery promotion company limited churchil...,1
49557,would you re - flnance if you knew you ' d sav...,1
53179,"please see attached , clean and redline versio...",0
19200,ilug wilson kamela attn sir madan strictly con...,1
...,...,...
23724,please find attached the above article from th...,0
13944,thanx 4 e brownie its v nice,0
41381,"hi daren , the attached spreadsheet could be u...",0
12333,expand your penis 20 larger in weeks add 3 inc...,1


In [21]:
# Put the held-out messages and their labels back side by side, then display the frame.
test_english_spam_detection_df = pd.concat(objs=[X_test, y_test], axis="columns")
test_english_spam_detection_df

Unnamed: 0,message,spam
25345,read about enron ' s entry into the data stora...,0
6978,crypto forex trade has changed my life i inves...,1
6363,üáπ‚Äåüá∑‚Äåüá∫‚Äåüá∏‚Äåüáπ‚Äåüá™‚Äåüá©‚Äå ‚Üò‚Üò‚Üò‚Üò‚Üò‚Üò‚Üò‚Üò üìóüìóüìóüìóüìóüìóüìóüìó üáµ‚Äåüá±‚Äåüá¶‚Äåüáπ‚Äåüá´‚Äåüá¥‚Äåüá∑...,1
46568,üöÄthats 50 profit fir dock holders as wellüòç ü¶æit...,1
22043,"dale , please , call me on tuesday . my mornin...",0
...,...,...
3981,Eh u send wrongly lar...,0
12200,k wat s tht incident,0
45654,would you like a 250 gas card don t let the cu...,1
30943,hello all : norberto telephoned me this mornin...,0


In [22]:
# Write the English spam training split as a UTF-8 CSV without the index column.
# The path goes straight to pandas so that it manages the file handle itself:
# a text handle opened without newline="" can yield "\r\r\n" line endings on Windows.
train_english_spam_detection_df.to_csv(
    train_english_spam_detection_dataset_path, index=False, encoding="utf-8"
)

In [23]:
# Write the English spam test split as a UTF-8 CSV without the index column.
# The path goes straight to pandas so that it manages the file handle itself:
# a text handle opened without newline="" can yield "\r\r\n" line endings on Windows.
test_english_spam_detection_df.to_csv(
    test_english_spam_detection_dataset_path, index=False, encoding="utf-8"
)

# Ukrainian spam detection dataset

In [31]:
# Destination CSV files for the Ukrainian spam-detection train/test splits.
train_ukrainian_spam_detection_dataset_path = (
    "data/processed/train_ukrainian_spam_detection_dataset.csv"
)
test_ukrainian_spam_detection_dataset_path = (
    "data/processed/test_ukrainian_spam_detection_dataset.csv"
)

## Reading

In [32]:
# Load the raw Ukrainian spam corpus, then preview its first five rows.
ukrainian_spam_detection_df = pd.read_csv(
    filepath_or_buffer="data/raw/ukrainian_spam_detection_dataset.csv"
)
ukrainian_spam_detection_df.head(n=5)

Unnamed: 0,message,spam
0,"__USER__ –¶–µ –ª–µ–∫–∞, —ñ –≤–æ–Ω–∞ –¥—É–∂–µ –∫—Ä—É—Ç–∞, –≤–æ–Ω–∞ –Ω–∞—à–∞...",0
1,"–£ –¥—Ä—É–≥–æ–º—É —Ç—É—Ä—ñ –ø—ñ–¥—Ç—Ä–∏–º–∞–≤ –í—ñ–∫—Ç–æ—Ä–∞ –Æ—â–µ–Ω–∫–∞, –ø–æ—Å—Ç–∞...",0
2,ü™ñ –¢–≤–æ—è —ñ–¥–µ–∞–ª—å–Ω–∞ –ø—ñ–¥—Ä–æ–±—ñ—Ç–∫–∞? –ó–Ω–∞–π–¥–µ–Ω–∞. –ë–µ–∑ –±–æ—Å—ñ...,1
3,üîπ –í–ê–ö–ê–ù–°–Ü–Ø | –ê–î–ú–Ü–ù–Ü–°–¢–†–ê–¢–û–† —É —Ü–µ–Ω—Ç—Ä—ñ –ø—Ä–æ—Ç–µ–∑—É–≤–∞–Ω...,1
4,__USER__ –í—ñ–¥–º—ñ—Ç—å –º–µ–Ω–µ –∫–æ–ª–∏ –∑–º–æ–∂–µ—à,0


In [33]:
# Non-null message count per value of the spam label.
ukrainian_spam_detection_df.groupby(by="spam").count()

Unnamed: 0_level_0,message
spam,Unnamed: 1_level_1
0,65165
1,26221


As we can see, there is a noticeable class imbalance, which we will need to address. We may also augment the spam (1) class in the training set of this language's dataset later.

## Splitting

In [34]:
from sklearn.model_selection import train_test_split

In [36]:
# Features and target are both kept as single-column DataFrames (double brackets).
X, y = (
    ukrainian_spam_detection_df[["message"]],
    ukrainian_spam_detection_df[["spam"]],
)

In [37]:
# Seeded, shuffled hold-out split; stratifying on y keeps the spam/ham
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=UKRAINIAN_SPAM_TEST_SIZE,
    stratify=y,
    shuffle=True,
    random_state=RANDOM_STATE,
)

## Saving

In [38]:
# Put the training messages and their labels back side by side, then display the frame.
train_ukrainian_spam_detection_df = pd.concat(objs=[X_train, y_train], axis="columns")
train_ukrainian_spam_detection_df

Unnamed: 0,message,spam
68463,__USER__ –±–æ–∂–µ —è —Ç–∞–∫ –æ–±–æ–∂–Ω—é—é —Ä–∞—Å,0
86119,1689 —Ä–æ–∫—É –ú–æ—Å–∫–æ–≤—ñ—è —ñ –†—ñ—á –ü–æ—Å–ø–æ–ª–∏—Ç–∞ –æ—Å—Ç–∞—Ç–æ—á–Ω–æ —Ä...,0
36865,"–ù–µ –º–æ–∂—É –ø–æ–ø–æ–≤–Ω–∏—Ç–∏ –∫–∞—Ä—Ç–∫—É –ü—Ä–∏–≤–∞—Ç, —Ç–æ–º—É —â–æ —è –∑ –ö...",0
50567,–í –Ω–∞—è–≤–Ω–æ—Å—Ç—ñ —Å–∞–¥–∂–∞–Ω—Ü—ñ –µ–≤–∫–∞–ª—ñ–ø—Ç–∞ üçÄ 7 —Å–æ—Ä—Ç—ñ–≤ –í–∏—Ä–æ...,1
58054,–≤–Ω—ñ–∑–∞–ø–Ω–æ –ø–æ–±–∞—á–∏–ª–∞ —á–æ–ª–æ–≤—ñ–∫–∞ —Ç–æ–ø–ª—î—Å.—Ç–æ—ó—Å—Ç—å –≤ —Ä—É—à...,0
...,...,...
61288,–°—Ça–Ω–¥a—Ä—Ç–Ω—ñ po–∑—Å–∏–ª–∫–∏ –Ω–µ –ø—Ä–∞—Ü—é—é—Ç—å? –üo–≤—ñ–¥o–º–ª–µ–Ω–Ω—è ...,1
44809,üí∏ | –ó–∞ –∫–æ–∂–Ω–æ–≥–æ —Ä–µ—Ñ–µ—Ä–∞–ª–∞ –≤–∏ –±—É–¥–µ—Ç–µ –æ—Ç—Ä–∏–º—É–≤–∞—Ç–∏ 4...,1
19981,–ë–∞–≥—Ä—è–Ω–∏–π –Ü–≤–∞–Ω // ‚Äî –°. 65‚Äî66. –®–µ–≤—á–µ–Ω–∫–æ –õ,0
16700,"__USER__ –û–π –±–ª—ñ–Ω–∞, —Å–æ—Ä—è–Ω, —Ö—Ç–æ—Å—å —Ç—É—Ç –∫–æ–ª–∏—Å—å –∫–∏–¥...",0


In [39]:
# Put the held-out messages and their labels back side by side, then display the frame.
test_ukrainian_spam_detection_df = pd.concat(objs=[X_test, y_test], axis="columns")
test_ukrainian_spam_detection_df

Unnamed: 0,message,spam
69719,"–Ø: –º–∞–º, –∫—É–ø–∏ –º–µ–Ω—ñ ""–∫—Ä—è-–∫—Ä—è"" –ü–∞–ø–∞: —Ö–æ—Ä–æ—à–∞ –ø—ñ—Å–Ω—è!",0
29703,__USER__ –Ø–∫ –¥–æ–±—Ä–µ —à–æ –≤ –Ω–∞—Å —ñ—Å—Ç–æ—Ä—ñ—è –≤–∂–µ –≤—Å–µ,0
3614,–Ø –≤–∞—Å –Ω–µ–Ω–∞–≤–∏–¥–∂—É —Ö—É–π–æ–≤–µ—Ä—Å–∏ üò°üò°üò°üò°üí•üí•üí•üí•,0
3287,"–¢–µ—Ä–º—ñ–Ω–æ–≤–æ —Ö—Ç–æ —Ö–æ—á–µ –∑–∞—Ä–æ–±–∏—Ç–∏ 2000 –≥—Ä–Ω 100,150, ...",1
8008,—Ü—ñ–∫–∞–≤–æ –∞ —Ç—É—Ç —î —á–µ–ª—ñ–∫–∏ —è–∫—ñ –≤ –¥4–¥–∂ –≥—Ä–∞—é—Ç—å,0
...,...,...
9613,__USER__ —Ö–æ—Ç—ñ–ª–æ—Å—è –± –æ–¥–∏–Ω —Ä–∞–∑ –∑—ñ–±—Ä–∞—Ç–∏—Å—è –≤ –¥—ñ—Å–∫–æ...,0
30689,"üå∂ –®–≤–∞—á–∫–∞-–æ–ø–µ—Ä–∞—Ü—ñ–æ–Ωi—Å—Ç–∫–∞ ¬©Ô∏è –î–Ω—ñ–ø—Ä–æ –ú–∞–Ω—É—Ñ–∞–∫—Ç—É—Ä–∞,...",1
71212,"–ó–µ–º–ª—ñ –π –µ—Ç–Ω—ñ—á–Ω—ñ –∫–Ω—è–∑—ñ–≤—Å—Ç–≤–∞, –¥–µ —Ä–∞–Ω—ñ—à–µ –ø—Ä–∞–≤–∏–ª–∏ ...",0
30876,"–ü–æ–¥—ñ—ó –ë–∞–¥–µ–Ω—ñ–≤—Å—å–∫—ñ –≤–∏–±–æ—Ä–∏ –í –ß–µ—Ö—ñ—ó, –Ω–∞ –±–∞–∑—ñ –ª—ñ–≤–æ...",0


In [40]:
# Write the Ukrainian spam training split as a UTF-8 CSV without the index column.
# The path goes straight to pandas so that it manages the file handle itself:
# a text handle opened without newline="" can yield "\r\r\n" line endings on Windows.
train_ukrainian_spam_detection_df.to_csv(
    train_ukrainian_spam_detection_dataset_path, index=False, encoding="utf-8"
)

In [41]:
# Write the Ukrainian spam test split as a UTF-8 CSV without the index column.
# The path goes straight to pandas so that it manages the file handle itself:
# a text handle opened without newline="" can yield "\r\r\n" line endings on Windows.
test_ukrainian_spam_detection_df.to_csv(
    test_ukrainian_spam_detection_dataset_path, index=False, encoding="utf-8"
)