In [1]:
import numpy as np
import pandas as pd 
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
class UkrainianStemmer():
    def __init__(self, word):
        self.word = word
        self.vowel = r'–∞–µ–∏–æ—É—é—è—ñ—ó—î'  # http://uk.wikipedia.org/wiki/–ì–æ–ª–æ—Å–Ω–∏–π_–∑–≤—É–∫
        self.perfectiveground = r'(–∏–≤|–∏–≤—à–∏|–∏–≤—à–∏—Å—å|—ã–≤|—ã–≤—à–∏|—ã–≤—à–∏—Å—å((?<=[–∞—è])(–≤|–≤—à–∏|–≤—à–∏—Å—å)))$'
        # http://uk.wikipedia.org/wiki/–†–µ—Ñ–ª–µ–∫—Å–∏–≤–Ω–µ_–¥—ñ—î—Å–ª–æ–≤–æ
        self.reflexive = r'(—Å[—è—å–∏])$'
        # http://uk.wikipedia.org/wiki/–ü—Ä–∏–∫–º–µ—Ç–Ω–∏–∫ + http://wapedia.mobi/uk/–ü—Ä–∏–∫–º–µ—Ç–Ω–∏–∫
        self.adjective = r'(–∏–º–∏|—ñ–π|–∏–π|–∞|–µ|–æ–≤–∞|–æ–≤–µ|—ñ–≤|—î|—ó–π|—î—î|–µ—î|—è|—ñ–º|–µ–º|–∏–º|—ñ–º|–∏—Ö|—ñ—Ö|–æ—é|–π–º–∏|—ñ–º–∏|—É|—é|–æ–≥–æ|–æ–º—É|–æ—ó)$'
        # http://uk.wikipedia.org/wiki/–î—ñ—î–ø—Ä–∏–∫–º–µ—Ç–Ω–∏–∫
        self.participle = r'(–∏–π|–æ–≥–æ|–æ–º—É|–∏–º|—ñ–º|–∞|—ñ–π|—É|–æ—é|—ñ–π|—ñ|–∏—Ö|–π–º–∏|–∏—Ö)$'
        # http://uk.wikipedia.org/wiki/–î—ñ—î—Å–ª–æ–≤–æ
        self.verb = r'(—Å—å|—Å—è|–∏–≤|–∞—Ç—å|—è—Ç—å|—É|—é|–∞–≤|–∞–ª–∏|—É—á–∏|—è—á–∏|–≤—à–∏|—à–∏|–µ|–º–µ|–∞—Ç–∏|—è—Ç–∏|—î)$'
        # http://uk.wikipedia.org/wiki/–Ü–º–µ–Ω–Ω–∏–∫
        self.noun = r'(–∞|–µ–≤|–æ–≤|–µ|—è–º–∏|–∞–º–∏|–µ–∏|–∏|–µ–π|–æ–π|–∏–π|–π|–∏—è–º|—è–º|–∏–µ–º|–µ–º|–∞–º|–æ–º|–æ|—É|–∞—Ö|–∏—è—Ö|—è—Ö|—ã|—å|–∏—é|—å—é|—é|–∏—è|—å—è|—è|—ñ|–æ–≤—ñ|—ó|–µ—é|—î—é|–æ—é|—î|–µ–≤—ñ|–µ–º|—î–º|—ñ–≤|—ó–≤|—é)$'
        self.rvre = r'[–∞–µ–∏–æ—É—é—è—ñ—ó—î]'
        self.derivational = r'[^–∞–µ–∏–æ—É—é—è—ñ—ó—î][–∞–µ–∏–æ—É—é—è—ñ—ó—î]+[^–∞–µ–∏–æ—É—é—è—ñ—ó—î]+[–∞–µ–∏–æ—É—é—è—ñ—ó—î].*(?<=–æ)—Å—Ç—å?$'
        self.RV = ''

    def ukstemmer_search_preprocess(self, word):
        word = word.lower()
        word = word.replace("'", "")
        word = word.replace("—ë", "–µ")
        word = word.replace("—ä", "—ó")
        return word

    def s(self, st, reg, to):
        orig = st
        self.RV = re.sub(reg, to, st)
        return (orig != self.RV)

    def stem_word(self):
        word = self.ukstemmer_search_preprocess(self.word)
        if not re.search('[–∞–µ–∏–æ—É—é—è—ñ—ó—î]', word):
            stem = word
        else:
            p = re.search(self.rvre, word)
            start = word[0:p.span()[1]]
            self.RV = word[p.span()[1]:]

            # Step 1
            if not self.s(self.RV, self.perfectiveground, ''):

                self.s(self.RV, self.reflexive, '')
                if self.s(self.RV, self.adjective, ''):
                    self.s(self.RV, self.participle, '')
                else:
                    if not self.s(self.RV, self.verb, ''):
                        self.s(self.RV, self.noun, '')
            # Step 2
            self.s(self.RV, '–∏$', '')

            # Step 3
            if re.search(self.derivational, self.RV):
                self.s(self.RV, '–æ—Å—Ç—å$', '')

            # Step 4
            if self.s(self.RV, '—å$', ''):
                self.s(self.RV, '–µ–π—à–µ?$', '')
                self.s(self.RV, '–Ω–Ω$', u'–Ω')

            stem = start + self.RV
        return stem

In [3]:
df = pd.read_csv('../db.csv', index_col=['id'])
df['parsed_text'] = df.parsed_text.str.lower()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2179 entries, 1147 to 5187
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   parsed_text    2179 non-null   object
 1   propaganda     2179 non-null   object
 2   language_code  2179 non-null   object
dtypes: object(3)
memory usage: 68.1+ KB


In [4]:
df.iloc[[80, 266]] = None
df.dropna(inplace=True)

In [5]:
temp = list(map(word_tokenize, df.parsed_text.values))
df['parsed_text'] = temp
df

Unnamed: 0_level_0,parsed_text,propaganda,language_code
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1147,"[‚Äã‚Äã–∫–∏—Ç–∞–π—Å—å–∫–∞, –∂—ñ–Ω–æ—á–∫–∞-–∫—ñ—à–∫–∞, –ø—Ä–æ–¥–æ–≤–∂—É—î, –Ω–∞–º–∞–≥–∞...",f,ua
2,"[1991, :, —Ä–æ—Å—Å–∏—èüá∏üá∞üëç, ‚Äî, –¥–∞–≤–∞–π—Ç–µ, –Ω–µ, –±—É–¥–µ–º, —Å—Å...",t,rus
4,"[üö®–º—ã—Å–ª–∏, –≤—Å–ª—É—Ö, –∫–∞–∂–¥–æ–≥–æ, —Ä–æ—Å–∏—Å—è–Ω–∏–Ω–∞üçæüá∏üáÆ, —Å–µ–π—á–∞—Å...",t,rus
5,"[–∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π, –ø–µ—Å–∫–æ–≤–∞, –ø–æ, –ø–æ–≤–æ–¥—É, –¥–∏–∞–ª–æ–≥–∞, —ç—Å...",t,rus
6,"[—Ä–µ–∞–∫—Ü–∏–∏, –º—ã, —Ä–µ—à–∏–ª–∏, –≤–∫–ª—é—á–∏—Ç—å, ,, —Å—Ç–∞–≤–∏–º, –ª–∞–π...",t,rus
...,...,...,...
5182,"[#, #, –∞–Ω–∞–ªi—Ç–∏–∫–∞, –ø–æ–∫–∏, –ø—Ä–æ—Å—Ç—ñ, —É–∫—Ä–∞—ó–Ω—Ü—ñ, —Ä–∞–¥—ñ...",t,ua
5183,"[#, #, —á—É—Ç–∫–∏, –æ–ø—ñ—Å–ª—è, —Ç–æ–≥–æ, ,, —â–æ, —Å—Ç–∞–ª–æ—Å—è, –Ω–∞...",t,ua
5185,"[‚ö°Ô∏è‚ö°Ô∏è‚ö°Ô∏è, #, #, —ñ–Ω—Å–∞–π–¥, –Ω–∞—à–µ, –¥–∂–µ—Ä–µ–ª–æ, –¥–æ–ø–æ–≤—ñ–¥–∞...",t,ua
5186,"[#, #, —á—É—Ç–∫–∏, –∑–≥—ñ–¥–Ω–æ, –Ω–∞—à–∏—Ö, –¥–∂–µ—Ä–µ–ª, –≤, –æ–ø, ,,...",t,ua


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2177 entries, 1147 to 5187
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   parsed_text    2177 non-null   object
 1   propaganda     2177 non-null   object
 2   language_code  2177 non-null   object
dtypes: object(3)
memory usage: 68.0+ KB


In [7]:
def stemming(parsed_text):
  stem_parsed_text = []
  for word in parsed_text:
    ukrainian_stemmer = UkrainianStemmer(word)
    stem_parsed_text.append(ukrainian_stemmer.stem_word())

  return ' '.join(stem_parsed_text)

In [8]:
print(df.parsed_text)
df['parsed_text'] = df['parsed_text'].apply(lambda x: stemming(x))
print(df.parsed_text)

id
1147    [‚Äã‚Äã–∫–∏—Ç–∞–π—Å—å–∫–∞, –∂—ñ–Ω–æ—á–∫–∞-–∫—ñ—à–∫–∞, –ø—Ä–æ–¥–æ–≤–∂—É—î, –Ω–∞–º–∞–≥–∞...
2       [1991, :, —Ä–æ—Å—Å–∏—èüá∏üá∞üëç, ‚Äî, –¥–∞–≤–∞–π—Ç–µ, –Ω–µ, –±—É–¥–µ–º, —Å—Å...
4       [üö®–º—ã—Å–ª–∏, –≤—Å–ª—É—Ö, –∫–∞–∂–¥–æ–≥–æ, —Ä–æ—Å–∏—Å—è–Ω–∏–Ω–∞üçæüá∏üáÆ, —Å–µ–π—á–∞—Å...
5       [–∫–æ–º–º–µ–Ω—Ç–∞—Ä–∏–π, –ø–µ—Å–∫–æ–≤–∞, –ø–æ, –ø–æ–≤–æ–¥—É, –¥–∏–∞–ª–æ–≥–∞, —ç—Å...
6       [—Ä–µ–∞–∫—Ü–∏–∏, –º—ã, —Ä–µ—à–∏–ª–∏, –≤–∫–ª—é—á–∏—Ç—å, ,, —Å—Ç–∞–≤–∏–º, –ª–∞–π...
                              ...                        
5182    [#, #, –∞–Ω–∞–ªi—Ç–∏–∫–∞, –ø–æ–∫–∏, –ø—Ä–æ—Å—Ç—ñ, —É–∫—Ä–∞—ó–Ω—Ü—ñ, —Ä–∞–¥—ñ...
5183    [#, #, —á—É—Ç–∫–∏, –æ–ø—ñ—Å–ª—è, —Ç–æ–≥–æ, ,, —â–æ, —Å—Ç–∞–ª–æ—Å—è, –Ω–∞...
5185    [‚ö°Ô∏è‚ö°Ô∏è‚ö°Ô∏è, #, #, —ñ–Ω—Å–∞–π–¥, –Ω–∞—à–µ, –¥–∂–µ—Ä–µ–ª–æ, –¥–æ–ø–æ–≤—ñ–¥–∞...
5186    [#, #, —á—É—Ç–∫–∏, –∑–≥—ñ–¥–Ω–æ, –Ω–∞—à–∏—Ö, –¥–∂–µ—Ä–µ–ª, –≤, –æ–ø, ,,...
5187    [#, #, —ñ–Ω—Å–∞–π–¥, –Ω–∞—à, –µ–∫—Å–ø–µ—Ä—Ç, –∫—ñ–≤–∞, –Ω–µ, –ø—Ä–æ—Å—Ç–æ,...
Name: parsed_text, Length: 2

In [9]:
my_vectorizer = CountVectorizer()
# print(df['parsed_text'])
# my_vectorizer.fit(df['parsed_text'])

In [10]:
df['parsed_text'] = my_vectorizer.fit_transform(df['parsed_text']).toarray()
x, y = df['parsed_text'].values.reshape(-1, 1), df['propaganda'].apply(lambda y: 0 if y=='f' else 1).values.astype(int)
print(df['propaganda'].apply(lambda y: 0 if y=='f' else 1).describe())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
print(x_train.shape, y_train.shape)

count    2177.000000
mean        0.369775
std         0.482855
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: propaganda, dtype: float64
(1523, 1) (1523,)


In [11]:
# df['new_col'] = df['parsed_text'].apply(lambda x: x.shape[1])
# print(df['new_col'].describe())

In [12]:
model = LogisticRegression()
model.fit(x_train, y_train)
x, y

(array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]]), array([0, 1, 1, ..., 1, 1, 1]))

In [13]:
model.score(x_test, y_test)

0.6422018348623854