In [3]:
import numpy as np, pandas as pd
from tqdm import tqdm_notebook
import json

from pathlib import Path
import os
import lightgbm as lgbm
import csv
import operator

## Prepare the data

In [4]:
def load_x_data(file):

    with open(f'{file}', 'r', encoding='utf-8') as f:
        input_lines = f.read().split('\n')

    data = {
        "title": [],
        "description": [],
        "recent_posts": [],
    }

    for line in input_lines:
        try:
            line = json.loads(line)
        except:
            pass

        if line != '':
            try:
                data['title'].append(line['title'])
                data['description'].append(line['description'])
                data['recent_posts'].append('\n'.join(line['recent_posts']))
            except:
                print('Parse error')

    data = pd.DataFrame(data)
    return data

def load_y_data(file):

    with open(f'{file}', 'r', encoding='utf-8') as f:
        input_lines = f.read().split('\n')

    data = {
        #'lang_code': [],
        'label': []  
    }

    for line in input_lines:
        try:
            line = json.loads(line)
        except:
            pass

        if line != '':
            try:
                #data['lang_code'].append(line['lang_code'])
                data['label'].append('<SEP>'.join(list(line['category'].keys())))
            except:
                print('Parse error')
    data = pd.DataFrame(data)
    return data

In [5]:
x_data = pd.DataFrame(columns = ['title', 'description', 'recent_posts'])
y_data = pd.DataFrame(columns = ['label'])
PATH = '../data/category_ru/'
x_data = pd.concat([x_data, load_x_data(str(PATH + "input.txt"))]).reset_index(drop=True)
y_data = pd.concat([y_data, load_y_data(str(PATH + "output.txt"))]).reset_index(drop=True)
    
x_data['recent_posts'] = x_data['recent_posts'].apply(lambda x: x.replace('\u200b', ''))
data = x_data
data['label'] = y_data

In [6]:
data.head()

Unnamed: 0,title,description,recent_posts,label
0,Wild Field,"Дикое Поле. Историческая рандомность, халдуниа...","""Айя-София"" в болгарской Софии (да, каламбур) ...",Curious Facts<SEP>History<SEP>Movies<SEP>Polit...
1,PASASHOESS,"Уважаемые покупатели, PASHASHOES радостью прив...",только оптом\n WHOLESALE\n36 37² 38² 39² 40\...,Offers & Promotions
2,Путешествуй дешево Piratesru,Пираты Россия - дешевые авиабилеты и самостоят...,Анонс! Завтра распродажа Smartavia: миллион би...,Offers & Promotions<SEP>Travel & Tourism
3,TripToDream,Travel Channel,Анонс! Завтра распродажа Smartavia: миллион би...,Offers & Promotions<SEP>Travel & Tourism
4,Vandrouki,Trаvеl channel,Анонс! Завтра распродажа Smartavia: миллион би...,Offers & Promotions<SEP>Travel & Tourism


In [7]:
mapper = {'Art & Design': '__label__0',
 'Bets & Gambling': '__label__1',
 'Books': '__label__2',
 'Business & Entrepreneurship': '__label__3',
 'Cars & Other Vehicles': '__label__4',
 'Celebrities & Lifestyle': '__label__5',
 'Cryptocurrencies': '__label__6',
 'Culture & Events': '__label__7',
 'Curious Facts': '__label__8',
 'Directories of Channels & Bots': '__label__9',
 'Economy & Finance': '__label__10',
 'Education': '__label__11',
 'Erotic Content': '__label__12',
 'Fashion & Beauty': '__label__13',
 'Fitness': '__label__14',
 'Food & Cooking': '__label__15',
 'Foreign Languages': '__label__16',
 'Health & Medicine': '__label__17',
 'History': '__label__18',
 'Hobbies & Activities': '__label__19',
 'Home & Architecture': '__label__20',
 'Humor & Memes': '__label__21',
 'Investments': '__label__22',
 'Job Listings': '__label__23',
 'Kids & Parenting': '__label__24',
 'Marketing & PR': '__label__25',
 'Motivation & Self-Development': '__label__26',
 'Movies': '__label__27',
 'Music': '__label__28',
 'Offers & Promotions': '__label__29',
 'Pets': '__label__30',
 'Politics & Incidents': '__label__31',
 'Psychology & Relationships': '__label__32',
 'Real Estate': '__label__33',
 'Recreation & Entertainment': '__label__34',
 'Religion & Spirituality': '__label__35',
 'Science': '__label__36',
 'Sports': '__label__37',
 'Technology & Internet': '__label__38',
 'Travel & Tourism': '__label__39',
 'Video Games': '__label__40',
 'Other': '__label__41'}

reverse_mapper = {v: k for k, v in mapper.items()}

In [8]:
y_data = pd.DataFrame(columns=mapper.keys())
y_data

Unnamed: 0,Art & Design,Bets & Gambling,Books,Business & Entrepreneurship,Cars & Other Vehicles,Celebrities & Lifestyle,Cryptocurrencies,Culture & Events,Curious Facts,Directories of Channels & Bots,...,Psychology & Relationships,Real Estate,Recreation & Entertainment,Religion & Spirituality,Science,Sports,Technology & Internet,Travel & Tourism,Video Games,Other


In [9]:
labels_list = pd.Series(data.label.apply(lambda x: x.split("<SEP>")))
y_data_preprocesse = pd.get_dummies(labels_list.apply(pd.Series).stack(), columns=mapper.keys()).sum(level=0)
y_data_preprocesse.head()

Unnamed: 0,Art & Design,Bets & Gambling,Books,Business & Entrepreneurship,Cars & Other Vehicles,Celebrities & Lifestyle,Cryptocurrencies,Culture & Events,Curious Facts,Directories of Channels & Bots,...,Pets,Politics & Incidents,Psychology & Relationships,Real Estate,Religion & Spirituality,Science,Sports,Technology & Internet,Travel & Tourism,Video Games
0,0,0,0,0,0,0,0,0,1,0,...,0,1,0,1,1,0,0,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [10]:
print(y_data_preprocesse.iloc[0])

Art & Design                      0
Bets & Gambling                   0
Books                             0
Business & Entrepreneurship       0
Cars & Other Vehicles             0
Celebrities & Lifestyle           0
Cryptocurrencies                  0
Culture & Events                  0
Curious Facts                     1
Directories of Channels & Bots    0
Economy & Finance                 0
Education                         0
Erotic Content                    0
Fashion & Beauty                  0
Fitness                           0
Food & Cooking                    0
Foreign Language Learning         0
Health & Medicine                 0
History                           1
Hobbies & Activities              0
Home & Architecture               0
Humor & Memes                     0
Investments                       0
Job Listings                      0
Kids & Parenting                  0
Marketing & PR                    0
Motivation & Self-development     0
Movies                      

In [11]:
data.label.iloc[0]

'Curious Facts<SEP>History<SEP>Movies<SEP>Politics & Incidents<SEP>Real Estate<SEP>Religion & Spirituality<SEP>Technology & Internet<SEP>Travel & Tourism<SEP>Other'

In [12]:
data = data.drop("label", axis=1)
data.head()

Unnamed: 0,title,description,recent_posts
0,Wild Field,"Дикое Поле. Историческая рандомность, халдуниа...","""Айя-София"" в болгарской Софии (да, каламбур) ..."
1,PASASHOESS,"Уважаемые покупатели, PASHASHOES радостью прив...",только оптом\n WHOLESALE\n36 37² 38² 39² 40\...
2,Путешествуй дешево Piratesru,Пираты Россия - дешевые авиабилеты и самостоят...,Анонс! Завтра распродажа Smartavia: миллион би...
3,TripToDream,Travel Channel,Анонс! Завтра распродажа Smartavia: миллион би...
4,Vandrouki,Trаvеl channel,Анонс! Завтра распродажа Smartavia: миллион би...


In [13]:
#data = pd.concat([data, y_data_preprocesse], axis=1, join="inner")
#data.head()

In [14]:
# Remove emojis
import re


def deEmojify(text):    
    regex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"                    
                      "]+", re.UNICODE)
    return regex_pattern.sub(r'',text)

data['recent_posts'] = data['recent_posts'].apply(deEmojify)
data['title'] = data['title'].apply(deEmojify)
data['description'] = data['description'].apply(deEmojify)

In [15]:
data['recent_posts'] = data['recent_posts'].apply(lambda x: x.lower())
data['title'] = data['title'].apply(lambda x: x.lower())
data['description'] = data['description'].apply(lambda x: x.lower())

In [16]:
# Remove adds from all posts

ALL_POSTS = []
for v in tqdm_notebook(data['recent_posts'].apply(lambda x: x.split('\n'))):
    ALL_POSTS.extend(v)
post_counts = pd.Series(ALL_POSTS).value_counts().sort_values(ascending = False)


def filter_posts(posts, threshold = 5):
    posts = posts.split('\n')
    filtered_posts = []
    for post in posts:
        if post_counts[post] < threshold:
            filtered_posts.append(post)
    return '\n'.join(filtered_posts)

for i in range(data.shape[0]):
    data.iloc[i, 2] = filter_posts(data.iloc[i, 2])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for v in tqdm_notebook(data['recent_posts'].apply(lambda x: x.split('\n'))):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16726.0), HTML(value='')))




In [17]:
def removeEmail(text):
    pattern = re.compile("((\w+)(\.|_)?(\w*)@(\w+)(\.(\w+))+)")
    return pattern.sub(r'', text)

data['recent_posts'] = data['recent_posts'].apply(removeEmail)
data['title'] = data['title'].apply(removeEmail)
data['description'] = data['description'].apply(removeEmail)

In [18]:
def removeUsername(text):
    pattern = re.compile("(@(\w+))")
    return pattern.sub(r'', text)

data['recent_posts'] = data['recent_posts'].apply(removeUsername)
data['title'] = data['title'].apply(removeUsername)
data['description'] = data['description'].apply(removeUsername)

In [19]:
def removeLinks(text):
    pattern = re.compile("(https?://[^ ]+)")
    return pattern.sub(r'', text)

data['recent_posts'] = data['recent_posts'].apply(removeLinks)
data['title'] = data['title'].apply(removeLinks)
data['description'] = data['description'].apply(removeLinks)

In [20]:
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [21]:
from sklearn.model_selection import train_test_split


data['preprocessed_text'] = data.title.map(str) + data.description.map(str) + ' ' + data.recent_posts.map(str)
data = data.drop(['title', 'description', 'recent_posts'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y_data_preprocesse, shuffle=True, train_size=0.7)

In [22]:
print(X_train.shape, X_test.shape)

(11708, 1) (5018, 1)


In [23]:
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [24]:
X_train.shape

(11708, 1)

## Tokenize and vectorize text

In [25]:
tfidf_params = {'max_features':3000, 'use_idf':True}

tfidf = TfidfVectorizer(**tfidf_params)
tfidf_trained = tfidf.fit(X_train.preprocessed_text)
x_train_as_array = tfidf_trained.transform(X_train.preprocessed_text).toarray()

x_test_as_array = tfidf_trained.transform(X_test.preprocessed_text).toarray()

print(x_train_as_array.shape)

(11708, 3000)


## Save TfIdf vectorizer

In [29]:

tfidf_vocabulary_df = pd.Series(tfidf_trained.vocabulary_)
tfidf_vocabulary_df.to_csv('../models/ru/tfidf/tfidf_vocabulary.csv', header=False)

tfidf_idf_df = pd.Series(tfidf_trained.idf_)
tfidf_idf_df.to_csv('../models/ru/tfidf/tfidf_idf.csv', header=False)

tfidf_params_df = pd.Series(tfidf_params)
tfidf_params_df.to_csv('../models/ru/tfidf/tfidf_params.csv', header=False)


In [30]:
test_preprocess_data = pd.DataFrame(X_train)
test_preprocess_data.to_csv('../data/category_ru/test_preprocessed_data.csv', index=False, header=False)
test_preprocess_data.head()

Unnamed: 0,preprocessed_text
9860,заработок от dobrovaпо всем вопросам \n\n\n ...
13310,умный бодибилдингблог чемпиона европы по класс...
11174,fivemsэлитарное общество любителей почитать ко...
10998,aвтоспотсамое важное о каждой новой модели: чт...
14794,работа в нью йорке!работа быстро найдётся для ...


## Evaluate model

In [31]:
from sklearn.multiclass import OneVsRestClassifier

In [32]:
params = { 
	# ... 
    'objective': 'binary',
    'boosting_type': 'gbdt', 
    'metric': 'logistic',
    "learning_rate" : 0.01,
    'task': 'train'
} 

In [33]:
model = OneVsRestClassifier(lgbm.LGBMClassifier(**params), n_jobs=3)

In [34]:
model = model.fit(x_train_as_array, y_train.values.tolist()) 

In [35]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, roc_auc_score
y_pred = model.predict(x_test_as_array)
print('Model score ', model.score(x_test_as_array, y_test))
print('Accuracy ', accuracy_score(y_test, y_pred))
print('F1 ', f1_score(y_test, y_pred.astype(float), average='micro'))
print('Precision ', precision_score(y_test, y_pred.astype(float), average='micro'))
print('Recall ', recall_score(y_test, y_pred.astype(float), average='micro'))
print(y_pred[0])
print(y_test.iloc[0].values)

Model score  0.04703068951773615
Accuracy  0.04703068951773615
F1  0.2911845662923538
Precision  0.8341939986043266
Recall  0.1763751180358829
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0]


## Save lightmgbm models

In [37]:
for i in range(len(model.estimators_)):
    model.estimators_[i].booster_.save_model('../models/ru/lightgbm_model_' + str(i) + '.txt')