In [2]:
import numpy as np, pandas as pd
from tqdm import tqdm_notebook
import json

from pathlib import Path
import os
import lightgbm as lgbm
import csv
import operator

## Prepare the data

In [3]:
def load_x_data(file):

    with open(f'{file}', 'r', encoding='utf-8') as f:
        input_lines = f.read().split('\n')

    data = {
        "title": [],
        "description": [],
        "recent_posts": [],
    }

    for line in input_lines:
        try:
            line = json.loads(line)
        except:
            pass

        if line != '':
            try:
                data['title'].append(line['title'])
                data['description'].append(line['description'])
                data['recent_posts'].append('\n'.join(line['recent_posts']))
            except:
                print('Parse error')

    data = pd.DataFrame(data)
    return data

def load_y_data(file):

    with open(f'{file}', 'r', encoding='utf-8') as f:
        input_lines = f.read().split('\n')

    data = {
        #'lang_code': [],
        'label': []  
    }

    for line in input_lines:
        try:
            line = json.loads(line)
        except:
            pass

        if line != '':
            try:
                #data['lang_code'].append(line['lang_code'])
                data['label'].append('<SEP>'.join(list(line['category'].keys())))
            except:
                print('Parse error')
    data = pd.DataFrame(data)
    return data

In [4]:
x_data = pd.DataFrame(columns = ['title', 'description', 'recent_posts'])
y_data = pd.DataFrame(columns = ['label'])
PATH = '../data/category_ru/'
x_data = pd.concat([x_data, load_x_data(str(PATH + "input.txt"))]).reset_index(drop=True)
y_data = pd.concat([y_data, load_y_data(str(PATH + "output.txt"))]).reset_index(drop=True)
    
x_data['recent_posts'] = x_data['recent_posts'].apply(lambda x: x.replace('\u200b', ''))
data = x_data
data['label'] = y_data

In [5]:
data.head()

Unnamed: 0,title,description,recent_posts,label
0,Wild Field,"Дикое Поле. Историческая рандомность, халдуниа...","""Айя-София"" в болгарской Софии (да, каламбур) ...",Curious Facts<SEP>History<SEP>Movies<SEP>Polit...
1,PASASHOESS,"Уважаемые покупатели, PASHASHOES радостью прив...",только оптом\n WHOLESALE\n36 37² 38² 39² 40\...,Offers & Promotions
2,Путешествуй дешево Piratesru,Пираты Россия - дешевые авиабилеты и самостоят...,Анонс! Завтра распродажа Smartavia: миллион би...,Offers & Promotions<SEP>Travel & Tourism
3,TripToDream,Travel Channel,Анонс! Завтра распродажа Smartavia: миллион би...,Offers & Promotions<SEP>Travel & Tourism
4,Vandrouki,Trаvеl channel,Анонс! Завтра распродажа Smartavia: миллион би...,Offers & Promotions<SEP>Travel & Tourism


In [6]:
mapper = {'Art & Design': 0,
 'Bets & Gambling': 1,
 'Books': 2,
 'Business & Entrepreneurship': 3,
 'Cars & Other Vehicles': 4,
 'Celebrities & Lifestyle': 5,
 'Cryptocurrencies': 6,
 'Culture & Events': 7,
 'Curious Facts': 8,
 'Directories of Channels & Bots': 9,
 'Economy & Finance': 10,
 'Education': 11,
 'Erotic Content': 12,
 'Fashion & Beauty': 13,
 'Fitness': 14,
 'Food & Cooking': 15,
 'Foreign Languages': 16,
 'Health & Medicine': 17,
 'History': 18,
 'Hobbies & Activities': 19,
 'Home & Architecture': 20,
 'Humor & Memes': 21,
 'Investments': 22,
 'Job Listings': 23,
 'Kids & Parenting': 24,
 'Marketing & PR': 25,
 'Motivation & Self-Development': 26,
 'Movies': 27,
 'Music': 28,
 'Offers & Promotions': 29,
 'Pets': 30,
 'Politics & Incidents': 31,
 'Psychology & Relationships': 32,
 'Real Estate': 33,
 'Recreation & Entertainment': 34,
 'Religion & Spirituality': 35,
 'Science': 36,
 'Sports': 37,
 'Technology & Internet': 38,
 'Travel & Tourism': 39,
 'Video Games': 40,
 'Other': 41}

reverse_mapper = {v: k for k, v in mapper.items()}

In [7]:
y_data = pd.DataFrame(columns=mapper.keys())
y_data

Unnamed: 0,Art & Design,Bets & Gambling,Books,Business & Entrepreneurship,Cars & Other Vehicles,Celebrities & Lifestyle,Cryptocurrencies,Culture & Events,Curious Facts,Directories of Channels & Bots,...,Psychology & Relationships,Real Estate,Recreation & Entertainment,Religion & Spirituality,Science,Sports,Technology & Internet,Travel & Tourism,Video Games,Other


In [8]:
labels_list = pd.Series(data.label.apply(lambda x: x.split("<SEP>")))
y_data_preprocesse = pd.get_dummies(labels_list.apply(pd.Series).stack(), columns=mapper.keys()).sum(level=0)
y_data_preprocesse.head()

Unnamed: 0,Art & Design,Bets & Gambling,Books,Business & Entrepreneurship,Cars & Other Vehicles,Celebrities & Lifestyle,Cryptocurrencies,Culture & Events,Curious Facts,Directories of Channels & Bots,...,Pets,Politics & Incidents,Psychology & Relationships,Real Estate,Religion & Spirituality,Science,Sports,Technology & Internet,Travel & Tourism,Video Games
0,0,0,0,0,0,0,0,0,1,0,...,0,1,0,1,1,0,0,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
print(y_data_preprocesse.iloc[0])

Art & Design                      0
Bets & Gambling                   0
Books                             0
Business & Entrepreneurship       0
Cars & Other Vehicles             0
Celebrities & Lifestyle           0
Cryptocurrencies                  0
Culture & Events                  0
Curious Facts                     1
Directories of Channels & Bots    0
Economy & Finance                 0
Education                         0
Erotic Content                    0
Fashion & Beauty                  0
Fitness                           0
Food & Cooking                    0
Foreign Language Learning         0
Health & Medicine                 0
History                           1
Hobbies & Activities              0
Home & Architecture               0
Humor & Memes                     0
Investments                       0
Job Listings                      0
Kids & Parenting                  0
Marketing & PR                    0
Motivation & Self-development     0
Movies                      

In [10]:
data.label.iloc[0]

'Curious Facts<SEP>History<SEP>Movies<SEP>Politics & Incidents<SEP>Real Estate<SEP>Religion & Spirituality<SEP>Technology & Internet<SEP>Travel & Tourism<SEP>Other'

In [11]:
data = data.drop("label", axis=1)
data.head()

Unnamed: 0,title,description,recent_posts
0,Wild Field,"Дикое Поле. Историческая рандомность, халдуниа...","""Айя-София"" в болгарской Софии (да, каламбур) ..."
1,PASASHOESS,"Уважаемые покупатели, PASHASHOES радостью прив...",только оптом\n WHOLESALE\n36 37² 38² 39² 40\...
2,Путешествуй дешево Piratesru,Пираты Россия - дешевые авиабилеты и самостоят...,Анонс! Завтра распродажа Smartavia: миллион би...
3,TripToDream,Travel Channel,Анонс! Завтра распродажа Smartavia: миллион би...
4,Vandrouki,Trаvеl channel,Анонс! Завтра распродажа Smartavia: миллион би...


In [12]:
#data = pd.concat([data, y_data_preprocesse], axis=1, join="inner")
#data.head()

In [13]:
# Remove emojis
import re


def deEmojify(text):    
    regex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"                    
                      "]+", re.UNICODE)
    return regex_pattern.sub(r'',text)

data['recent_posts'] = data['recent_posts'].apply(deEmojify)
data['title'] = data['title'].apply(deEmojify)
data['description'] = data['description'].apply(deEmojify)

In [14]:
data['recent_posts'] = data['recent_posts'].apply(lambda x: x.lower())
data['title'] = data['title'].apply(lambda x: x.lower())
data['description'] = data['description'].apply(lambda x: x.lower())

In [15]:
# Remove adds from all posts

ALL_POSTS = []
for v in tqdm_notebook(data['recent_posts'].apply(lambda x: x.split('\n'))):
    ALL_POSTS.extend(v)
post_counts = pd.Series(ALL_POSTS).value_counts().sort_values(ascending = False)


def filter_posts(posts, threshold = 5):
    posts = posts.split('\n')
    filtered_posts = []
    for post in posts:
        if post_counts[post] < threshold:
            filtered_posts.append(post)
    return '\n'.join(filtered_posts)

for i in range(data.shape[0]):
    data.iloc[i, 2] = filter_posts(data.iloc[i, 2])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for v in tqdm_notebook(data['recent_posts'].apply(lambda x: x.split('\n'))):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16726.0), HTML(value='')))




In [16]:
def removeEmail(text):
    pattern = re.compile("((\w+)(\.|_)?(\w*)@(\w+)(\.(\w+))+)")
    return pattern.sub(r'', text)

data['recent_posts'] = data['recent_posts'].apply(removeEmail)
data['title'] = data['title'].apply(removeEmail)
data['description'] = data['description'].apply(removeEmail)

In [17]:
def removeUsername(text):
    pattern = re.compile("(@(\w+))")
    return pattern.sub(r'', text)

data['recent_posts'] = data['recent_posts'].apply(removeUsername)
data['title'] = data['title'].apply(removeUsername)
data['description'] = data['description'].apply(removeUsername)

In [18]:
def removeLinks(text):
    pattern = re.compile("(https?://[^ ]+)")
    return pattern.sub(r'', text)

data['recent_posts'] = data['recent_posts'].apply(removeLinks)
data['title'] = data['title'].apply(removeLinks)
data['description'] = data['description'].apply(removeLinks)

In [19]:
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [20]:
from sklearn.model_selection import train_test_split


data['preprocessed_text'] = data.title.map(str) + data.description.map(str) + ' ' + data.recent_posts.map(str)
data = data.drop(['title', 'description', 'recent_posts'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y_data_preprocesse, shuffle=True, train_size=0.7)

In [21]:
print(X_train.shape, X_test.shape)

(11708, 1) (5018, 1)


In [22]:
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [23]:
X_train.shape

(11708, 1)

## Tokenize and vectorize text

In [50]:
tfidf_params = {'max_features':3000, 'use_idf':True}

tfidf = TfidfVectorizer(**tfidf_params)
tfidf_trained = tfidf.fit(X_train.preprocessed_text)
x_train_as_array = tfidf_trained.transform(X_train.preprocessed_text).toarray()

x_test_as_array = tfidf_trained.transform(X_test.preprocessed_text).toarray()

print(x_train_as_array.shape)

(11708, 3000)


## Save TfIdf vectorizer

In [54]:

tfidf_vocabulary_df = pd.Series(tfidf_trained.vocabulary_)
tfidf_vocabulary_df.to_csv('../model/tfidf/tfidf_vocabulary.csv', header=False)

tfidf_idf_df = pd.Series(tfidf_trained.idf_)
tfidf_idf_df.to_csv('../model/tfidf/tfidf_idf.csv', header=False)

tfidf_params_df = pd.Series(tfidf_params)
tfidf_params_df.to_csv('../model/tfidf/tfidf_params.csv', header=False)


In [60]:
df_test = pd.DataFrame(x_test_as_array)
df_test.to_csv('../data/test_for_c++_inference.tsv', sep='\t', index=False, header=False)
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,3.279704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,4.780864,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
train_preprocess_data = pd.concat([X_train, y_train], axis=1, join="inner")
train_preprocess_data.to_csv('../data/train_preprocessed_data.csv', index=False, header=False)
train_preprocess_data.head()

Unnamed: 0,preprocessed_text,Art & Design,Bets & Gambling,Books,Business & Entrepreneurship,Cars & Other Vehicles,Celebrities & Lifestyle,Cryptocurrencies,Culture & Events,Curious Facts,...,Pets,Politics & Incidents,Psychology & Relationships,Real Estate,Religion & Spirituality,Science,Sports,Technology & Internet,Travel & Tourism,Video Games
14514,okolomilana_bagsаксессуары для мужчин и женщин...,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
16659,отзывы и выплаты бота отзывы и выплаты \nбот ...,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4261,"кряполитика.рязанская политикаанализ ситуации,...",0,1,0,1,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
3482,рыбалка | охота | природана данном канале вы н...,0,1,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,1,0
4273,научная сексологияо сексе простым языком.\n\nп...,0,0,1,0,0,0,0,0,0,...,1,1,1,0,0,1,0,0,0,0


In [62]:
test_preprocess_data = pd.concat([X_test, y_test], axis=1, join="inner")
test_preprocess_data.to_csv('../data/test_preprocessed_data.csv', index=False, header=False)
test_preprocess_data.head()

Unnamed: 0,preprocessed_text,Art & Design,Bets & Gambling,Books,Business & Entrepreneurship,Cars & Other Vehicles,Celebrities & Lifestyle,Cryptocurrencies,Culture & Events,Curious Facts,...,Pets,Politics & Incidents,Psychology & Relationships,Real Estate,Religion & Spirituality,Science,Sports,Technology & Internet,Travel & Tourism,Video Games
2638,namangan jaluzi №1все виды #жалюзи \nна любой...,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4655,инстагентссылка на бот инстагент - нас уже бо...,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
13073,codm news | новостисамый большой новостной паб...,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
8259,берлин с дядей андреем«берлинские истории» от ...,0,0,1,0,0,0,0,1,0,...,0,1,0,0,0,0,1,0,0,1
6216,neural shitпроклятые нейронные сети\n\nдля свя...,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,1,0,1,0,0


## Evaluate model

In [63]:
from sklearn.multiclass import OneVsRestClassifier

In [64]:
params = { 
	# ... 
    'objective': 'binary',
    'boosting_type': 'gbdt', 
    'metric': 'logistic',
    "learning_rate" : 0.01,
    'task': 'train'
} 

In [65]:
model = OneVsRestClassifier(lgbm.LGBMClassifier(**params), n_jobs=3)

In [66]:
model = model.fit(x_train_as_array, y_train.values.tolist()) 

In [67]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, roc_auc_score
y_pred = model.predict(x_test_as_array)
print('Model score ', model.score(x_test_as_array, y_test))
print('Accuracy ', accuracy_score(y_test, y_pred))
print('F1 ', f1_score(y_test, y_pred.astype(float), average='micro'))
print('Precision ', precision_score(y_test, y_pred.astype(float), average='micro'))
print('Recall ', recall_score(y_test, y_pred.astype(float), average='micro'))
print(y_pred[0])
print(y_test.iloc[0].values)

Model score  0.045635711438820246
Accuracy  0.045635711438820246
F1  0.281631162507608
Precision  0.8316319194823868
Recall  0.16951934349355216
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0
 0 0 0 0]


## Save lightmgbm models

In [68]:
for i in range(len(model.estimators_)):
    model.estimators_[i].booster_.save_model('../model/lightgbm_model_ru_' + str(i) + '.txt')