In [2]:
import numpy as np, pandas as pd
from tqdm import tqdm_notebook
import json

from pathlib import Path
import os
import fasttext
import csv

## Prepare the data

In [73]:
def load_data(file):

    with open(f'{file}', 'r', encoding='utf-8') as f:
        input_lines = f.read().split('\n')

    data = {
        "title": [],
        "description": [],
        "recent_posts": [],
    }

    for line in input_lines:
        try:
            line = json.loads(line)
        except:
            pass

        if line != '':
            try:
                data['title'].append(line['title'])
                data['description'].append(line['description'])
                data['recent_posts'].append('\n'.join(line['recent_posts']))
            except:
                print('Parse error')

    data = pd.DataFrame(data)
    data['label'] = file.split('/')[-1].split('.')[0]
    
    return data

In [74]:
data = pd.DataFrame(columns = ['title', 'description', 'recent_posts', 'label'])
PATH = Path('tgparser/RU_TGSTAT_DATA/')
for file in os.listdir(PATH):
    data = pd.concat([data, load_data(str(PATH / file))]).reset_index(drop=True)
    
data['recent_posts'] = data['recent_posts'].apply(lambda x: x.replace('\u200b', ''))
data['label'] = data['label'].apply(lambda x: x.split('_')[0])

Parse error
Parse error
Parse error


In [75]:
mapper = {'Art & Design': '__label__0',
 'Bets & Gambling': '__label__1',
 'Books': '__label__2',
 'Business & Entrepreneurship': '__label__3',
 'Cars & Other Vehicles': '__label__4',
 'Celebrities & Lifestyle': '__label__5',
 'Cryptocurrencies': '__label__6',
 'Culture & Events': '__label__7',
 'Curious Facts': '__label__8',
 'Directories of Channels & Bots': '__label__9',
 'Economy & Finance': '__label__10',
 'Education': '__label__11',
 'Erotic Content': '__label__12',
 'Fashion & Beauty': '__label__13',
 'Fitness': '__label__14',
 'Food & Cooking': '__label__15',
 'Foreign Languages': '__label__16',
 'Health & Medicine': '__label__17',
 'History': '__label__18',
 'Hobbies & Activities': '__label__19',
 'Home & Architecture': '__label__20',
 'Humor & Memes': '__label__21',
 'Investments': '__label__22',
 'Job Listings': '__label__23',
 'Kids & Parenting': '__label__24',
 'Marketing & PR': '__label__25',
 'Motivation & Self-Development': '__label__26',
 'Movies': '__label__27',
 'Music': '__label__28',
 'Offers & Promotions': '__label__29',
 'Pets': '__label__30',
 'Politics & Incidents': '__label__31',
 'Psychology & Relationships': '__label__32',
 'Real Estate': '__label__33',
 'Recreation & Entertainment': '__label__34',
 'Religion & Spirituality': '__label__35',
 'Science': '__label__36',
 'Sports': '__label__37',
 'Technology & Internet': '__label__38',
 'Travel & Tourism': '__label__39',
 'Video Games': '__label__40',
 'Other': '__label__41'}

reverse_mapper = {v: k for k, v in mapper.items()}

In [76]:
# Remove emojis
import re


def deEmojify(text):    
    regex_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"                    
                      "]+", re.UNICODE)
    return regex_pattern.sub(r'',text)

data['recent_posts'] = data['recent_posts'].apply(deEmojify)
data['title'] = data['title'].apply(deEmojify)
data['description'] = data['description'].apply(deEmojify)

In [77]:
data['recent_posts'] = data['recent_posts'].apply(lambda x: x.lower())
data['title'] = data['title'].apply(lambda x: x.lower())
data['description'] = data['description'].apply(lambda x: x.lower())

In [78]:
# Remove adds from all posts

ALL_POSTS = []
for v in tqdm_notebook(data['recent_posts'].apply(lambda x: x.split('\n'))):
    ALL_POSTS.extend(v)
post_counts = pd.Series(ALL_POSTS).value_counts().sort_values(ascending = False)


def filter_posts(posts, threshold = 5):
    posts = posts.split('\n')
    filtered_posts = []
    for post in posts:
        if post_counts[post] < threshold:
            filtered_posts.append(post)
    return '\n'.join(filtered_posts)

for i in range(data.shape[0]):
    data.iloc[i, 2] = filter_posts(data.iloc[i, 2])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/7070 [00:00<?, ?it/s]

In [79]:
def removeEmail(text):
    pattern = re.compile("((\w+)(\.|_)?(\w*)@(\w+)(\.(\w+))+)")
    return pattern.sub(r'', text)

data['recent_posts'] = data['recent_posts'].apply(removeEmail)
data['title'] = data['title'].apply(removeEmail)
data['description'] = data['description'].apply(removeEmail)

In [80]:
def removeUsername(text):
    pattern = re.compile("(@(\w+))")
    return pattern.sub(r'', text)

data['recent_posts'] = data['recent_posts'].apply(removeUsername)
data['title'] = data['title'].apply(removeUsername)
data['description'] = data['description'].apply(removeUsername)

In [81]:
def removeLinks(text):
    pattern = re.compile("(https?://[^ ]+)")
    return pattern.sub(r'', text)

data['recent_posts'] = data['recent_posts'].apply(removeLinks)
data['title'] = data['title'].apply(removeLinks)
data['description'] = data['description'].apply(removeLinks)

In [82]:
from tqdm import tqdm
tqdm.pandas()

In [83]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, shuffle = True, train_size = 0.7)

train = train.reset_index(drop = True)
test = test.reset_index(drop = True)

In [84]:
def sample_data(data):
    output_data = []
    N_SAMPLES = 10
    for i in range(data.shape[0]):
        for j in range(N_SAMPLES):
            titleText = data.iloc[i, 0] + data.iloc[i, 1]
            posts = '\n'.join(pd.Series(data.iloc[i, 2].split('\n')).sample(n = 5, replace = True).values)
            output_data.append([mapper[data.iloc[i, 3]], titleText + posts])
    return pd.DataFrame(output_data)

In [85]:
print(train.shape, test.shape)

(4949, 4) (2121, 4)


In [86]:
train = sample_data(train)
test = sample_data(test)

In [87]:
print(train.shape, test.shape)

(49490, 2) (21210, 2)


In [88]:
train.to_csv('data/train.txt', 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")


test.to_csv('data/test.txt', 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

## Evaluate model

In [89]:
# Training the fastText classifier
model = fasttext.train_supervised('data/train.txt', lr=0.5, epoch=10, wordNgrams=1, bucket=200_000, dim=100, loss='ova')

In [90]:
# Evaluating performance on the entire test file
model.test('data/test.txt')                      

(21210, 0.48434700612918435, 0.48434700612918435)

In [91]:
preds = []
for i in range(test.shape[0]):
    labels, cur_preds = model.predict(test.iloc[i, 1].replace('\n', ''), k = -1)
    pred_mapper = {}
    for i in range(len(labels)):
        pred_mapper[labels[i]] = cur_preds[i]
        
    cur_preds = pd.Series(pred_mapper)[sorted(labels, key=lambda x: int(x.split('__')[-1]))].values 
    preds.append(list(cur_preds))
preds = np.array(preds)

In [92]:
y_true = np.zeros(preds.shape)

target_sorting = np.array(sorted(test[0].unique(), key = lambda x: int(x.split('__')[-1])))
for i in range(y_true.shape[0]):
    y_true[i, np.argwhere(target_sorting == test[0][i])[0][0]] = 1

In [93]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, roc_auc_score
y_pred = preds
print('Accuracy', accuracy_score(np.argmax(y_true, axis = 1), np.argmax(y_pred, axis = 1)))
print('F1', f1_score(y_true, (y_pred == y_pred.max(axis=1, keepdims=1)).astype(float), average='micro'))
print('Precision', precision_score(y_true, (y_pred == y_pred.max(axis=1, keepdims=1)).astype(float), average='micro'))
print('Recall', recall_score(y_true, (y_pred == y_pred.max(axis=1, keepdims=1)).astype(float), average='micro'))

Accuracy 0.48217821782178216
F1 0.3905931228903149
Precision 0.321939586645469
Recall 0.49646393210749645


## Evaluate quantized model

In [94]:
# Quantize the model with retraining
model.quantize(input='data/train.txt', qnorm=True, retrain=True, cutoff=200000)

In [95]:
# Evaluating performance on the entire test file
model.test('data/test.txt')                      

(21210, 0.4821310702498821, 0.4821310702498821)

In [None]:
preds = []
for i in range(test.shape[0]):
    labels, cur_preds = model.predict(test.iloc[i, 1].replace('\n', ''), k = -1)
    pred_mapper = {}
    for i in range(len(labels)):
        pred_mapper[labels[i]] = cur_preds[i]
        
    cur_preds = pd.Series(pred_mapper)[sorted(labels, key=lambda x: int(x.split('__')[-1]))].values 
    preds.append(list(cur_preds))
preds = np.array(preds)

In [None]:
y_true = np.zeros(preds.shape)

target_sorting = np.array(sorted(test[0].unique(), key = lambda x: int(x.split('__')[-1])))
for i in range(y_true.shape[0]):
    y_true[i, np.argwhere(target_sorting == test[0][i])[0][0]] = 1

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, roc_auc_score
y_pred = preds
print('Accuracy', accuracy_score(np.argmax(y_true, axis = 1), np.argmax(y_pred, axis = 1)))
print('F1', f1_score(y_true, (y_pred == y_pred.max(axis=1, keepdims=1)).astype(float), average='micro'))
print('Precision', precision_score(y_true, (y_pred == y_pred.max(axis=1, keepdims=1)).astype(float), average='micro'))
print('Recall', recall_score(y_true, (y_pred == y_pred.max(axis=1, keepdims=1)).astype(float), average='micro'))

## Train final model

In [99]:
full_data = sample_data(data)

In [100]:
full_data.to_csv('data/train.txt', 
                                          index = False, 
                                          sep = ' ',
                                          header = None, 
                                          quoting = csv.QUOTE_NONE, 
                                          quotechar = "", 
                                          escapechar = " ")

In [None]:
model = fasttext.train_supervised('data/train.txt', lr=0.5, epoch=10, wordNgrams=1, bucket=200_000, dim=100, loss='ova')

In [102]:
model.quantize(input='data/train.txt', qnorm=True, retrain=True, cutoff=200000)

In [103]:
# Save quantized model
model.save_model('models/fasttext_42cat_preprocessed.ftz')