In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

In [3]:
from collections import OrderedDict
import csv
import argparse
import re
import os
import random
import pandas as pd
import numpy as np
import math
from pathlib import Path
from time import sleep

from sklearn.utils import shuffle

sys.path.insert(0,'../pyutils/')
sys.path.insert(0, '/home/ektov-av/python35-libs/lib/python3.5/site-packages/') 

from pymystem3 import Mystem
from corpora_process.utils import extract_subsentences, extend,\
                                  normalize_text, preprocessing_setps, margin_sentences
import tqdm    
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

### Extract information about clubid from which discussion boards has been scrapped 

In [None]:
cnt=0
folders=[]
clubsid=[]
folder = Path.joinpath(Path(os.getcwd()), 'csv', 'topics')
for r, d, f in os.walk(folder):
    for folder in d:
        folders.append(os.path.join(r, folder))
for dir_path in folders:
    for the_file in os.listdir(dir_path):
        file_path = os.path.join(dir_path, the_file)
        try:
            if os.path.isfile(file_path) and (the_file.startswith('vk_group_topic')):
                clubsid.append('club'+the_file.split('vk_group_topic-')[-1].split('_')[0])
        except Exception as e:
            print(e) 

In [None]:
set(clubsid)

## Collecting a dataframe for training

### Construct a dataframe for appropriate category of posts

In [None]:
folders=[]
folder = Path.joinpath(Path(os.getcwd()), 'csv', 'topics')
for r, d, f in os.walk(folder):
    for folder in d:
        folders.append(os.path.join(r, folder))

In [None]:
cnt_l = 0
cnt_t = 0
for dir_path in folders:
    findcat_tenant   = re.findall(r'.*подсел.*|.*ищу.*|.*сниму.*|.*поиск.*', dir_path)
    findcat_landlord = re.findall(r'.*сда.*|.*снят.*|.*посуточно.*', dir_path)
    if len(findcat_tenant)!=0:
        for the_file in os.listdir(dir_path):
            file_path = os.path.join(dir_path, the_file)
            try:
                if os.path.isfile(file_path) and (the_file.startswith('vk_group_topic')):
                    cnt_t+=1
                    print('tenant --> ',the_file)
                    if cnt_t==1:
                        df_tenant = pd.read_csv(file_path, sep=',', encoding='utf8')
                    else:
                        df_tmp = pd.read_csv(file_path, sep=',', encoding='utf8')
                        df_tenant = df_tenant.append(df_tmp, ignore_index=True)
            except Exception as e:
                print(e)
    elif len(findcat_landlord)!=0:
        for the_file in os.listdir(dir_path):
            file_path = os.path.join(dir_path, the_file)
            try:
                if os.path.isfile(file_path) and (the_file.startswith('vk_group_topic')):
                    cnt_l+=1
                    print('landlord --> ',the_file)
                    if cnt_l==1:
                        df_landlord = pd.read_csv(file_path, sep=',', encoding='utf8')
                    else:
                        df_tmp = pd.read_csv(file_path, sep=',', encoding='utf8')
                        df_landlord = df_landlord.append(df_tmp, ignore_index=True)
            except Exception as e:
                print(e)                
                
                

In [None]:
len(df_tenant) , len(df_landlord)

In [None]:
df_landlord['LTA_flag'] = [1]*len(df_landlord)
df_tenant['LTA_flag']   = [0]*len(df_tenant)
df = pd.concat([df_landlord,df_tenant],ignore_index=True)
df = shuffle(df)
df.fillna('',inplace=True)
df.rename(columns={'description':'SentimentText','LTA_flag':'Sentiment'},inplace=True)
df = df[df.SentimentText!='']
df.SentimentText = df.SentimentText.apply(lambda x: x[:4000])
df.reset_index(drop=True,inplace=True)

In [None]:
df.head(25)

In [None]:
df.groupby(['Sentiment'])['Sentiment'].count()

In [None]:
max_=0
for indx in df.index:
    len_ = len(df.loc[indx,'SentimentText'].split())
    if len_ > max_:
        max_ = len_
        indx_ = indx
print('max number of words in sample: {} with index: {}'.format(max_,indx_))

## Corpora preprocessing

In [18]:
df['cleaned_text'] = df.SentimentText.progress_apply(lambda x: normalize_text(preprocessing_setps,x))




## Lemmatization process

In [None]:
mstem = Mystem(mystem_bin='/home/mvp_dev/.local/bin/mystem')

### Lemmatization without parallelization

In [None]:
df['lemma_text'] = df.cleaned_text.progress_apply(lambda x: ''.join(mstem.lemmatize(x)[:-1] ))

### Use pandas multiprocessing

In [None]:
from tqdm import tqdm_notebook
from multiprocessing import Pool
from functools import partial

In [None]:
def lemmatize_stem_all(df, col, proc_steps):
    mstem = Mystem(mystem_bin='/home/mvp_dev/.local/bin/mystem')
    df['lemma_text']=df[col].progress_apply(lambda x: ''.join(mstem.lemmatize(utils.normalize_text(proc_steps,x)[:-1])))
def apply_func_to_df(df, col, proc_steps):
    res = lemmatize_stem_all(df, col, proc_steps)

In [None]:
num_part = 3
num_workers = 3
def parallelize_df(df, func=apply_func_to_df):
    df_split = np.array_split(df,num_part)
    pool = Pool(num_workers)
    df = pd.concat(pool.map(partial(func, col='SentimentText',proc_steps=utils.preprocessing_setps), df_split))
    pool.close()
    pool.join()
    return df

In [None]:
df = parallelize_df(df)

## Analyse lemmatized text to fix wrong class label 

In [None]:
df.head()

In [None]:
mask = [True if (re.findall(r'снимать',x)!=[])and(re.findall(r'сдавать|сдать',x)==[]) else False for x in df.lemma_text]

In [None]:
len(df[mask&df.Sentiment==1])

In [None]:
df[mask&df.Sentiment==1]

### Reassing an appropriate class label for each masked corpus   

In [None]:
df.loc[mask&df.Sentiment==1,'Sentiment']=0

In [None]:
df.groupby(['Sentiment'])['Sentiment'].count()

### Read saved dataframe with lemmatization and cleaned corpora

In [4]:
files=[]
folder = os.path.join(os.getcwd(), 'data')
for file_name in os.listdir(folder):
    if file_name.startswith('topics'):
        files.append(os.path.join(folder, file_name))

In [None]:
cnt = 0
for file in files:
    cnt+=1
    if cnt == 1:
        df_tmp = pd.read_csv(file, na_filter=False)
        df = df_tmp
    else:
        df_tmp = pd.read_csv(file, na_filter=False)
        df.append(df_tmp, ignore_index=True, sort=False)

df.fillna('', inplace=True)
# df = df[df.cleaned_text!='']
# remove sentences consited of less than 3 words
mask = np.array([True if len(ele.split())<=3 else False for ele in df['SentimentText']])
df = df[~mask]
    

In [10]:
len(df)

390557

In [7]:
df.head()

Unnamed: 0,SentimentText,Sentiment
0,Сдам комнаты под ключ в центре Феодосии в свое...,0
1,Молодая пара с кошечкой снимет комнату в двух ...,0
2,"Предлагаю койко-место в просторной, чистой,уют...",1
3,Сдам комнату в двухкомнатной квартире (субарен...,1
4,"Сдам комнату в двухкомнатной квартире,20минут ...",1


## Generate ngrams of subsentences from whole sample       

### Try some test cases with utils functions

In [None]:
# res = extract_subsentences(line=df.cleaned_text[4], cutlenght=None)

In [None]:
# margin_sentences(df.cleaned_text[4],repeat_pattern="''",returnlist=False)

In [None]:
# margin_sentences(cutlenght=10, line=extract_subsentences(line=df.cleaned_text[4], cutlenght=None))

## Run generation of subsentences from initial corpus

### Split df into batches to reduse memory consumption

In [None]:
num_part = 3
df_split = np.array_split(df,num_part)

In [None]:
indx = 0
df_split[indx]['subsentences'] = df_split[indx].cleaned_text.progress_apply(lambda x: margin_sentences(cutlenght=50,line=extract_sub_sentences(line=x, cutlenght=None)))
df_split[indx].reset_index(drop=True,inplace=True)

### Try without splitting but with reduced length of generated ngrams tokens

In [19]:
df['subsentences'] = df.cleaned_text.progress_apply(lambda x: margin_sentences(cutlenght=50,line=extract_subsentences(line=x, cutlenght=None)))
df.reset_index(drop=True,inplace=True)




### Tokenize updated dataftame over `subsentences` column

In [20]:
df_fin = extend(df, ['subsentences'], fill_value='', preserve_index=False)

In [21]:
len(df_fin)

19527850

In [22]:
mask = np.array([True if len(ele)==1 else False for ele in df_fin.subsentences])
df_fin = df_fin[~mask]

In [23]:
len(df_fin)

9775881

In [28]:
df_fin.head()

Unnamed: 0,Sentiment,SentimentText,cleaned_text,subsentences
0,0,Сдам комнаты под ключ в центре Феодосии в свое...,сдам комнаты ключ центре феодосии своем частно...,"[сдам, комнаты]"
1,0,Сдам комнаты под ключ в центре Феодосии в свое...,сдам комнаты ключ центре феодосии своем частно...,"[сдам, комнаты, ключ, центре]"
2,0,Сдам комнаты под ключ в центре Феодосии в свое...,сдам комнаты ключ центре феодосии своем частно...,"[сдам, комнаты, ключ, центре, феодосии, своем]"
3,0,Сдам комнаты под ключ в центре Феодосии в свое...,сдам комнаты ключ центре феодосии своем частно...,"[сдам, комнаты, ключ, центре, феодосии, своем,..."
4,0,Сдам комнаты под ключ в центре Феодосии в свое...,сдам комнаты ключ центре феодосии своем частно...,"[сдам, комнаты, ключ, центре, феодосии, своем,..."


### Save ot csv

In [29]:
csvpath = Path.joinpath(Path(os.getcwd()), 'data', 'topics_all_subsentences_gzip.csv')
# df_fin.to_csv(csvpath, index=False, encoding='utf8', compression=None)
df  = pd.read_csv(csvpath, encoding='utf8', compression=None, na_filter=False, 
                  dtype={'SentimentText':str, 'cleaned_text':str, 'Sentiment':int,'subsentences':object})

In [None]:
csvpath = Path.joinpath(Path(os.getcwd()), '../../character-based-cnn/data', 'topics_all_lemma.csv')
df[['SentimentText','Sentiment','lemma_text','cleaned_text','subsentences']].to_csv(csvpath, index=False, encoding='utf8')

In [None]:
frac_df = df.sample(frac=0.02)
indx = pd.Series(df.index)
mask = [not x for x in indx.isin(set(frac_df.index)).tolist()]
df_train_test = df.iloc[mask,:]
len(df_train_test), len(frac_df)

In [9]:
csvpath = Path.joinpath(Path(os.getcwd()), 'data', 'topics_all.csv')
(df[['SentimentText','Sentiment']]).to_csv(csvpath, index=False, encoding='utf8')

In [None]:
csvpath = Path.joinpath(Path(os.getcwd()), '../../character-based-cnn/data', 'topics_train.csv')
(df_train_test[['SentimentText','Sentiment']]).to_csv(csvpath, index=False, encoding='utf8')
csvpath = Path.joinpath(Path(os.getcwd()), '../../character-based-cnn/data', 'topics_val.csv')
(frac_df[['SentimentText','Sentiment']]).to_csv(csvpath, index=False, encoding='utf8')