In [1]:
import time

from ipywidgets import IntProgress
from IPython.display import display


import pandas as pd
from collections import defaultdict

import gensim.models

# Folder layout of the caption corpus: one sub-folder per processing stage.
DATA_PATH = '/data/'
CAPTION_PATH = DATA_PATH + 'captions/'
RAW_CAPTION_PATH = CAPTION_PATH + 'raw/'

ClEAN_CAPTION_PATH = CAPTION_PATH + 'cleaned/'
LEM_CAPTION_PATH = CAPTION_PATH + 'lem/'

cities = ['spb', 'moscow']  # 'nyc' and 'london' are currently switched off
years = ['2017', '2018', '2019', '2020']
# Every [city, year] combination, all years of the first city first.
files = [[city, year] for city in cities for year in years]

def csv_path(path, city, year):
    """Return the CSV file name for one city/year inside the folder `path`.

    `path` is used as a prefix exactly as given (it is expected to end with a
    path separator). `year` may be a string or an integer; it is formatted
    into the name, so `2017` and `'2017'` give the same result.
    """
    return f'{path}{city}_posts_{year}.csv'


import fasttext

# File with the pretrained fastText model that is loaded below.
PRETRAINED_MODEL_PATH = '/data/source/lid.176.bin'
lang = fasttext.load_model(PRETRAINED_MODEL_PATH) # model used to detect the language of a text



This cell 
    1. replaces '\n', tabs and other whitespace characters with a single space
    2. removes duplicated spaces 
    3. removes all characters that are not a letter or a space (exception: digits inside a word, e.g. '8марта')
    4. detects the language of each post
    5. removes posts without words
    6. labels each post with its detected language

In [7]:
from gensim.parsing.preprocessing import strip_multiple_whitespaces
# Minimal number of words a cleaned caption must contain to be kept.
MIN_DOCUMENT_SIZE = 1

# Character substitutions applied before filtering: line breaks, the
# non-breaking space and '_' become a plain space, and '#' gets a leading
# space so that a hashtag glued to the previous word becomes its own token.
# ('@' is deliberately not handled.)
trans_rules = str.maketrans({**dict.fromkeys("\n\r\xa0_", " "), "#": " #"})

# Non-alphanumeric characters that survive cleaning ('@' is deliberately not included).
# Built once here instead of on every call: the check runs for each character of each caption.
_EXTRA_ALLOWED_CHARS = frozenset('#')

def is_allowed_chars(char: str):
    """Return True if `char` is kept by the cleaner.

    Kept characters are letters, digits, whitespace and the extra symbols
    listed in `_EXTRA_ALLOWED_CHARS`.
    """
    return char.isalpha() or char.isdigit() or char.isspace() or char in _EXTRA_ALLOWED_CHARS

def is_allowed_word(word: str):
    """Return True for tokens worth keeping: non-empty and not made of digits only."""
    if word == '':
        return False
    return not word.isdigit()

def preprocessor(s: str):
    """Normalise one caption and return it as space-separated tokens.

    The text is lower-cased, passed through `trans_rules`, stripped of every
    character rejected by `is_allowed_chars`, has its whitespace runs
    collapsed by `strip_multiple_whitespaces`, and finally loses every token
    rejected by `is_allowed_word`.
    """
    # lower case first, then the character substitutions
    translated = s.lower().translate(trans_rules)
    # keep only the allowed characters
    kept_chars = ''.join(filter(is_allowed_chars, translated))
    # collapse whitespace runs so that tokens are separated by one space
    collapsed = strip_multiple_whitespaces(kept_chars)
    # drop the tokens that are not allowed (empty or digits only)
    kept_words = filter(is_allowed_word, collapsed.split(' '))
    return ' '.join(kept_words)


# Number of posts kept per city after cleaning.
cities_counted_posts = {city: 0 for city in cities}
old_time = time.time()
start_time = old_time

for city in cities:
    for year in years:
        # Load the raw posts. NOTE(review): dropna() removes a row when ANY column
        # is missing, not only the caption - confirm this is intended.
        raw_posts = pd.read_csv(csv_path(RAW_CAPTION_PATH, city, year)).dropna()
        # Clean the caption text.
        cleaned = raw_posts['caption'].apply(preprocessor)
        # Keep only captions that still contain at least MIN_DOCUMENT_SIZE words.
        long_enough = cleaned.map(lambda t: len(t.split())) >= MIN_DOCUMENT_SIZE
        # assign() returns a new frame, so adding the 'lang' column below does not
        # write into a filtered slice of the loaded frame (no SettingWithCopyWarning).
        df = raw_posts[long_enough].assign(caption=cleaned[long_enough])

        # First label returned by the language model for every caption.
        df['lang'] = [lang.predict(s)[0][0] for s in df['caption']]

        cities_counted_posts[city] += len(df)

        # save results
        df.to_csv(csv_path(ClEAN_CAPTION_PATH, city, year), index=False)
        print(f"finished {city} {year}")

    print(f'finished {city}; time on stage: {time.time() - old_time}; counted posts: {cities_counted_posts[city]}')
    old_time = time.time()

print(f'completed with time: {time.time() - start_time}')
'8441'

finished spb 2017
finished spb 2018
finished spb 2019
finished spb 2020
finished spb; time on stage: 3446.691294670105; counted posts: 16632760
finished moscow 2017
finished moscow 2018
finished moscow 2019
finished moscow 2020
finished moscow; time on stage: 4696.5913808345795; counted posts: 19327608
completed with time: 8143.283170223236


'8441'

Вместо лемматизации всех документов по отдельности можно обработать весь текст, затем составить словарь всех слов, лемматизировать каждое слово по одному разу и перезаписать текст постов

Algorithm: lemmatize the captions using a dictionary of all words in the dataset

In [8]:
# fast lemmatize
from collections import defaultdict, Counter
import time
from pymystem3 import Mystem # for normalization of text
my_stem = Mystem()

start_time = time.time()
old_time = time.time()

global_words = defaultdict(int)      # word -> total number of usages
global_hashtags = defaultdict(int)   # token starting with '#'/'@' -> total number of usages
idf = defaultdict(int)               # token -> number of captions containing it
idf_lemmatized = defaultdict(int)    # filled later, while the captions are rewritten
words_count = 0
valid_langs = {'__label__ru', '__label__en'}


# load all words to dictionary
for city, year in files:
    df = pd.read_csv(csv_path(ClEAN_CAPTION_PATH, city, year)).dropna()
    # only posts labelled with one of the valid languages feed the dictionary
    df = df[df.lang.isin(valid_langs)]
    for caption in df['caption']:
        # split() (as in lemmatize_caption) never yields empty tokens, so the
        # first-character test below cannot fail on repeated or edge spaces
        words = caption.split()
        words_count += len(words)
        for word, count in Counter(words).items():
            target = global_hashtags if word[0] in ('#', '@') else global_words
            target[word] += count
            # one increment per caption, whatever the in-caption count
            idf[word] += 1

    print(f'loaded words from {city} {year}')

print(f'time on stage: {time.time() - old_time}')
old_time = time.time()



# Lemmatize every distinct word once. The last element returned by
# lemmatize() is dropped (it is the trailing '\n') and the rest is glued together.
word_to_lem = {word: ''.join(my_stem.lemmatize(word)[:-1]) for word in global_words}
print(f'finish of lemmatize; time on stage: {time.time() - old_time}')
old_time = time.time()

# Hashtags are not lemmatized: each one maps to itself.
word_to_lem.update((hashtag, hashtag) for hashtag in global_hashtags)

def lemmatize_caption(caption: str):
    """Replace every word of `caption` with its entry in `word_to_lem`.

    Words that are missing from `word_to_lem` are dropped; the result is the
    remaining lemmas joined with single spaces (possibly an empty string).
    """
    lemmas = []
    for word in caption.split():
        if word in word_to_lem:
            lemmas.append(word_to_lem[word])
    return ' '.join(lemmas)

posts_count = 0
# replace all words in the dataset with their lemmas
for city, year in files:
    df = pd.read_csv(csv_path(ClEAN_CAPTION_PATH, city, year)).dropna()
    # Posts are not filtered by language here; words that are missing from
    # word_to_lem are simply removed by lemmatize_caption.
    df = df.assign(caption=df['caption'].apply(lemmatize_caption))
    # lemmatize_caption returns '' (never NaN) when no word survives, so
    # dropna() cannot remove such posts - filter them out explicitly. Otherwise
    # they would be counted in posts_count and add a '' key to idf_lemmatized.
    df = df[df['caption'] != '']
    posts_count += len(df)
    for caption in df['caption']:
        # set(): every lemma is counted once per post
        for word in set(caption.split(' ')):
            idf_lemmatized[word] += 1
    df.to_csv(csv_path(LEM_CAPTION_PATH, city, year), index=False)
    print(f'replaced words for {city} {year}')


print(f'finish; time on stage: {time.time() - old_time}; all time: {time.time() - start_time}')

loaded words from spb 2017
loaded words from spb 2018
loaded words from spb 2019
loaded words from spb 2020
loaded words from moscow 2017
loaded words from moscow 2018
loaded words from moscow 2019
loaded words from moscow 2020
time on stage: 1694.864534854889
finish of lemmatize; time on stage: 689.824747800827
replaced words for spb 2017
replaced words for spb 2018
replaced words for spb 2019
replaced words for spb 2020
replaced words for moscow 2017
replaced words for moscow 2018
replaced words for moscow 2019
replaced words for moscow 2020
finish; time on stage: 1827.7670729160309; all time: 4212.4567086696625


In [11]:
# save dictionary 
import math

# Merge the hashtag counters into the word counters so that one table covers both.
for hashtag, usages in global_hashtags.items():
    global_words[hashtag] = usages

# Total number of usages per lemma, summed over all words that map to it.
global_lems = defaultdict(int)
for word, usages in global_words.items():
    global_lems[word_to_lem[word]] += usages

columns = ['word', 'idf', 'count_usages', 'lemmatized', 'idf_lem', 'count_usages_lem']

def make_row(word: str):
    """Build the dictionary row for one word.

    Returns the tuple (word, idf of the word, usages of the word, lemma,
    idf of the lemma, usages of the lemma), where an idf is the natural
    logarithm of `posts_count` divided by the matching document counter.
    """
    lem = word_to_lem[word]
    return (
        word,
        math.log(posts_count / idf[word]),
        global_words[word],
        lem,
        math.log(posts_count / idf_lemmatized[lem]),
        global_lems[lem],
    )

rows = list(map(make_row, global_words))

# Full table: one row per word.
df = pd.DataFrame(rows, columns=columns)
# Lemma table: the three lemma columns of the full table, one row per distinct combination.
df_lem = df[columns[3:]].drop_duplicates()
df.to_csv(CAPTION_PATH + 'dict.csv', index=False)
df_lem.to_csv(CAPTION_PATH + 'dict_lem.csv', index=False)

In [17]:
key = 'gfhj'
# Drop a leading '#' or '@' marker, if there is one, to get the bare word.
word = key[1:] if key[0] in ('#', '@') else key
word

'gfhj'

In [41]:
# fast lemmatize
from collections import defaultdict, Counter
import time
from pymystem3 import Mystem # for normalization of text
my_stem = Mystem()

start_time = time.time()
old_time = time.time()

global_words = defaultdict(int)     # word (without leading '#'/'@') -> total number of usages
idf = defaultdict(int)              # word -> number of captions containing it
idf_lemmatized = defaultdict(int)   # filled later, while the captions are rewritten
words_count = 0
valid_langs = {'__label__ru', '__label__en'}


# load all words to dictionary
for city, year in files:
    df = pd.read_csv(csv_path(ClEAN_CAPTION_PATH, city, year)).dropna()
    # only posts labelled with one of the valid languages feed the dictionary
    df = df[df.lang.isin(valid_langs)]
    for caption in df['caption']:
        # split() never yields empty tokens, so token[0] below is always valid
        tokens = caption.split()
        words_count += len(tokens)
        # Count per caption AFTER removing the marker: 'foo' and '#foo' in the
        # same caption are one word and must raise idf['foo'] only once.
        counts = Counter()
        for token in tokens:
            word = token[1:] if token[0] in ('#', '@') else token
            if word:  # a bare '#' leaves nothing behind - not a word
                counts[word] += 1
        for word, count in counts.items():
            global_words[word] += count
            idf[word] += 1

    print(f'loaded words from {city} {year}')

print(f'time on stage: {time.time() - old_time}')
old_time = time.time()



# Lemmatize every distinct word once. The last element returned by
# lemmatize() is dropped (it is the trailing '\n') and the rest is glued together.
word_to_lem = {word: ''.join(my_stem.lemmatize(word)[:-1]) for word in global_words}
print(f'finish of lemmatize; time on stage: {time.time() - old_time}')
old_time = time.time()

def lemmatize_caption(caption: str):
    """Map each word of `caption` through `word_to_lem`, skipping unknown words.

    Returns the lemmas joined with single spaces; the result is an empty
    string when none of the words is in `word_to_lem`.
    """
    known_words = (word for word in caption.split() if word in word_to_lem)
    return ' '.join(word_to_lem[word] for word in known_words)

posts_count = 0
# Lemmatize every caption (with the '#'/'@' markers removed) and count, for
# each lemma, the number of posts it occurs in. Posts are not filtered by language.
for city, year in files:
    df = pd.read_csv(csv_path(ClEAN_CAPTION_PATH, city, year)).dropna()
    df['caption'] = df['caption'].map(
        lambda s: lemmatize_caption(s.replace('@', '').replace('#', ''))
    )
    # NOTE(review): lemmatize_caption returns '' (never NaN) for a caption with no
    # known words, so this dropna() removes nothing; such posts are still counted
    # in posts_count and add a '' key to idf_lemmatized - confirm this is intended.
    df = df.dropna()
    posts_count += len(df)
    for caption in df['caption']:
        # set(): every lemma is counted once per post
        for lemma in set(caption.split(' ')):
            idf_lemmatized[lemma] += 1


import math

# Total number of usages per lemma, summed over all words that map to it.
global_lems = defaultdict(int)
for word, usages in global_words.items():
    global_lems[word_to_lem[word]] += usages

columns = ['word', 'idf', 'count_usages', 'lemmatized', 'idf_lem', 'count_usages_lem']

def make_row(word: str):
    """Assemble one row of the word dictionary.

    The row is (word, idf of the word, usages of the word, lemma, idf of the
    lemma, usages of the lemma); each idf is the natural logarithm of
    `posts_count` divided by the matching document counter.
    """
    lem = word_to_lem[word]
    idf_of_word = math.log(posts_count / idf[word])
    idf_of_lem = math.log(posts_count / idf_lemmatized[lem])
    row = (word, idf_of_word, global_words[word])
    row += (lem, idf_of_lem, global_lems[lem])
    return row

rows = list(map(make_row, global_words))

# Full table: one row per word.
df = pd.DataFrame(rows, columns=columns)
# Lemma table: the three lemma columns of the full table, one row per distinct combination.
df_lem = df[columns[3:]].drop_duplicates()
df.to_csv(CAPTION_PATH + 'dict_without_hash.csv', index=False)
df_lem.to_csv(CAPTION_PATH + 'dict_lem_without_hash.csv', index=False)

loaded words from spb 2017
loaded words from spb 2018
loaded words from spb 2019
loaded words from spb 2020
loaded words from moscow 2017
loaded words from moscow 2018
loaded words from moscow 2019
loaded words from moscow 2020
time on stage: 1757.6985099315643
finish of lemmatize; time on stage: 1086.2971937656403


In [23]:
# Spot check: lemma stored for this word form (raises KeyError if the word is not in the dictionary).
word_to_lem['сондляслабаков']

'сондляслабак'

In [36]:
# Spot check: number of posts containing this lemma.
# .get() keeps the lookup read-only: indexing a defaultdict would insert the key if it were missing.
idf_lemmatized.get('сондляслабак', 0)

1116

In [37]:
# Spot check: usage count of the lemma as a raw word.
# .get() keeps the lookup read-only: indexing the defaultdict would add this
# missing key to global_words, and later loops over global_words would then
# look it up in word_to_lem, where it does not exist.
global_words.get('сондляслабак', 0)

0

In [40]:
# Spot check repeated after the re-run: lemma stored for this word form (KeyError if absent).
word_to_lem['сондляслабаков']

'сондляслабак'