In [21]:
from pymystem3 import Mystem
my_stem = Mystem()

import nltk
import time

from ipywidgets import IntProgress
from IPython.display import display

# Make sure the NLTK stopword corpus is available locally; fetch it on first use.
try:
    nltk.corpus.stopwords.words('english')
except LookupError:
    # NLTK signals a missing corpus with LookupError; any other error should surface.
    nltk.download('stopwords')

import pandas as pd
from collections import defaultdict

import gensim.models

# Folder layout: raw captions are the input, cleaned and lemmatized captions the outputs.
DATA_PATH = '/data/'
CAPTION_PATH = DATA_PATH + 'captions/'
RAW_CAPTION_PATH = CAPTION_PATH + 'raw/'

# Switches stopword filtering on; filtered results are kept in separate '*_sw' folders.
USE_STOPWORDS = False

ClEAN_CAPTION_PATH = CAPTION_PATH + ('clean_sw/' if USE_STOPWORDS else 'clean/')
LEM_CAPTION_PATH = CAPTION_PATH + ('lem_sw/' if USE_STOPWORDS else 'lem/')

cities = ['spb', 'moscow', 'nyc', 'london']
years = ['2016', '2017', '2018', '2019', '2020']
# Every [city, year] combination, grouped by city in the order listed above.
files = [[city, year] for city in cities for year in years]

def csv_path(path, city, year):
    """Return the path of the CSV file holding the posts of one city and year.

    path: prefix that is prepended verbatim (no separator is inserted)
    city, year: strings; the file name is '<city>_posts_<year>.csv'
    """
    file_name = ''.join((city, '_posts_', year, '.csv'))
    return path + file_name


import fasttext

# Language-identification model loaded with fastText; `lang.predict` is used to label posts.
# The path is built from DATA_PATH so that it follows the shared data root
# (it resolves to the same '/data/source/lid.176.bin' as before).
PRETRAINED_MODEL_PATH = DATA_PATH + 'source/lid.176.bin'
lang = fasttext.load_model(PRETRAINED_MODEL_PATH)
print('success')

success




The cleaning cell below:
    1. replaces '\n', tab symbols and other whitespace symbols with a simple space
    2. removes duplicated spaces
    3. removes all symbols which aren't a letter or a space (exception: numbers inside a word, e.g. '8марта')
    4. detects the language of each post
    5. removes posts without words (posts with fewer than `MIN_DOCUMENT_SIZE` words are dropped)

In [7]:
# Widen the notebook container to 80% of the window.
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [22]:
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces


# Posts with fewer words than this (after cleaning) are discarded.
MIN_DOCUMENT_SIZE = 5

# Words dropped by `preprocessor`; stays empty unless USE_STOPWORDS is enabled.
filter_words = set()
if USE_STOPWORDS:
    # load stopwords from nltk
    filter_words.update(stopwords.words('english'))
    filter_words.update(stopwords.words('russian'))
    # load own stopwords: whitespace-separated words, any number per line.
    # The encoding is explicit so that non-ASCII entries do not depend on the
    # platform default.
    # NOTE(review): assumes stopwords_full.txt is UTF-8 encoded - confirm.
    with open('stopwords_full.txt', encoding='utf-8') as f:
        for line in f:
            filter_words.update(line.split())


def preprocessor(s: str):
    """Normalize one caption and return the cleaned text.

    Steps, in order:
      1. lower-case the text;
      2. turn newlines, non-breaking spaces and underscores into spaces and
         put a space in front of every '#' (hashtags are often glued together);
      3. delete every character that is not a letter, a digit, '#' or
         whitespace (deleted characters are not replaced by a space);
      4. collapse whitespace runs into a single space;
      5. drop empty tokens, purely numeric tokens and tokens that are in the
         module-level `filter_words` set.
    """
    text = s.lower()
    text = text.replace("\n", " ")
    text = text.replace('\xa0', ' ')
    text = text.replace('#',  ' #')
    text = text.replace('_', ' ')

    text = ''.join(
        ch for ch in text
        if ch.isalpha() or ch.isdigit() or ch == '#' or ch.isspace()
    )
    text = strip_multiple_whitespaces(text)

    kept_tokens = [
        token for token in text.split(' ')
        if token and token not in filter_words and not token.isdigit()
    ]
    return ' '.join(kept_tokens)


# Clean every raw caption file, label each remaining post with its language and
# store the result under ClEAN_CAPTION_PATH. The number of kept posts is
# accumulated per city.
cities_counted_posts = {name: 0 for name in cities}
start_time = time.time()
old_time = time.time()

for city in cities:
    for year in years:
        # read the raw posts; rows containing any missing value are dropped
        df = pd.read_csv(csv_path(RAW_CAPTION_PATH, city, year)).dropna()
        # normalize the caption text
        df['caption'] = df['caption'].apply(preprocessor)
        # keep only posts that still contain at least MIN_DOCUMENT_SIZE words
        word_counts = df['caption'].map(lambda t: len(t.split()))
        df = df[word_counts >= MIN_DOCUMENT_SIZE]

        # first label returned by the language model for every kept caption
        df['lang'] = [lang.predict(caption)[0][0] for caption in df['caption']]

        cities_counted_posts[city] += len(df)

        # write the cleaned file
        df.to_csv(csv_path(ClEAN_CAPTION_PATH, city, year), index=False)
        print(f"finished {city} {year}")

    print(f'finished {city}; time on stage: {time.time() - old_time}; counted posts: {cities_counted_posts[city]}')
    old_time = time.time()

print(f'completed with time: {time.time() - start_time}')

finished spb 2016
finished spb 2017
finished spb 2018
finished spb 2019
finished spb 2020
finished spb; time on stage: 2832.1211614608765; counted posts: 13319020
finished moscow 2016
finished moscow 2017
finished moscow 2018
finished moscow 2019
finished moscow 2020
finished moscow; time on stage: 4025.920019865036; counted posts: 16346499
finished nyc 2016
finished nyc 2017
finished nyc 2018
finished nyc 2019
finished nyc 2020
finished nyc; time on stage: 2517.5023498535156; counted posts: 20386445
finished london 2016
finished london 2017
finished london 2018
finished london 2019
finished london 2020
finished london; time on stage: 1131.6131975650787; counted posts: 9041301
completed with time: 10507.157339572906


Вместо лемматизации всех документов по отдельности можно обработать весь текст, затем составить словарь всех слов, лемматизировать каждое слово по одному разу и перезаписать текст постов.

Lemmatization algorithm that uses a dictionary of all words in the dataset

In [23]:
# fast lemmatize
from collections import defaultdict
import time

# Fast lemmatization: every distinct word of the corpus is lemmatized exactly
# once and the captions are then rewritten through the resulting lookup table.
start_time = time.time()
old_time = time.time()

d = defaultdict(int)  # word -> number of occurrences in the processed posts
words_count = 0       # total number of tokens seen
valid_langs = set(['__label__ru', '__label__en'])  # only posts with these language labels are processed

# load all words to dictionary
for city, year in files:
    df = pd.read_csv(csv_path(ClEAN_CAPTION_PATH, city, year)).dropna()
    df = df[df.lang.isin(valid_langs)]
    for text in df['caption']:
        # split() without arguments, exactly like the rewrite pass below, so
        # every token looked up in d_lem later is guaranteed to be counted here
        words = text.split()
        words_count += len(words)
        for word in words:
            d[word] += 1

    print(f'loaded words from {city} {year}')

print(f'dictionary size: {len(d)}, words count: {words_count}, time on stage: {time.time() - old_time}')
old_time = time.time()


# create lemmatized dictionary: word -> lemma
# [:-1] drops the trailing '\n' element that Mystem appends to its result
d_lem = {word: ''.join(my_stem.lemmatize(word)[:-1]) for word in d}
print(f'finish of lemmatize; time on stage: {time.time() - old_time}')
old_time = time.time()


# save dictionary, one "<word>,<lemma>,<count>" line per distinct word.
# `with` closes the file even if a write fails; the encoding is explicit so the
# non-ASCII words do not depend on the platform default.
dictionary_path = DATA_PATH + ('dictionary_sw.txt' if USE_STOPWORDS else 'dictionary.txt')
with open(dictionary_path, 'w', encoding='utf-8') as f:
    for word in d:
        f.write(word + ',' + d_lem[word] + ',' + str(d[word]) + '\n')

print('finished saving of dictionary')

# replace all words in dataset with their lemmas
for city, year in files:
    df = pd.read_csv(csv_path(ClEAN_CAPTION_PATH, city, year)).dropna()
    df = df[df.lang.isin(valid_langs)]
    df['caption'] = df['caption'].apply(
        lambda caption: ' '.join(d_lem[word] for word in caption.split())
    )
    df.to_csv(csv_path(LEM_CAPTION_PATH, city, year), index=False)
    print(f'replaced words for {city} {year}')


# old_time was last reset before the dictionary was saved, so this stage time
# covers both saving the dictionary and rewriting the captions
print(f'finish; time on stage: {time.time() - old_time}; all time: {time.time() - start_time}')

loaded words from spb 2016
loaded words from spb 2017
loaded words from spb 2018
loaded words from spb 2019
loaded words from spb 2020
loaded words from moscow 2016
loaded words from moscow 2017
loaded words from moscow 2018
loaded words from moscow 2019
loaded words from moscow 2020
loaded words from nyc 2016
loaded words from nyc 2017
loaded words from nyc 2018
loaded words from nyc 2019
loaded words from nyc 2020
loaded words from london 2016
loaded words from london 2017
loaded words from london 2018
loaded words from london 2019
loaded words from london 2020
dictionary size: 34414437, words count: 2383693078, time on stage: 1221.226318359375
finish of lemmatize; time on stage: 1510.222708940506
finished saving of dictionary
replaced words for spb 2016
replaced words for spb 2017
replaced words for spb 2018
replaced words for spb 2019
replaced words for spb 2020
replaced words for moscow 2016
replaced words for moscow 2017
replaced words for moscow 2018
replaced words for moscow 20

In [11]:
# Merge the word counts by lemma: lemma -> total occurrences of all its word forms.
unic_words = defaultdict(int)
words_count = 0
for word_form, lemma in d_lem.items():
    unic_words[lemma] += d[word_form]

# number of distinct lemmas, then number of distinct word forms
print(len(unic_words))
print(len(d_lem))

23821976
25676956


In [15]:
# Total number of posts kept after cleaning, over all cities.
# A dedicated name is used: assigning the result to `sum` would shadow the
# builtin for every cell executed afterwards.
total_posts = sum(cities_counted_posts.values())
print(total_posts)

66280588


In [17]:
# Rank the lemmas by total frequency, most frequent first.
l = sorted(unic_words.items(), key=lambda item: item[1], reverse=True)

# vocabulary size before and after dropping lemmas seen fewer than 5 times
print(len(l))
l = [item for item in l if item[1] >= 5]
print(len(l))

23821976
4779940


In [2]:
# Rebuild the lemma frequencies from the saved dictionary file.
# Each line is read as "<word>,<lemma>,<count>".
# NOTE(review): assumes the dictionary file is UTF-8 encoded - confirm.
with open(DATA_PATH + 'dictionary.txt', encoding='utf-8') as f:
    d_list = [line.split(',') for line in f]


# NOTE: in this cell `d_lem` maps lemma -> total count (not word -> lemma).
d_lem = defaultdict(int)


for note in d_list:
    # int() tolerates the trailing newline left on the last field
    d_lem[note[1]] += int(note[2])

# most frequent lemmas first
l = sorted(d_lem.items(), key=lambda item: item[1], reverse=True)

print(len(l))
# NOTE(review): strict '>' here, while the other frequency filters in this
# notebook use '>= 5' - confirm which threshold is intended.
l = [item for item in l if item[1] > 5]
print(len(l))

# show up to 500 of the most frequent lemmas; slicing avoids an IndexError
# when fewer than 500 entries pass the filter
for i, item in enumerate(l[:500]):
    print(i, item)


28118041
4624285
0 ('и', 47123490)
1 ('в', 40782602)
2 ('the', 25321907)
3 ('на', 21778329)
4 ('and', 17582960)
5 ('не', 17461713)
6 ('to', 17247785)
7 ('с', 17046286)
8 ('a', 15060836)
9 ('я', 14611226)
10 ('что', 13346967)
11 ('быть', 12329507)
12 ('of', 11710607)
13 ('это', 10995701)
14 ('вы', 10988061)
15 ('in', 10742080)
16 ('i', 10707567)
17 ('все', 10513121)
18 ('по', 10127681)
19 ('для', 9605328)
20 ('for', 9599197)
21 ('а', 9162292)
22 ('мы', 9158646)
23 ('you', 9025828)
24 ('s', 7504245)
25 ('как', 7453340)
26 ('то', 6994410)
27 ('with', 6958631)
28 ('is', 6382674)
29 ('it', 6364832)
30 ('this', 6248305)
31 ('my', 6125078)
32 ('от', 5921143)
33 ('on', 5799243)
34 ('за', 5444764)
35 ('at', 5426803)
36 ('у', 5334559)
37 ('но', 5258720)
38 ('из', 5241492)
39 ('или', 5215885)
40 ('наш', 5213585)
41 ('к', 5121083)
42 ('свой', 4999094)
43 ('we', 4945826)
44 ('он', 4928514)
45 ('день', 4700193)
46 ('который', 4685810)
47 ('они', 4307775)
48 ('мой', 4113984)
49 ('год', 4067761)
50 ('

In [3]:
# Rank the entries of d_lem by their value, largest first.
l = sorted(d_lem.items(), key=lambda item: item[1], reverse=True)

# size before and after dropping entries whose value is below 5
print(len(l))
l = [item for item in l if item[1] >= 5]
print(len(l))

28118041
5313071


preprocessing for events

In [3]:
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum

# Input and output folders of the event descriptions.
EVENTS_RAW = DATA_PATH + 'events/raw/'
EVENTS_CLEANED = DATA_PATH + 'events/cleaned/'

# Characters that `cleaner` replaces with a space in addition to the
# non-alphanumeric ones.
useless_symbols = {'_', 'ー'}

def cleaner(s: str):
    """Normalize one event description and return the cleaned text.

    The text is lower-cased, passed through gensim's `strip_non_alphanum`,
    every character listed in `useless_symbols` is replaced by a space
    (`strip_non_alphanum` does not remove '_'), whitespace runs are collapsed
    and purely numeric tokens are dropped.

    Tokens are split on single spaces and only numeric ones are filtered, so a
    leading or trailing space left by the previous steps is preserved.
    """
    text = strip_non_alphanum(s.lower())
    text = ''.join(' ' if ch in useless_symbols else ch for ch in text)
    text = strip_multiple_whitespaces(text)
    return ' '.join(token for token in text.split(' ') if not token.isdigit())


# Read the raw event tables of both cities and stack them into one frame.
event_frames = [
    pd.read_csv(EVENTS_RAW + 'spb_events.csv'),
    pd.read_csv(EVENTS_RAW + 'moscow_events.csv'),
]
df = pd.concat(event_frames)
# the cleaned text goes into a new 'description' column; 'captions' is kept as is
df['description'] = df['captions'].apply(cleaner)

# write the combined, cleaned table
df.to_csv(EVENTS_CLEANED + 'events.csv', index=False)
print("success")

success


In [17]:
# Stopword-filtered variant of the event descriptions.
filter_words = set()
# load stopwords from nltk
filter_words.update(stopwords.words('english'))
filter_words.update(stopwords.words('russian'))
# load own stopwords: whitespace-separated words, any number per line.
# The encoding is explicit so that non-ASCII entries do not depend on the
# platform default.
# NOTE(review): assumes stopwords_full.txt is UTF-8 encoded - confirm.
with open('stopwords_full.txt', encoding='utf-8') as f:
    for line in f:
        filter_words.update(line.split())

df['description'] = df['description'].apply(
    lambda s: ' '.join(w for w in s.split() if w not in filter_words)
)
# NOTE(review): EVENTS_PATH is not defined in any visible cell of this notebook;
# it has to exist in the kernel for this line to run - confirm its value.
df.to_csv(EVENTS_PATH + 'events_clean_sw.csv', index=False)

In [21]:
# Load the word -> lemma table from the saved dictionary file.
# Each line is read as "<word>,<lemma>,<count>"; only the first two fields are used.
# NOTE(review): assumes the dictionary file is UTF-8 encoded - confirm.
with open(DATA_PATH + 'dictionary.txt', encoding='utf-8') as f:
    d_list = [line.split(',') for line in f]

d_lem = defaultdict(str)
for note in d_list:
    d_lem[note[0]] = note[1]

def lemmatize(word: str):
    """Return the lemma of `word`.

    The lemma is taken from the `d_lem` lookup table when the word is present
    there; otherwise the word is lemmatized with Mystem.
    """
    if word in d_lem:
        return d_lem[word]
    # Mystem appends a '\n' element to its result; strip it so the lemma does
    # not inject line breaks into the re-joined description.
    return ''.join(my_stem.lemmatize(word)).strip()

# Lemmatize the descriptions of both event files (with and without stopword
# filtering) and store each result next to its source.
# NOTE(review): EVENTS_PATH is not defined in any visible cell, and no visible
# cell writes 'events_clean.csv' - confirm where these come from.
for source_name, target_name in (
    ('events_clean_sw.csv', 'events_lem_sw.csv'),
    ('events_clean.csv', 'events_lem.csv'),
):
    df = pd.read_csv(EVENTS_PATH + source_name)
    # an empty description is read back from CSV as NaN; treat it as empty
    # text instead of failing on `.split()`
    df['description'] = df['description'].fillna('').apply(
        lambda s: ' '.join(map(lemmatize, s.split()))
    )
    df.to_csv(EVENTS_PATH + target_name, index=False)

In [24]:
# Lemmatize the descriptions of the events file without stopword filtering.
# NOTE(review): EVENTS_PATH is not defined in any visible cell - confirm its value.
df = pd.read_csv(EVENTS_PATH + 'events_clean.csv')
# an empty description is read back from CSV as NaN; treat it as empty text
# instead of failing on `.split()`
df['description'] = df['description'].fillna('').apply(
    lambda s: ' '.join(map(lemmatize, s.split()))
)
df.to_csv(EVENTS_PATH + 'events_lem.csv', index=False)