In [1]:
import os
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
from collections import defaultdict
import re
import pymorphy2

import json
import pickle

import logging

In [2]:
logging.basicConfig(level=logging.DEBUG)

In [3]:
def multipleReSub(text, patternL, replL):
    for i, pattern in enumerate(patternL):
        text = re.sub(pattern, replL[i], text)
    return text

In [4]:
def preProcessing(plainText):
    ## тут списки общих реплэйсов, DONT FORGET TO UPDATE
    patternL = ["Janice", "Джои"]
    replL = ["Дженис", "Джоуи"]
    plainText = multipleReSub(plainText, patternL, replL)
    plainText = re.sub("[^А-Яа-яЁё\s]", "", plainText)
    plainText = re.sub("[\n\t]", ' ', plainText)
    plainText = re.sub(r"(\S)(\s+)(\S)", r"\1 \3", plainText)
    return plainText.lower()

In [5]:
def fileCollector(d=defaultdict(list)):
    morph = pymorphy2.MorphAnalyzer()
    root = os.path.join(os.getcwd(), "friends-data")
    for seasonFolder in os.listdir(root):
        logging.info("folder processing: %s" % (seasonFolder))
        rootPlus = os.path.join(root, seasonFolder)
        epList = os.listdir(rootPlus)
        epListLength = len(epList)
        for i, episodeTxt in enumerate(epList):
            logging.debug("file processing: %d / %d" % (i, epListLength))
            with open(os.path.join(root, seasonFolder, episodeTxt), 'r', encoding='utf-8') as f:
                plain = f.read()
                plainList = preProcessing(plain).split()
                spacedLemmasStr = ' '.join([morph.parse(word)[0].normal_form for word in plainList])
                epTitle = 'e' + episodeTxt.split('-')[1].strip()
                d[epTitle] = spacedLemmasStr
    return d

In [6]:
vectorizer = CountVectorizer()

## key = episode id, value = str->lemmas
basicDict = fileCollector()

INFO:pymorphy2.opencorpora_dict.wrapper:Loading dictionaries from C:\users\admin\anaconda3\lib\site-packages\pymorphy2_dicts\data
INFO:pymorphy2.opencorpora_dict.wrapper:format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168
INFO:root:folder processing: Friends - season 1
DEBUG:root:file processing: 0 / 20
DEBUG:root:file processing: 1 / 20
DEBUG:root:file processing: 2 / 20
DEBUG:root:file processing: 3 / 20
DEBUG:root:file processing: 4 / 20
DEBUG:root:file processing: 5 / 20
DEBUG:root:file processing: 6 / 20
DEBUG:root:file processing: 7 / 20
DEBUG:root:file processing: 8 / 20
DEBUG:root:file processing: 9 / 20
DEBUG:root:file processing: 10 / 20
DEBUG:root:file processing: 11 / 20
DEBUG:root:file processing: 12 / 20
DEBUG:root:file processing: 13 / 20
DEBUG:root:file processing: 14 / 20
DEBUG:root:file processing: 15 / 20
DEBUG:root:file processing: 16 / 20
DEBUG:root:file processing: 17 / 20
DEBUG:root:file processing: 18 / 20
DEBUG:root:file processing: 19 / 20
INFO:

In [7]:
with open('basicDict.pickle', 'wb') as f:
    pickle.dump(basicDict, f)

#with open('basicDict.pickle', 'rb') as f:
#    basicDict = pickle.load(f)

In [8]:
episodes = list(basicDict.keys())
corpus = [basicDict[ep] for ep in episodes]

In [9]:
X = vectorizer.fit_transform(corpus)

In [None]:
#matrix_freq = np.asarray(X.sum(axis=0)).ravel()
#final_matrix = np.array([np.array(vectorizer.get_feature_names()), matrix_freq])

In [10]:
friendsDF = pd.DataFrame(data=X.toarray(),
                         columns=vectorizer.get_feature_names(),
                         index=episodes)


In [11]:
friendsDF

Unnamed: 0,аа,ааа,аааа,ааааа,ааааааа,аааааау,аарон,аба,аббатство,абонемент,...,ящичек,ёй,ёкнуть,ёлка,ёлочный,ёпэрэсотэ,ёрл,ёрш,ёршик,ёще
e1x01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e1x02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e1x03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e1x04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e1x05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
e7x20,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e7x21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e7x22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
e7x23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
globalDict = defaultdict(list)
letterFlag = ''

for word, row in friendsDF.transpose().iterrows():
    totalNumber = row.sum()
    textList = []
    firstLetter = word[0]
    if letterFlag != firstLetter:
        letterFlag = firstLetter
        amount = len(globalDict)
        logging.debug("now processing: letter \"{0}\". words processed: {1}".format(letterFlag, amount))
    for episode in episodes:
        ## interrows() transfers int to float64,
        ## which may result lead to 0 becoming 0.000000001
        if row[episode] >= 1:
            textList.append(basicDict[episode])
    globalDict[word] = [textList, int(totalNumber)] ## для джсон

DEBUG:root:now processing: letter "а". words processed: 0
DEBUG:root:now processing: letter "б". words processed: 279
DEBUG:root:now processing: letter "в". words processed: 918
DEBUG:root:now processing: letter "г". words processed: 1936
DEBUG:root:now processing: letter "д". words processed: 2363
DEBUG:root:now processing: letter "е". words processed: 3082
DEBUG:root:now processing: letter "ж". words processed: 3130
DEBUG:root:now processing: letter "з". words processed: 3270
DEBUG:root:now processing: letter "и". words processed: 3885
DEBUG:root:now processing: letter "й". words processed: 4182
DEBUG:root:now processing: letter "к". words processed: 4197
DEBUG:root:now processing: letter "л". words processed: 5148
DEBUG:root:now processing: letter "м". words processed: 5521
DEBUG:root:now processing: letter "н". words processed: 6198
DEBUG:root:now processing: letter "о". words processed: 7080
DEBUG:root:now processing: letter "п". words processed: 8031
DEBUG:root:now processing: le

In [22]:
with open('dictionary.pickle', 'wb') as f:
    pickle.dump(globalDict, f)

with open('dictionary.json', 'w', encoding='utf-8') as f:
    json.dump(globalDict, f)
    
friendsDF.to_pickle(path='matrix.pickle')
friendsDF.to_json(path_or_buf='matrix.json', orient="split")

In [29]:
maxFreq = -1
minFreq = -1
maxFreqWord = ""
minFreqWord = ""

for word in globalDict:
    occurrence = globalDict[word][1]
    if maxFreq == -1:
        maxFreq = occurrence
        maxFreqWord = word
    else:
        if maxFreq <= occurrence:
            maxFreq = occurrence
            maxFreqWord = word      
    if minFreq == -1:
        minFreq = occurrence
        minFreqWord = word
    else:
        if minFreq >= occurrence:
            minFreq = occurrence
            minFreqWord = word

In [30]:
print("a) самое частотное слово:\t{0}, {1} вхождений".format(maxFreqWord, maxFreq))
print("б) самое редкое слово:\t{0}, {1} вхождений".format(minFreqWord, minFreq))


a) самое частотное слово:	ты, 10970 вхождений
б) самое редкое слово:	ёще, 1 вхождений


In [33]:
amountOfDocs = len(friendsDF.index)
inEveryDocPresent = []

for word in globalDict:
    if len(globalDict[word][0]) == amountOfDocs:
        inEveryDocPresent.append(word)

In [34]:
print("c) какой набор слов есть во всех документах коллекции")
print("\t"+ "\n\t".join(inEveryDocPresent))

c) какой набор слов есть во всех документах коллекции
	быть
	весь
	да
	думать
	если
	ещё
	знать
	как
	мой
	мочь
	мы
	на
	не
	нет
	но
	ну
	он
	она
	просто
	так
	такой
	тот
	ты
	хотеть
	что
	это
	этот


In [36]:
morph = pymorphy2.MorphAnalyzer()
characters = [
    "Моника",
    "Рэйчел",
    "Чендлер",
    "Фиби",
    "Росс",
    "Джоуи"
]

lemmaCharacters = [morph.parse(name.lower())[0].normal_form for name in characters]

nMaxFreq = 0
nMaxName = ""
for name in lemmaCharacters:
    freq = globalDict[name][1]
    if nMaxFreq < freq:
        nMaxFreq = freq
        nMaxName = name
        
print("d) кто из главных героев статистически самый популярный?")
print(nMaxName, freq)

INFO:pymorphy2.opencorpora_dict.wrapper:Loading dictionaries from C:\users\admin\anaconda3\lib\site-packages\pymorphy2_dicts\data
INFO:pymorphy2.opencorpora_dict.wrapper:format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


d) кто из главных героев статистически самый популярный?
росс 745
