In [None]:
import nltk.corpus
import mysql_credits # Доступ к БД
import pymysql
import collections, re
import operator

import math
from textblob import TextBlob as tb
from pprint import pprint


# Connect to the (divine, free) MySQL database.
# DictCursor returns each fetched row as a dict keyed by column name.
connection = pymysql.connect(
    host=mysql_credits.db_host,
    user=mysql_credits.db_user,
    password=mysql_credits.db_password,
    db=mysql_credits.db,
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)

# NLTK's Russian stop-word list, extended with extra noise tokens
# (abbreviations, URL fragments, year-related words).
stopwords = nltk.corpus.stopwords.words('russian')
stopwords.extend([
    'г', 'года', 'это', 'году', 'http', 'https', 'лет', 'ул', 'д', 'www', 'г', 'ru', 'm', 'н',
])

# String forms of the integers 1 through 50.
numbers = [str(value) for value in range(1, 51)]
# Single-pass cleanup table: every listed punctuation character becomes a
# space, every ASCII digit is deleted outright.
_CLEANUP_TABLE = str.maketrans({
    **dict.fromkeys('"«»@#$%^&*()-+,.?/\\_=<>!:;|\'', ' '),
    **dict.fromkeys('0123456789'),
})


def remove_stop_word(string_to_make_clear, stop_words=None):
    """Normalize a text and drop stop words from it.

    The text is lower-cased, a fixed set of punctuation characters is replaced
    by spaces, all ASCII digits are removed, and the remaining
    whitespace-separated tokens are filtered against the stop-word list.

    Digits are removed per character, so '0' is stripped as well. The previous
    substring-based approach removed '1'..'9' first and therefore left zeros
    behind (e.g. "2010" became "00").

    Args:
        string_to_make_clear: The raw text to clean.
        stop_words: Optional iterable of words to drop. Defaults to the
            module-level ``stopwords`` list.

    Returns:
        The remaining tokens joined by single spaces (an empty string if
        nothing is left).
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership tests for every token.
    excluded = frozenset(stop_words)
    cleaned = string_to_make_clear.lower().translate(_CLEANUP_TABLE)
    return ' '.join(word for word in cleaned.split() if word not in excluded)

def show_words_frequency():
    """Append the corpus word frequencies to ``bagofwords.csv``.

    Fetches the cleaned corpus text, counts every ``\\w+`` token, and writes
    one ``word;count`` line per distinct word, most frequent first. The file
    is opened in append mode, so repeated runs add to the existing content.
    A progress line is printed for every row written.
    """
    text_data = get_all_data_as_raw_text()
    # Count tokens of the fetched text. The previous code referenced an
    # undefined name here and raised NameError.
    # most_common() sorts by count, descending, keeping first-seen order on ties.
    bagsofwords = collections.Counter(re.findall(r'\w+', text_data)).most_common()
    total = len(bagsofwords)
    # Explicit UTF-8 so Cyrillic words do not depend on the platform's
    # default encoding.
    with open("bagofwords.csv", "a", encoding="utf-8") as myfile:
        for j, (word, count) in enumerate(bagsofwords):
            myfile.write(word + ';' + str(count) + '\n')
            print('Writing to file: ' + str(j) + "/" + str(total))

def get_all_data_as_raw_text():
    """Return every "vk" article, cleaned, concatenated into one string.

    Each article text is passed through ``remove_stop_word`` and prefixed
    with a single space, so the result starts with a space when at least one
    article exists and is an empty string when there are none.
    """
    all_articles_sql = 'SELECT article.article_text FROM article WHERE article_origin = "vk"'
    with connection.cursor() as cursor:
        cursor.execute(all_articles_sql)
        articles = cursor.fetchall()
    # join() builds the result in one pass; repeated `s = s + ...` copies the
    # growing string on every iteration.
    return ''.join(
        ' ' + remove_stop_word(article['article_text']) for article in articles
    )


def get_list_of_all_articles_as_documents():
    """Return every "vk" article as its own cleaned TextBlob document.

    Each article text is run through ``remove_stop_word`` and wrapped in a
    TextBlob; the list follows the order in which the rows were fetched.
    """
    query = 'SELECT article.article_text FROM article WHERE article_origin = "vk"'
    with connection.cursor() as cursor:
        cursor.execute(query)
        rows = cursor.fetchall()
        return [tb(remove_stop_word(row['article_text'])) for row in rows]
    
def get_text_data_by_years():
    """Return one cleaned TextBlob document per publication year.

    Queries the distinct years of ``article.article_date``, then for each
    year concatenates the cleaned text of all its articles (each prefixed by
    a space) into a single TextBlob.

    Returns:
        A list of dicts ``{'year': <year>, 'articles': <TextBlob>}``, in the
        order the years were returned by the database.
    """
    all_years_sql = 'SELECT YEAR(article.article_date) as `year` FROM article GROUP BY YEAR(article.article_date)'
    # The year is bound as a query parameter instead of being concatenated
    # into the SQL string.
    articles_of_year_sql = 'SELECT article.article_text FROM article WHERE YEAR(article.article_date) = %s'
    articles_by_year = []
    with connection.cursor() as cursor:
        cursor.execute(all_years_sql)
        years = [row['year'] for row in cursor.fetchall()]
        for year in years:
            if year is None:
                # Articles without a date form a NULL group; there is no year
                # to report them under, so skip it (it previously produced
                # the invalid SQL `... = None`).
                continue
            cursor.execute(articles_of_year_sql, (year,))
            # join() avoids re-copying the growing text for every article.
            year_text = ''.join(
                ' ' + remove_stop_word(article['article_text'])
                for article in cursor.fetchall()
            )
            articles_by_year.append({'year': year, 'articles': tb(year_text)})
    return articles_by_year
#         articles_stopwords_removed = []
#         for article in articles:
#             articles_stopwords_removed.append(
#                 tb(remove_stop_word(article['article_text']))
#             )
#         return articles_stopwords_removed





def tf(word, blob):
    """Return the term frequency of ``word`` in ``blob``.

    This is the number of occurrences of ``word`` in ``blob.words`` divided
    by the total number of words. A document with no words yields 0.0
    instead of raising ZeroDivisionError.
    """
    words = blob.words
    total = len(words)
    if total == 0:
        return 0.0
    return words.count(word) / total


def n_containing(word, bloblist):
    """Return how many documents in ``bloblist`` contain ``word``."""
    return sum(1 for blob in bloblist if word in blob.words)


def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` across ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of documents
    and ``n`` the number containing ``word``. Because of the ``+ 1``, a word
    present in every document gets a negative value. An empty ``bloblist``
    raises ValueError (logarithm of zero).
    """
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))


def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``."""
    return tf(word, blob) * idf(word, bloblist)


# Build one document per year and print the ten highest TF-IDF words of each.
text_4_years = get_text_data_by_years()
bloblist = [entry['articles'] for entry in text_4_years]
for year_entry in text_4_years:
    print("Топ слов в документе {}".format(year_entry['year']))
    document = year_entry['articles']
    # dict.fromkeys() drops repeated words while keeping first-occurrence
    # order, so every distinct word is scored once and ties sort exactly as
    # they did when each repetition was rescored.
    scores = {
        word: tfidf(word, document, bloblist)
        for word in dict.fromkeys(document.words)
    }
    sorted_words = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    for word, score in sorted_words[:10]:
        print("\tСлово: {}, TF-IDF: {}".format(word, round(score, 5)))


Топ слов в документе 2007
	Слово: фотки, TF-IDF: 0.05395
	Слово: аудиозаписи, TF-IDF: 0.03465
	Слово: бкз, TF-IDF: 0.03465
	Слово: нормальном, TF-IDF: 0.03465
	Слово: моему, TF-IDF: 0.02698
	Слово: состоянии, TF-IDF: 0.02425
	Слово: неодекватном, TF-IDF: 0.02273
	Слово: пиплы, TF-IDF: 0.02273
	Слово: хиде, TF-IDF: 0.02273
	Слово: мышакин, TF-IDF: 0.02273
Топ слов в документе 2008
	Слово: сдало, TF-IDF: 0.02258
	Слово: владом, TF-IDF: 0.02258
	Слово: июл, TF-IDF: 0.02258
	Слово: предложение, TF-IDF: 0.01566
	Слово: эмммм, TF-IDF: 0.01129
	Слово: vkontakt, TF-IDF: 0.01129
	Слово: картинг, TF-IDF: 0.01129
	Слово: привееет, TF-IDF: 0.01129
	Слово: правдо, TF-IDF: 0.01129
	Слово: гланцевые, TF-IDF: 0.01129
Топ слов в документе 2009
	Слово: ★, TF-IDF: 0.05011
	Слово: ”, TF-IDF: 0.02172
	Слово: ▀▄▀▄, TF-IDF: 0.01879
	Слово: ˜˜, TF-IDF: 0.01879
	Слово: °•°, TF-IDF: 0.01879
	Слово: комиксы, TF-IDF: 0.0179
	Слово: ▪, TF-IDF: 0.0179
	Слово: комикс, TF-IDF: 0.00836
	Слово: забили, TF-IDF: 0.00716
