In [18]:
# This Source Code Form is subject to the terms of the MPL
# License. If a copy of the same was not distributed with this
# file, You can obtain one at
# https://github.com/AkhilHector/pubundsci/blob/master/LICENSE.

import re
import sys
import nltk
import string
import numpy as np
import pandas as pd
from math import sqrt, log
from itertools import groupby
from nltk.collocations import *
from collections import Counter
from nltk.corpus import stopwords
from collections import defaultdict
from itertools import chain, product
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize as tokenize
from nltk.stem.porter import PorterStemmer as stemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

"""
m1 - The number of all word forms a text consists
m2 - The sum of the products of each observed frequency to the power of two
     and the number of word types observed with that frequency
"""

def compute_average_word_length(sentence):
    return np.mean([len(words) for words in sentence.split()])

def compute_average_sentence_length(sentence):
    sentence = re.split("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", sentence)
    return np.mean([len(words) for words in sentence])

def freq_of_words_great_sent_len(sentence):
    result = []
    avg_word_len = compute_average_word_length(sentence)
    # sentence = re.split("(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", sentence)
    sentence = Counter(sentence.split())
    for key, value in sentence.items():
        if len(key) > avg_word_len:
            result.append(value)
#             print (key, value)
    return sum(result)

def tokenize(sentence):
    return re.split(r"[^0-9A-Za-z\-'_]+", sentence)

def compute_yules_k_for_text(sentence):
    tokens = tokenize(sentence)
    counter = Counter(token.upper() for token in tokens)

    #compute number of word forms in a given sentence/text
    m1 = sum(counter.values())
    m2 = sum([frequency ** 2 for frequency in counter.values()])

    #compute yules k measure and return the value
    yules_k = 10000/((m1 * m1) / (m2 - m1))
    return yules_k


def words_in_sentence(sentence):
    w = [words.strip("0123456789!:,.?()[]{}") for words in sentence.split()]
    return filter(lambda x: len(x) > 0, w)

def compute_yules_i_for_text(sentence):
    dictionary = {}
    stemmer = PorterStemmer()

    for word in words_in_sentence(sentence):
        word = stemmer.stem(word).lower()
        try:
            dictionary[word] += 1
        except:
            dictionary[word] = 1

    m1 = float(len(dictionary))
    m2 = sum([len(list(grouped_values)) * (frequency ** 2) for frequency, grouped_values in groupby(sorted(dictionary.values()))])

    # compute yules i and return the value
    try:
        yules_i = (m1 * m1) / (m2 - m1)
        return yules_i
    except ZeroDivisionError:
        return 0

def compute_collocation_score(sentence_one, sentence_two, option):
    if option == "bi":
        tokens_for_one = nltk.wordpunct_tokenize(sentence_one)
        tokens_for_two = nltk.wordpunct_tokenize(sentence_two)
        finder_one = BigramCollocationFinder.from_words(tokens_for_one)
        finder_two = BigramCollocationFinder.from_words(tokens_for_two)
        result_one = finder_one.score_ngrams(nltk.collocations.BigramAssocMeasures().raw_freq)
        result_one = [(tuple(map(str.lower, values)), scores) for values, scores in result_one]
        result_two = finder_two.score_ngrams(nltk.collocations.BigramAssocMeasures().raw_freq)
        result_two = [(tuple(map(str.lower, values)), scores) for values, scores in result_two]
        matches = [keys for keys in set(result_one).intersection(set(result_two))]
        return len(matches)
    elif option == "tri":
        tokens_for_one = nltk.wordpunct_tokenize(sentence_one)
        tokens_for_two = nltk.wordpunct_tokenize(sentence_two)
        finder_one = TrigramCollocationFinder.from_words(tokens_for_one)
        finder_two = TrigramCollocationFinder.from_words(tokens_for_two)
        result_one = finder_one.score_ngrams(nltk.collocations.TrigramAssocMeasures().raw_freq)
        result_one = [(tuple(map(str.lower, values)), scores) for values, scores in result_one]
        result_two = finder_two.score_ngrams(nltk.collocations.TrigramAssocMeasures().raw_freq)
        result_two = [(tuple(map(str.lower, values)), scores) for values, scores in result_two]
        matches = [keys for keys in set(result_one).intersection(set(result_two))]
        return len(matches)
    else:
        return 0

def cosine_sim(sentence_a, sentence_b):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(tuple([' '.join(remove_stopwords(sentence_a))]) + tuple([' '.join(remove_stopwords(sentence_b))]))
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)[0][1]

def remove_stopwords(text):
    stemmer =  PorterStemmer()
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in [stemmer.stem(words) for words in tokenize(text)] if w.lower() not in stopwords]
    return content

def text_arrangement(text):
    result = []
    stemmer =  PorterStemmer()
    count_vectorizer = CountVectorizer(stop_words="english")
    
    for words in text.split():
        result.append(stemmer.stem(words))
    count_vectorizer.fit(result)
    return count_vectorizer.transform(result)
    
a = "my name is akhil"
b = "my name is akhil pandey"
print(remove_stopwords(a))
print(cosine_sim(a, b))

['name', 'akhil']
0.709297266606


# Read the dataframe using pandas

In [35]:
raw_data = pd.read_csv("DumpBlogs.csv")
raw_data = raw_data.sample(frac=1).reset_index(drop=True)
raw_data = raw_data.dropna(axis=0, how='any')
raw_data = raw_data.sample(frac=1).reset_index(drop=True)

# Cosine sim score on Abs and blog

In [53]:
raw_data = raw_data.assign(similarity_score = [cosine_sim(raw_data["abstract"][each], raw_data["blog_post"][each]) for each in range(0, len(raw_data["altmetric_id"]))])
# raw_data

# Yules I Measure on Abstract

In [67]:
raw_data = raw_data.assign(yules_i_for_abs = [compute_yules_i_for_text(raw_data["abstract"][each]) for each in range(0, len(raw_data["altmetric_id"]))])
# raw_data

# Yules I Measure on Blog

In [70]:
raw_data = raw_data.assign(yules_i_for_blg = [compute_yules_i_for_text(raw_data["blog_post"][each]) for each in range(0, len(raw_data["altmetric_id"]))])
# raw_data

# Average word length on Abstract

In [73]:
raw_data = raw_data.assign(avg_word_len_abs = [compute_average_word_length(raw_data["abstract"][each]) for each in range(0, len(raw_data["altmetric_id"]))])
# raw_data

# Average sentence length on Abstract

In [77]:
raw_data = raw_data.assign(avg_sen_len_abs = [compute_average_sentence_length(raw_data["abstract"][each]) for each in range(0, len(raw_data["altmetric_id"]))])
# raw_data

# Frequency of words greater than avg word length in Abstract

In [79]:
raw_data = raw_data.assign(freq_of_words_great_sent_len_abs = [freq_of_words_great_sent_len(raw_data["abstract"][each]) for each in range(0, len(raw_data["altmetric_id"]))])
# raw_data

# Average word length on Blog

In [81]:
raw_data = raw_data.assign(avg_word_len_blg = [compute_average_word_length(raw_data["blog_post"][each]) for each in range(0, len(raw_data["altmetric_id"]))])
# raw_data

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


# Average sentence length on Blog

In [83]:
raw_data = raw_data.assign(avg_sen_len_blg = [compute_average_sentence_length(raw_data["blog_post"][each]) for each in range(0, len(raw_data["altmetric_id"]))])
# raw_data

# Frequency of words greater than avg word length in Blog

In [85]:
raw_data = raw_data.assign(freq_of_words_great_sent_len_blg = [freq_of_words_great_sent_len(raw_data["blog_post"][each]) for each in range(0, len(raw_data["altmetric_id"]))])
# raw_data

# Bigrams between abstract and blog

In [110]:
raw_data = raw_data.assign(bigrams_abs_blg = [compute_collocation_score(" ".join(remove_stopwords(raw_data["abstract"][each])), " ".join(remove_stopwords(raw_data["blog_post"][each])), "bi") for each in range(0, len(raw_data["altmetric_id"]))])
# raw_data.bigrams_abs_blg

# Trigrams between abstract and blog

In [111]:
raw_data = raw_data.assign(trigrams_abs_blg = [compute_collocation_score(" ".join(remove_stopwords(raw_data["abstract"][each])), " ".join(remove_stopwords(raw_data["blog_post"][each])), "tri") for each in range(0, len(raw_data["altmetric_id"]))])
raw_data.bigrams_abs_blg.unique()

array([ 0,  1, 20,  5, 26,  4,  2, 10, 23,  3, 18, 17, 24, 30, 19, 22, 11,
        7,  8,  6, 15, 25, 16, 13, 27, 12, 21, 29, 14,  9], dtype=int64)

# Write the Dataset to a CSV file

In [112]:
raw_data.to_csv("blogs_score_beta.csv", sep = ',', encoding="utf-8")