# Install necessary dependencies

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Get Text text






We use the description of a very popular role-playing game (RPG) Skyrim from
Bethesda Softworks for summarization. 

In [15]:
text = """
Diligent students can be profoundly demotivated by group projects if they feel that their own success is dependent on team members who don’t do their share. One way to counteract the motivational hazards of group projects is to assess individual students’ learning and performance in addition to the group’s output. This strategy gives diligent students a greater sense of fairness and control and discourages free ridership. 

Individual learning and performance can be assessed in any number of ways. Some instructors add an individual component to group projects (e.g., a short essay, journal entries); some combine a group project with an individual test or quiz. Both group and individual performance are then reflected in the total project grade (e.g., some faculty members make the group grade worth 50% and the individual grade worth 50%; others split it 80%/20%. There’s no perfect breakdown, but the grading scheme should (a) reflect your goals for student learning and (b) seek to motivate the kind of work you want to see.
"""

In [16]:
import re

text = re.sub(r'\n|\r', ' ', text)
text = re.sub(r' +', ' ', text)
text = text.strip()

# Summarization with Gensim

Let’s look at an implementation of text summarization by leveraging Gensim’s
summarization module. It is pretty straightforward.

In [17]:
from gensim.summarization import summarize

print(summarize(text, ratio=0.2, split=False))



One way to counteract the motivational hazards of group projects is to assess individual students’ learning and performance in addition to the group’s output.


In [5]:
print(summarize(text, word_count=75, split=False))

The game's main story revolves around the player character's quest to defeat Alduin the World-Eater, a dragon who is prophesied to destroy the world.
Over the course of the game, the player completes quests and develops the character by improving skills.
The player may freely roam over the land of Skyrim which is an open world environment consisting of wilderness expanses, dungeons, cities, towns, fortresses, and villages.


In [18]:
sentences = nltk.sent_tokenize(text)
len(sentences)

7

This summarization implementation from Gensim is based on a variation of
a popular algorithm called TextRank. 

# Basic Text pre-processing

In [19]:
import numpy as np

stop_words = nltk.corpus.stopwords.words('english')

def normalize_text(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize text
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of text
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create text from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_text)

norm_sentences = normalize_corpus(sentences)
norm_sentences[:3]

array(['diligent students profoundly demotivated group projects feel success dependent team members dont share',
       'one way counteract motivational hazards group projects assess individual students learning performance addition groups output',
       'strategy gives diligent students greater sense fairness control discourages free ridership'],
      dtype='<U136')

lets appplyng for vectorization on given text

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
dt_matrix = tv.fit_transform(norm_sentences)
dt_matrix = dt_matrix.toarray()

vocab = tv.get_feature_names()
td_matrix = dt_matrix.T
print(td_matrix.shape)
pd.DataFrame(np.round(td_matrix, 2), index=vocab).head(10)

(71, 7)




Unnamed: 0,0,1,2,3,4,5,6
add,0.0,0.0,0.0,0.0,0.26,0.0,0.0
addition,0.0,0.29,0.0,0.0,0.0,0.0,0.0
assess,0.0,0.29,0.0,0.0,0.0,0.0,0.0
assessed,0.0,0.0,0.0,0.48,0.0,0.0,0.0
breakdown,0.0,0.0,0.0,0.0,0.0,0.0,0.26
combine,0.0,0.0,0.0,0.0,0.26,0.0,0.0
component,0.0,0.0,0.0,0.0,0.26,0.0,0.0
control,0.0,0.0,0.31,0.0,0.0,0.0,0.0
counteract,0.0,0.29,0.0,0.0,0.0,0.0,0.0
demotivated,0.3,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
from scipy.sparse.linalg import svds
    
def low_rank_svd(matrix, singular_count=2):
    u, s, vt = svds(matrix, k=singular_count)
    return u, s, vt

In [22]:
num_sentences = 8
num_topics = 3

u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)  
print(u.shape, s.shape, vt.shape)
term_topic_mat, singular_values, topic_text_mat = u, s, vt

(71, 3) (3,) (3, 7)


In [23]:
# remove singular values below threshold                                         
sv_threshold = 0.5
min_sigma_value = max(singular_values) * sv_threshold
singular_values[singular_values < min_sigma_value] = 0

In [24]:
salience_scores = np.sqrt(np.dot(np.square(singular_values), 
                                 np.square(topic_text_mat)))
salience_scores

array([0.72750314, 0.65507968, 0.7881476 , 0.70645741, 0.7088736 ,
       0.70132378, 0.76470579])

In [25]:
top_sentence_indices = (-salience_scores).argsort()[:num_sentences]
top_sentence_indices.sort()

In [26]:
print('\n'.join(np.array(sentences)[top_sentence_indices]))

Diligent students can be profoundly demotivated by group projects if they feel that their own success is dependent on team members who don’t do their share.
One way to counteract the motivational hazards of group projects is to assess individual students’ learning and performance in addition to the group’s output.
This strategy gives diligent students a greater sense of fairness and control and discourages free ridership.
Individual learning and performance can be assessed in any number of ways.
Some instructors add an individual component to group projects (e.g., a short essay, journal entries); some combine a group project with an individual test or quiz.
Both group and individual performance are then reflected in the total project grade (e.g., some faculty members make the group grade worth 50% and the individual grade worth 50%; others split it 80%/20%.
There’s no perfect breakdown, but the grading scheme should (a) reflect your goals for student learning and (b) seek to motivate t