# Text Summarization API
## Call the method "text_summarization(title, content)"

In [None]:
import re

class SummaryTool(object):
    """Naive extractive summarizer.

    Every sentence of a text is scored by how many words it shares with
    all the other sentences; the summary is the title followed by the
    best-scoring sentence of each paragraph.
    """

    # Naive method for splitting a text into sentences
    def split_content_to_sentences(self, content):
        """Return the sentences of *content*.

        Line breaks are treated as sentence ends, then the text is cut
        on the literal separator ". ".
        """
        content = content.replace("\n", ". ")
        return content.split(". ")

    # Naive method for splitting a text into paragraphs
    def split_content_to_paragraphs(self, content):
        """Return the paragraphs of *content* (separated by a blank line)."""
        return content.split("\n\n")

    # Calculate the intersection between 2 sentences
    def sentences_intersection(self, sent1, sent2):
        """Return the word overlap of two sentences.

        The number of distinct shared words is divided by the average
        number of distinct words of the two sentences, so the result
        lies between 0 and 1. Returns 0 when neither sentence has words.
        """
        # split() without an argument drops the empty tokens that
        # split(" ") produced for empty sentences and repeated spaces;
        # such an empty token would otherwise count as a shared "word".
        s1 = set(sent1.split())
        s2 = set(sent2.split())

        # No words at all: nothing to compare (also avoids dividing by 0)
        if not s1 and not s2:
            return 0

        # We normalize the result by the average number of words
        return len(s1 & s2) / ((len(s1) + len(s2)) / 2)

    # Format a sentence - remove all non-alphabetic chars from the sentence
    # We'll use the formatted sentence as a key in our sentences dictionary
    def format_sentence(self, sentence):
        """Return *sentence* with every run of non-word characters removed."""
        return re.sub(r'\W+', '', sentence)

    # Convert the content into a dictionary <K, V>
    # K = The formatted sentence
    # V = The rank of the sentence
    def get_sentence_ranks(self, content):
        """Map each formatted sentence of *content* to its rank.

        The rank of a sentence is the sum of its intersections with every
        other sentence. Sentences that format to the same key share one
        entry; the last occurrence wins.
        """
        # Split the content into sentences
        sentences = self.split_content_to_sentences(content)
        n = len(sentences)

        # The intersection is symmetric, so each pair is computed once.
        # The diagonal stays 0, which keeps a sentence from scoring itself.
        values = [[0] * n for _ in range(n)]
        for i in range(n):
            for j in range(i + 1, n):
                overlap = self.sentences_intersection(sentences[i], sentences[j])
                values[i][j] = overlap
                values[j][i] = overlap

        # Build the sentences dictionary
        sentences_dic = {}
        for i, sentence in enumerate(sentences):
            sentences_dic[self.format_sentence(sentence)] = sum(values[i])
        return sentences_dic

    # Return the best sentence in a paragraph
    def get_best_sentence(self, paragraph, sentences_dic):
        """Return the highest-ranked sentence of *paragraph*.

        Returns "" for paragraphs with fewer than two sentences or when
        no sentence has a rank above 0. On ties the first sentence wins.
        """
        # Split the paragraph into sentences
        sentences = self.split_content_to_sentences(paragraph)

        # Ignore short paragraphs
        if len(sentences) < 2:
            return ""

        # Get the best sentence according to the sentences dictionary
        best_sentence = ""
        max_value = 0
        for s in sentences:
            strip_s = self.format_sentence(s)
            if not strip_s:
                continue
            # A sentence missing from the dictionary ranks 0 instead of
            # raising KeyError.
            value = sentences_dic.get(strip_s, 0)
            if value > max_value:
                max_value = value
                best_sentence = s

        return best_sentence

    # Build the summary
    def get_summary(self, title, content, sentences_dic):
        """Return the title, a blank line, then one sentence per paragraph.

        Paragraphs for which get_best_sentence returns nothing are skipped.
        """
        # Split the content into paragraphs
        paragraphs = self.split_content_to_paragraphs(content)

        # Add the title
        summary = [title.strip(), ""]

        # Add the best sentence from each paragraph
        for p in paragraphs:
            sentence = self.get_best_sentence(p, sentences_dic).strip()
            if sentence:
                summary.append(sentence)

        return "\n".join(summary)


def text_summarization(title, content):
    """Return a summary of *content* headed by *title*.

    Ranks the sentences of *content* with a SummaryTool and lets the same
    tool assemble the summary from those ranks.
    """
    tool = SummaryTool()
    ranks = tool.get_sentence_ranks(content)
    return tool.get_summary(title, content, ranks)
    
    # Print the ratio between the summary length and the original length
    #print ("")
    #print ("Original Length %s" % (len(title) + len(content)))
    #print ("Summary Length %s" % len(summary))
    #print ("Summary Ratio: %s" % (100 - (100 * (len(summary) / (len(title) + len(content))))))

    
# Demo run of text_summarization on two sample movie reviews.
title = 'doc1'
# NOTE: despite its name, `summary` holds the INPUT text (it is passed as
# `content` below); the generated summary ends up in `result`.
# The two reviews are separated by a single newline, while
# split_content_to_paragraphs splits on a blank line, so the whole text is
# handled as one paragraph.
summary = """Take a low budget; inexperienced actors doubling as production staff?? as well as limited facilities??and you can't expect much more than "Time Chasers" gives you; but you can absolutely expect a lot less. This film represents a bunch of good natured friends and neighbors coming together to collaborate on an interesting project. If your cousin had been one of those involved; you would probably think to yourself; "ok; this movie is terrible... but a really good effort." For all the poorly delivered dialog and ham-fisted editing; "Time Chasers" has great scope and ambition... and one can imagine it was necessary to shoot every scene in only one or two takes. So; I'm suggesting people cut "Time Chasers" some slack before they cut in the jugular. That said; I'm not sure I can ever forgive the pseudo-old lady from the grocery store for the worst delivery every wrenched from the jaws of a problematic script.
Having seen this movie more often than all others; and one of those you will never forget the theater in which you saw it the first time; it undoubtedly is Doris Day's best. I am one of many who also feel she deserved an academy award for her portrayal of Ruth Etting. With her tremendous voice; and the acting skills she demonstrated in this role; one can only imagine what she would have done had she been one of the MGM stars; when their musicals were known as the best. I understand she came within a hair of being cast in South Pacific; but as has been rumored; it was either Marty Melcher's insistence on a much higher salary; or a clash between Doris or Marty with the director. From reading her biography; it is very apparent that she wasn't one of your typical Hollywood stars. With these typical stars and directors; as well as most everyone else associated with the AMPAS being those who decide who be will be nominated; as well as ultimately winning; Ms. Day's choice of living a less typical Hollywood life style may have had some bearing on her not being nominated for Love Me or Leave Me. I definitely believe this is the reason she has never been given a Lifetime Achievement Award from the Academy of Motion Picture Arts and Sciences. Let us hope this award will be made before this gracious and lovely lady is unable to make an appearance there.
"""

# Summarize the sample text and show the result.
result = text_summarization(title, summary)
print(result)

# Training the Sentiment Analysis Model (using Random Forest Algorithm)
## Saved File name: "MT_Trained_Sentiment_Model_Imdb.sav"

In [None]:
%%time
import numpy as np
import pandas as pd
import pickle
import re

# inputs
# Training CSV; the code below reads its 'review' and 'label' columns.
_train_data_csv = 'train_imdb.csv'
# File the trained model is pickled to at the end of this cell.
filename = 'MT_Trained_Sentiment_Model_Imdb.sav'

# Load the whole training set into a DataFrame.
train_data = pd.read_csv(_train_data_csv)

def drop_features(features, data):
    """Remove the columns named in *features* from *data*, in place."""
    data.drop(columns=features, inplace=True)
    
def process_data(i_data):
    """Normalize one review text.

    The text is lower-cased, @mentions and every character other than
    ASCII letters, digits, space and tab are replaced by a space, and the
    remaining words are re-joined with single spaces.
    """
    lowered = i_data.lower()
    cleaned = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ", lowered)
    words = cleaned.split()
    return " ".join(words)

# Normalize every review, then drop the raw text so that only the cleaned
# column is carried forward.
train_data['processed_review'] = train_data['review'].apply(process_data)

drop_features(['review'], train_data)

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    train_data["processed_review"], train_data["label"],
    test_size=0.2, random_state=42)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

count_vect = CountVectorizer(stop_words='english')
transformer = TfidfTransformer(norm='l2', sublinear_tf=True)

# Vocabulary and IDF weights are learned from the training split only.
x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

# Hold-out features, used by the (currently disabled) evaluation below.
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200)
model.fit(x_train_tfidf, y_train)

#predictions = model.predict(x_test_tfidf)

#from sklearn.metrics import confusion_matrix,f1_score
#confusion_matrix(y_test,predictions)

#train_counts = count_vect.fit_transform(train_data['processed_review'])
#train_tfidf = transformer.fit_transform(train_counts)

#final_model = model.fit(train_tfidf,train_data['label'])

# NOTE(review): only the classifier is saved; the fitted vectorizer and
# transformer are not, so code that loads this model has to refit them on
# the same training split.
# The context manager closes the file deterministically, so the pickle is
# fully flushed to disk before the cell finishes.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Testing the Sentiment Analysis

In [None]:
%%time
import pickle
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def drop_features(features, data):
    """Drop the *features* columns from the DataFrame *data*, in place."""
    data.drop(columns=features, inplace=True)
    
def process_data(i_data):
    """Return a cleaned, lower-case, single-spaced version of *i_data*.

    @mentions and any character that is not an ASCII letter, digit, space
    or tab are turned into spaces before the whitespace is collapsed.
    """
    text = i_data.lower()
    text = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ", text)
    return " ".join(text.split())

from textblob import TextBlob
def model_predict(test_data):
    """Label every entry of test_data['review'] by its TextBlob polarity.

    Returns a list of strings: "0" where the polarity is below zero,
    "1" otherwise.
    """
    return [
        "0" if TextBlob(review).sentiment.polarity < 0 else "1"
        for review in test_data['review']
    ]

def calculate_sentiment(test_data):
    """Predict a sentiment label for every review in *test_data*.

    Only the classifier is read from 'MT_Trained_Sentiment_Model_Imdb.sav',
    so the count vectorizer and TF-IDF transformer are refitted here on
    'train_imdb.csv' with the split parameters used in the training cell.

    Args:
        test_data: DataFrame with 'id' and 'review' columns. It is modified
            in place: 'processed_review' and 'label' columns are added.

    Returns:
        DataFrame with the columns 'id', 'review' (the normalized text) and
        'label'. The same frame is written to 'output.csv'.
    """
    _train_data_csv = 'train_imdb.csv'
    train_data = pd.read_csv(_train_data_csv)
    train_data['processed_review'] = train_data['review'].apply(process_data)

    drop_features(['review'], train_data)

    # Only the training part of the split is needed to rebuild the feature
    # space; the hold-out part is not used for prediction.
    x_train = train_test_split(
        train_data["processed_review"], train_data["label"],
        test_size=0.2, random_state=42)[0]
    count_vect = CountVectorizer(stop_words='english')
    transformer = TfidfTransformer(norm='l2', sublinear_tf=True)
    transformer.fit(count_vect.fit_transform(x_train))

    # NOTE: unpickling runs code stored in the file; load trusted files only.
    # The context manager closes the handle that a bare open() would leak.
    with open('MT_Trained_Sentiment_Model_Imdb.sav', 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    #test_data = pd.read_csv('Testing_data_MT.csv')
    test_data['processed_review'] = test_data['review'].apply(process_data)
    test_counts = count_vect.transform(test_data['processed_review'])
    test_tfidf = transformer.transform(test_counts)

    predictions = loaded_model.predict(test_tfidf)
    #predictions = model_predict(test_data)
    test_data['label'] = predictions
    final_result = pd.DataFrame({'id': test_data['id'],
                                 'review': test_data['processed_review'],
                                 'label': predictions})
    final_result.to_csv('output.csv', index=False)
    return final_result
#final_result

# Score the reviews of the test CSV; calculate_sentiment also writes the
# result to 'output.csv'.
test_data = pd.read_csv('test_test_imdb_utf8.csv')
result2 = calculate_sentiment(test_data)
#print(result2)

# My REST Service API modules 

In [None]:
import pickle
import pandas as pd
import re
import json
from pandas import DataFrame

from flask import Flask
from flask import request
from flask import jsonify
from flask_cors import CORS

# Flask application object; the @app.route decorators below register on it.
app = Flask(__name__)
# Apply flask_cors to the app with its default options.
CORS(app)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def drop_features(features, data):
    """Delete the listed *features* columns from *data* without copying it."""
    data.drop(columns=features, inplace=True)
    
def process_data(i_data):
    """Clean one review: lower-case it, blank out @mentions and characters
    other than ASCII letters, digits, space and tab, and collapse whitespace.
    """
    blanked = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ", i_data.lower())
    pieces = blanked.split()
    return " ".join(pieces)

from textblob import TextBlob
def model_predict(test_data):
    """Return one polarity label per entry of test_data['review'].

    A review whose TextBlob polarity is negative gets "0"; every other
    review gets "1".
    """
    labels = []
    for review in test_data['review']:
        polarity = TextBlob(review).sentiment.polarity
        if polarity < 0:
            labels.append("0")
        else:
            labels.append("1")
    return labels

#@app.route('/sentiment/<string:test_csv>')
@app.route('/sentiment', methods=['POST'])
def calculate_sentiment():
    """POST /sentiment: add a polarity label to every posted answer.

    The request body is JSON that pandas can turn into a DataFrame with
    'answerid' and 'atxt' columns. The response carries the same records
    plus 'processed_review' (the normalized text) and 'polarity' ("0" or
    "1", from model_predict).
    """
    #test_data = pd.read_csv(test_csv)
    json_data = request.get_json()

    test_data = DataFrame(json_data)
    test_data = test_data.rename(columns={'answerid': 'id', 'atxt': 'review'})

    # Still part of the response payload, so it is kept even though the
    # polarity itself is computed from the raw review text.
    test_data['processed_review'] = test_data['review'].apply(process_data)

    # NOTE(review): this route used to also read train_imdb.csv, refit the
    # vectorizer and load the pickled model on every request, but that
    # prediction was overwritten by the TextBlob labels on the next line
    # and never reached the response. The dead work was removed; the
    # response is unchanged.
    test_data['polarity'] = model_predict(test_data)

    final_result = test_data.rename(columns={'id': 'answerid', 'review': 'atxt'})
    #final_result = pd.DataFrame({'answerid':test_data['id'],'polarity':predictions})
    result_json = final_result.to_json(orient='records')

    # NOTE(review): result_json is already a JSON string, so jsonify()
    # encodes it a second time and clients receive a JSON string that
    # contains JSON. Left as is because existing clients may depend on it.
    return jsonify(result_json)


@app.route('/')
def testing():
    """GET /: return a fixed status line showing that the service is up."""
    status_message = 'My M.Tech REST API Service is active...'
    return status_message
    
    
class SummaryTool(object):
    """Naive extractive summarizer.

    Every sentence of a text is scored by how many words it shares with
    all the other sentences; the summary is the title followed by the
    best-scoring sentence of each paragraph.
    """

    # Naive method for splitting a text into sentences
    def split_content_to_sentences(self, content):
        """Return the sentences of *content*.

        Line breaks are treated as sentence ends, then the text is cut
        on the literal separator ". ".
        """
        content = content.replace("\n", ". ")
        return content.split(". ")

    # Naive method for splitting a text into paragraphs
    def split_content_to_paragraphs(self, content):
        """Return the paragraphs of *content* (separated by a blank line)."""
        return content.split("\n\n")

    # Calculate the intersection between 2 sentences
    def sentences_intersection(self, sent1, sent2):
        """Return the word overlap of two sentences.

        The number of distinct shared words is divided by the average
        number of distinct words of the two sentences, so the result
        lies between 0 and 1. Returns 0 when neither sentence has words.
        """
        # split() without an argument drops the empty tokens that
        # split(" ") produced for empty sentences and repeated spaces;
        # such an empty token would otherwise count as a shared "word".
        s1 = set(sent1.split())
        s2 = set(sent2.split())

        # No words at all: nothing to compare (also avoids dividing by 0)
        if not s1 and not s2:
            return 0

        # We normalize the result by the average number of words
        return len(s1 & s2) / ((len(s1) + len(s2)) / 2)

    # Format a sentence - remove all non-alphabetic chars from the sentence
    # We'll use the formatted sentence as a key in our sentences dictionary
    def format_sentence(self, sentence):
        """Return *sentence* with every run of non-word characters removed."""
        return re.sub(r'\W+', '', sentence)

    # Convert the content into a dictionary <K, V>
    # K = The formatted sentence
    # V = The rank of the sentence
    def get_sentence_ranks(self, content):
        """Map each formatted sentence of *content* to its rank.

        The rank of a sentence is the sum of its intersections with every
        other sentence. Sentences that format to the same key share one
        entry; the last occurrence wins.
        """
        # Split the content into sentences
        sentences = self.split_content_to_sentences(content)
        n = len(sentences)

        # The intersection is symmetric, so each pair is computed once.
        # The diagonal stays 0, which keeps a sentence from scoring itself.
        values = [[0] * n for _ in range(n)]
        for i in range(n):
            for j in range(i + 1, n):
                overlap = self.sentences_intersection(sentences[i], sentences[j])
                values[i][j] = overlap
                values[j][i] = overlap

        # Build the sentences dictionary
        sentences_dic = {}
        for i, sentence in enumerate(sentences):
            sentences_dic[self.format_sentence(sentence)] = sum(values[i])
        return sentences_dic

    # Return the best sentence in a paragraph
    def get_best_sentence(self, paragraph, sentences_dic):
        """Return the highest-ranked sentence of *paragraph*.

        Returns "" for paragraphs with fewer than two sentences or when
        no sentence has a rank above 0. On ties the first sentence wins.
        """
        # Split the paragraph into sentences
        sentences = self.split_content_to_sentences(paragraph)

        # Ignore short paragraphs
        if len(sentences) < 2:
            return ""

        # Get the best sentence according to the sentences dictionary
        best_sentence = ""
        max_value = 0
        for s in sentences:
            strip_s = self.format_sentence(s)
            if not strip_s:
                continue
            # A sentence missing from the dictionary ranks 0 instead of
            # raising KeyError.
            value = sentences_dic.get(strip_s, 0)
            if value > max_value:
                max_value = value
                best_sentence = s

        return best_sentence

    # Build the summary
    def get_summary(self, title, content, sentences_dic):
        """Return the title, a blank line, then one sentence per paragraph.

        Paragraphs for which get_best_sentence returns nothing are skipped.
        """
        # Split the content into paragraphs
        paragraphs = self.split_content_to_paragraphs(content)

        # Add the title
        summary = [title.strip(), ""]

        # Add the best sentence from each paragraph
        for p in paragraphs:
            sentence = self.get_best_sentence(p, sentences_dic).strip()
            if sentence:
                summary.append(sentence)

        return "\n".join(summary)

#@app.route('/summary/<string:content>')
@app.route('/summary', methods=['POST'])
def text_summarization():
    """POST /summary: summarize the sentences stored for one topic/cluster.

    The request body is JSON that pandas can turn into a DataFrame with a
    'Name' column; only its first entry is used. All rows of
    'clusters_output.csv' with that Name contribute their 'Sentences' text.
    The response is the summary built by SummaryTool with an empty title,
    passed through jsonify.
    """
    json_data = request.get_json()
    #print('JSON_DATA: ',json_data)
    input_data = DataFrame(json_data)

    clusters = pd.read_csv('clusters_output.csv')
    wanted_name = input_data['Name'].iloc[0]
    matching = clusters[clusters['Name'] == wanted_name]

    # Join the rows with the ". " separator the sentence splitter cuts on.
    # Plain concatenation fused the last sentence of one row with the first
    # sentence of the next. Rows without text are skipped.
    content = '. '.join(matching['Sentences'].dropna().astype(str))

    # Create a SummaryTool object
    st = SummaryTool()

    # Build a sentence dictionary
    sentences_dic = st.get_sentence_ranks(content)
    title = ""
    # Build the summary with the sentences dictionary
    summary = st.get_summary(title, content, sentences_dic)

    return jsonify(summary)
    
    #print(test_data)
    #result_json = test_data.to_json(orient='records')
    #return jsonify(result_json)




from textblob import TextBlob
from textblob import Word


@app.route('/detectTopic', methods=['POST'])
def detect_topic():
    """POST /detectTopic: group the sentences of the posted answers by noun.

    The request body is JSON that pandas can turn into a DataFrame with an
    'atxt' column. Every word tagged 'NN' by TextBlob is lemmatized and
    pluralized to form a topic name. The response holds one record per
    topic, in first-seen order:
    'Name', 'Sentences' (the mentioning sentences joined with '. '),
    'Summary' (always empty here) and 'Name_Count' (number of mentions).
    """
    json_data = request.get_json()
    input_data = DataFrame(json_data)
    input_data = input_data.rename(columns={'answerid': 'id', 'atxt': 'review'})

    # Topic name -> sentences that mention it. A dict replaces the
    # row-by-row DataFrame updates, which relied on chained assignment
    # (df.Name_Count[i] = ...) that pandas does not guarantee to write
    # through to the frame.
    topics = {}
    for index, row in input_data.iterrows():
        blob = TextBlob(row.review)
        for sentence in blob.sentences:
            s_sent = str(sentence)
            for word, tag in sentence.tags:
                if tag != 'NN':
                    continue
                name = str(Word(word.lemmatize()).pluralize())
                # A noun occurring twice in one sentence is counted twice
                # and its sentence is added twice.
                topics.setdefault(name, []).append(s_sent)

    rows = [[name, '. '.join(mentions), '', len(mentions)]
            for name, mentions in topics.items()]
    df = pd.DataFrame(rows, columns=['Name', 'Sentences', 'Summary', 'Name_Count'])

    result_json = df.to_json(orient='records')
    # NOTE(review): result_json is already a JSON string, so jsonify()
    # encodes it a second time. Left as is because clients may depend on it.
    return jsonify(result_json)


import re, math
from collections import Counter

# Matches one run of word characters; text_to_vector uses it to tokenize.
WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse vectors.

    Both arguments are mappings from key to numeric weight. The result is
    0.0 when either vector has zero length.
    """
    shared_keys = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)

    norm1 = math.sqrt(sum(vec1[key] ** 2 for key in vec1.keys()))
    norm2 = math.sqrt(sum(vec2[key] ** 2 for key in vec2.keys()))
    denominator = norm1 * norm2

    if denominator:
        return float(dot_product) / denominator
    return 0.0

def text_to_vector(text):
    """Return a Counter of the WORD matches found in *text*."""
    return Counter(WORD.findall(text))


@app.route('/generateCluster', methods=['POST'])
def generate_cluster():
    """POST /generateCluster: merge topics whose sentences are similar.

    The request body is JSON that pandas can turn into a DataFrame with
    'Name', 'Sentences' and 'Name_Count' columns. For every topic, all
    other topics whose sentence text has a cosine similarity of at least
    0.60 are merged into it: the names are joined (sorted, comma
    separated), the sentences are concatenated with '. ', and the count
    grows by one per merged topic. Clusters with the same name are reported
    once.

    Side effect: the posted topics followed by the clusters are written to
    'clusters_output.csv'.

    Returns the clusters as JSON records, passed through jsonify.
    """
    json_data = request.get_json()
    df = DataFrame(json_data)

    threshold_value = 0.60

    # Work on positions rather than index labels, and tokenize every
    # topic's text once instead of once per compared pair.
    topics = [row for _, row in df.iterrows()]
    vectors = [text_to_vector(topic.Sentences) for topic in topics]

    cluster_rows = []
    for i, row1 in enumerate(topics):
        name = row1.Name
        sentences = row1.Sentences
        name_count = row1.Name_Count
        for j, row2 in enumerate(topics):
            if i == j:
                continue
            if get_cosine(vectors[i], vectors[j]) >= threshold_value:
                name = name + ', ' + row2.Name
                sentences = sentences + '. ' + row2.Sentences
                name_count = name_count + 1

        if name != row1.Name:
            # Sort the member names so that the same cluster found from
            # different starting topics gets the same name.
            data = sorted(part.strip() for part in name.split(','))
            cluster_rows.append([', '.join(data), sentences, '', name_count])

    cluster_df = pd.DataFrame(
        cluster_rows, columns=['Name', 'Sentences', 'Summary', 'Name_Count'])

    # sorting by name; a stable sort makes "first duplicate" deterministic
    cluster_df.sort_values("Name", inplace=True, kind="mergesort")

    # Keep one row per cluster name. keep=False removed every copy, so a
    # cluster found from each of its members vanished from the result.
    cluster_df.drop_duplicates(subset="Name", keep="first", inplace=True)

    frames = [df, cluster_df]
    combined_result = pd.concat(frames)
    combined_result.to_csv('clusters_output.csv', encoding='utf-8')

    result_json = cluster_df.to_json(orient='records')
    # NOTE(review): result_json is already a JSON string, so jsonify()
    # encodes it a second time. Left as is because clients may depend on it.
    return jsonify(result_json)
    


    

# Start Flask's built-in server on port 8081; the call blocks while serving.
app.run(port=8081)

 * Running on http://127.0.0.1:8081/ (Press CTRL+C to quit)
127.0.0.1 - - [14/Nov/2018 19:20:01] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:21:13] "POST /sentiment HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:22:39] "OPTIONS /sentiment HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:22:57] "POST /sentiment HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:23:22] "OPTIONS /detectTopic HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:23:24] "POST /detectTopic HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:23:24] "OPTIONS /generateCluster HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:23:24] "POST /generateCluster HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:23:43] "OPTIONS /summary HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:23:43] "POST /summary HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:23:51] "OPTIONS /summary HTTP/1.1" 200 -
127.0.0.1 - - [14/Nov/2018 19:23:51] "[37mPOST /summary