<a href="https://colab.research.google.com/github/ajaythakur3369/Suvidha-Foundation-Internship/blob/main/Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Project Name - Text Summarization**
## **Developed By - Ajay Thakur (ajaythakur3369@gmail.com)**
## **Branch Name - Electronics and Communication Engineering**
## **Institute Name - Indian Institute of Information Technology Kota**
## **Submitted To - Suvidha Foundation (Suvidha Mahila Mandal)**
## **Project Link (GitHub) - [Click here](https://github.com/ajaythakur3369/Suvidha-Foundation-Internship)**

# **Let's Begin!**

### **Import Libraries**

In [None]:
# Imported all necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import os
import string
from pickle import dump, load

### **Loading the Data**

In [None]:
class LoadData:
    """Reads story files from a directory and separates article text from highlights.

    A story file holds the article body, optionally followed by summary
    sentences that are each introduced by an ``@highlight`` marker.
    """

    # Marker that introduces every highlight inside a story file
    HIGHLIGHT_MARKER = '@highlight'

    def __init__(self, directory):
        """
        Args:
            directory: Path of the folder that contains the story files.
        """
        self.directory = directory

    def load_story(self, filename):
        """Read one story file and split it into article text and highlights.

        Args:
            filename: Path of the story file to read.

        Returns:
            A ``(story, highlights)`` tuple. ``story`` is the raw text that
            precedes the first ``@highlight`` marker (the whole file when no
            marker is present). ``highlights`` is a list of stripped,
            non-empty lines found after the markers, in file order; the
            marker itself is never included.
        """
        # Explicit encoding so reading does not depend on the platform default
        with open(filename, 'r', encoding='utf-8') as file:
            text = file.read()

        # Splitting on EVERY marker (not just the first one) keeps the
        # '@highlight' lines themselves out of the highlight list.
        story, *highlight_chunks = text.split(self.HIGHLIGHT_MARKER)

        highlights = []
        for chunk in highlight_chunks:
            for line in chunk.split('\n'):
                line = line.strip()
                # Strip before testing so whitespace-only lines are dropped too
                if line:
                    highlights.append(line)
        return story, highlights

    def load_stories(self, limit=8):
        """Load up to ``limit`` stories from ``self.directory``.

        Args:
            limit: Maximum number of files to read. ``None`` reads every
                file. Defaults to 8, the value that used to be hard-coded.

        Returns:
            A list of ``(story, highlights)`` tuples, one per file read.
        """
        stories = []
        # os.listdir returns entries in arbitrary order; sorting makes the
        # selected subset (and therefore every later index) reproducible.
        for name in sorted(os.listdir(self.directory)):
            if limit is not None and len(stories) >= limit:
                break
            filename = os.path.join(self.directory, name)
            # Skip sub-directories and pickle files: this notebook saves its
            # cleaned-data pickle into the same folder, and reading that
            # binary file as text would fail or yield garbage.
            if not os.path.isfile(filename) or name.endswith('.pkl'):
                continue
            stories.append(self.load_story(filename))
        return stories

In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there can be read from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the CNN story files; handed to LoadData as its directory
DIR_PATH = "/content/drive/MyDrive/Colab_Notebook/Internship_Name/Suvidha_Foundation/Folder_Name/CNN_stories_dataset"

In [None]:
# Build the loader for DIR_PATH and read the stories as a list of (story, highlights) tuples
obj = LoadData(DIR_PATH)
stories = obj.load_stories()

In [None]:
# Unpack the 8th loaded story (index 7) into its article text and its highlight list
story_text, story_highlights = stories[7]

# Highlights first, followed by one blank separator line
print(story_highlights)
print()

# Then the article body
print(story_text)

['Pamela Anderson becomes joint owner of a racing team', '@highlight', 'Former Baywatch star is fronting the Race Alliance FIA GT Series team', '@highlight', 'Vitantonio Liuzzi and Mathias Lauda drive for the team', '@highlight', 'Nine-time rally world champion Sebastian Loeb runs a competing team']

-LRB- CNN -RRB- -- Pamela Anderson is set to bring a little Hollywood glamor to motorsport after becoming part owner of a sports car racing team .

The former Baywatch star has linked up with the Race Alliance team , which is competing in the inaugural FIA GT Series .

The Playboy pin-up ventured into motorsport in March 2012 when her and occasional racing driver Markus Fux fronted the Downforce1 European Le Mans teams .

Following the failure of that venture Fux and Anderson have teamed up with the Race Alliance team for the final three races of the GT Series ' 2013 season .

`` We want to make Race Alliance a recognizable name and have looked for the best drivers possible , '' a spokesma

In [None]:
# Peek at the first two (story, highlights) tuples returned by load_stories()
stories[:2]

[("-LRB- CNN -RRB- -- As investigators are determining whether a skull and bones discovered behind an abandoned Central Virginia home on Saturday are those of Hannah Graham , the lone suspect in her disappearance has been indicted in an assault from almost a decade ago -- a case that police say forensically links him to another female college student who vanished from the same area as Graham .\n\nJesse Matthew was indicted by a grand jury in Fairfax , Virginia on Monday for the 2005 sexual assault on charges that also included attempted murder and abduction .\n\nAccording to the indictment , Matthew , 32 , `` did feloniously , willfully , deliberately , intentionally and with premeditation attempt to kill -LRB- the victim -RRB- in the commission of or subsequent to an abduction with the intent to defile . ''\n\nThe then-26-year-old victim in that attack was able to provide enough of a detailed description to yield a police sketch , one that years later would link Matthew to the case of

### **Data Cleaning**

In [None]:
class Clean_data:
    """Normalises raw story / highlight lines into lower-case alphabetic text."""

    # Source tags that may precede the article text. The loaded stories carry
    # the tokenised form, in which the parentheses are written as -LRB- / -RRB-,
    # so looking only for the plain '(CNN)' form leaves 'lrb cnn rrb' in the output.
    SOURCE_TAGS = ('(CNN)', '-LRB- CNN -RRB-')

    # Marker lines that separate highlights in the raw files; they carry no content
    HIGHLIGHT_MARKER = '@highlight'

    def __init__(self):
        pass

    def clean_lines(self, lines):
        """Clean a sequence of text lines.

        For every line: drop everything up to and including the source tag
        (if one is present), lower-case the words, strip ASCII punctuation
        and keep only purely alphabetic tokens.

        Args:
            lines: Iterable of strings (story lines or highlight strings).

        Returns:
            A list of cleaned, space-joined lines. Lines that end up empty,
            and bare ``@highlight`` marker lines, are omitted.
        """
        cleaned = []
        table = str.maketrans('', '', string.punctuation)

        for line in lines:
            # A bare marker would otherwise survive as the word 'highlight'
            if line.strip() == self.HIGHLIGHT_MARKER:
                continue

            # Cut the dateline / source prefix, whichever tag form is used
            for tag in self.SOURCE_TAGS:
                index = line.find(tag)
                if index >= 0:
                    line = line[index + len(tag):]
                    break

            words = (word.lower().translate(table) for word in line.split())
            # isalpha() also rejects the empty strings left by pure-punctuation tokens
            kept = [word for word in words if word.isalpha()]
            if kept:
                cleaned.append(' '.join(kept))
        return cleaned

In [None]:
obj1 = Clean_data()

# One dict per story: the article is cleaned line by line, the highlights item by item
cleaned_stories = [
    {
        'story': obj1.clean_lines(story_text.split('\n')),
        'highlights': obj1.clean_lines(highlight_lines),
    }
    for story_text, highlight_lines in stories[:100]
]

In [None]:
# Inspect the cleaned 8th story (index 7): a dict with 'story' and 'highlights' lists
cleaned_stories[7]

{'story': ['lrb cnn rrb pamela anderson is set to bring a little hollywood glamor to motorsport after becoming part owner of a sports car racing team',
  'the former baywatch star has linked up with the race alliance team which is competing in the inaugural fia gt series',
  'the playboy pinup ventured into motorsport in march when her and occasional racing driver markus fux fronted the european le mans teams',
  'following the failure of that venture fux and anderson have teamed up with the race alliance team for the final three races of the gt series season',
  'we want to make race alliance a recognizable name and have looked for the best drivers possible a spokesman for the team told autosportcom',
  'the plan is to contest the full fia gt series next year as well as the nurburgring hours and then look to nascar after that',
  'in vitantonio liuzzi and mathias lauda the austrian team boasts two drivers of considerable pedigree',
  'inforgraphic deals on wheels',
  'liuzzi spent six

In [None]:
# Persist the cleaned stories. The context manager closes the file, so the
# pickle is fully flushed to Drive before any later cell reads it back.
# NOTE(review): this path sits inside the folder that LoadData lists; consider a separate output folder.
with open('/content/drive/MyDrive/Colab_Notebook/Internship_Name/Suvidha_Foundation/Folder_Name/CNN_stories_dataset/cnn_dataset.pkl', 'wb') as pkl_file:
    dump(cleaned_stories, pkl_file)

In [None]:
# Reload the cleaned stories; the context manager closes the file handle after reading.
# NOTE: unpickling can execute arbitrary code, so only load pickle files this notebook wrote itself.
with open('/content/drive/MyDrive/Colab_Notebook/Internship_Name/Suvidha_Foundation/Folder_Name/CNN_stories_dataset/cnn_dataset.pkl', 'rb') as pkl_file:
    cleaned_stories = load(pkl_file)
print('Loaded Stories %d' % len(cleaned_stories))

Loaded Stories 8


### **Amazon Food Reviews Dataset**

In [None]:
# CSV file on the mounted Drive with the Amazon food reviews; passed to Load_amazon_data
AMAZON_DATA_PATH = '/content/drive/MyDrive/Colab_Notebook/Internship_Name/Suvidha_Foundation/File_Name/Food_review_dataset.csv'

In [None]:
class Load_amazon_data:
    """Loads the review CSV and reduces it to its ``Summary`` and ``Text`` columns."""

    # Columns kept by drop(); the rest of the notebook reads them by these names
    KEPT_COLUMNS = ['Summary', 'Text']

    def __init__(self, dir_path, seed = 0):
        """
        Args:
            dir_path: Path of the CSV file (despite the name, it is handed
                straight to ``pd.read_csv``).
            seed: Seed used for the random sample printed by analyze_data().
        """
        self.dir_path = dir_path
        self.seed = seed
        # Kept for backward compatibility: construction has always seeded
        # NumPy's global generator as a side effect.
        np.random.seed(seed)

    def load(self):
        """
        Reads the CSV file at ``self.dir_path`` and returns it as a DataFrame
        """
        return pd.read_csv(self.dir_path)

    def drop(self):
        """
        Drops incomplete rows and unnecessary columns

        Rows with a missing value in ANY column are removed first, then only
        the ``Summary`` and ``Text`` columns are kept and the index is
        renumbered from 0.
        """
        data = self.load()

        data = data.dropna()
        # Select by name rather than position so a changed column order or
        # extra trailing columns cannot silently pick the wrong data.
        data = data[self.KEPT_COLUMNS]
        data = data.reset_index(drop = True)

        return data

    def analyze_data(self):
        """
        Prints five randomly chosen data points from the cleaned data

        Positions are drawn from [10, 100) as before, but the range is
        clamped to the number of available rows so small frames do not raise
        an IndexError. A generator local to this call, seeded with
        ``self.seed``, makes the printed sample the same on every call.
        """
        data = self.drop()

        row_count = len(data)
        if row_count == 0:
            print("No data points available")
            return

        upper = min(100, row_count)
        lower = 10 if upper > 10 else 0
        rng = np.random.RandomState(self.seed)

        for sr_no, i in enumerate(rng.randint(lower, upper, size = 5)):
            print("_________________________")
            print("Data Point {0}".format(sr_no + 1))
            print("Summary:")
            print(data['Summary'].iloc[i])
            print("Full Text:")
            print(data['Text'].iloc[i])

In [None]:
# Loader for the reviews CSV; seed = 1 is forwarded to np.random.seed by the constructor.
# Note: this rebinds `obj`, which earlier held the LoadData instance.
obj = Load_amazon_data(AMAZON_DATA_PATH, seed = 1)

### **Load the Data**

In [None]:
# Read the raw CSV into a DataFrame and preview its first rows
data = obj.load()
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### **Removing Unnecessary Columns**

In [None]:
# Rebind `data` to the reduced frame returned by drop() (only Summary and Text remain) and preview it
data = obj.drop()
data.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


### **Analyze the Data**

In [None]:
# Print five randomly chosen reviews (summary followed by full text)
obj.analyze_data()

_________________________
Data Point 1
Summary:
Mushy
Full Text:
The flavors are good.  However, I do not see any differce between this and Oaker Oats brand - they are both mushy.
_________________________
Data Point 2
Summary:
Delicious product!
Full Text:
I can remember buying this candy as a kid and the quality hasn't dropped in all these years. Still a superb product you won't be disappointed with.
_________________________
Data Point 3
Summary:
Forget Molecular Gastronomy - this stuff rockes a coffee creamer!
Full Text:
I know the product title says Molecular Gastronomy, but don't let that scare you off.  I have been looking for this for a while now, not for food science, but for something more down to earth.  I use it to make my own coffee creamer.<br /><br />I have to have my coffee blonde and sweet - but the flavored creamers are full of the bad kinds of fat, and honestly, I hate to use manufactured "food" items.  I really don't think they are good for the body.  On the other h

In [None]:
# Lower-case English contractions mapped to their expanded form.
# Data_cleaning.clean_text() looks up whole whitespace-separated tokens here
# before apostrophes are stripped, so a missing entry degrades into
# fragments (e.g. "you've" -> "you ve").
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"won't": "will not",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you're": "you are",
"you've": "you have"
}

In [None]:
class Data_cleaning:
    """Cleans review summaries and texts (lower-casing, contraction expansion, markup and punctuation removal)."""

    def __init__(self):
        self.clean_summaries = []
        self.clean_texts = []
        # Stop-word set, built lazily the first time it is needed
        self._stops = None

    def clean_text(self, text, remove_stopwords = False):
        """
        Defines a series of cleaning operations

        Steps, in order: lower-case; expand whitespace-separated tokens found
        in the module-level ``contractions`` mapping; remove URLs; replace
        ``<br>`` variants, ``<a href`` and punctuation with spaces; remove
        ``&amp;``; optionally drop English stop words.

        Args:
            text: The string to clean.
            remove_stopwords: When True, NLTK's English stop words are
                removed (presumably requires the NLTK ``stopwords`` corpus
                to be downloaded - verify in the runtime).

        Returns:
            The cleaned string. Replaced characters become spaces, so runs
            of spaces can remain.
        """
        text = text.lower()

        # Expand contractions token by token; split() also collapses newlines/tabs
        text = " ".join(contractions.get(word, word) for word in text.split())

        # Remove only the URL itself. The former pattern ('https?://.*') was
        # greedy and deleted everything that followed the URL as well.
        text = re.sub(r'https?://[^\s<>"]+', '', text)
        # Handle line-break tags before punctuation removal, while the '/' is
        # still present; one pattern covers <br>, <br/>, <br /> and <br >.
        text = re.sub(r'<br\s*/?\s*>', ' ', text)
        text = re.sub(r'<a href', ' ', text)
        text = re.sub(r'&amp;', '', text)
        text = re.sub(r'[_"\-;%()|+&=*.,!?:#$@\[\]/]', ' ', text)
        text = re.sub(r'\'', ' ', text)

        # Optionally, removing stop words
        if remove_stopwords:
            stops = getattr(self, '_stops', None)
            if stops is None:
                # Built once per instance instead of once per call
                stops = set(stopwords.words("english"))
                self._stops = stops
            text = " ".join(w for w in text.split() if w not in stops)

        return text

    def clean(self, data):
        """
        Applies the clean_text() function to the entire dataset

        Args:
            data: Object exposing ``Summary`` and ``Text`` iterables of
                strings (the notebook passes a pandas DataFrame).

        Returns:
            ``(clean_summaries, clean_texts)``: two lists of cleaned strings,
            also stored on the instance. The lists are rebuilt on every
            call, so calling clean() twice does not duplicate entries.
        """
        self.clean_summaries = [self.clean_text(summary) for summary in data.Summary]

        print("Summaries are complete")

        self.clean_texts = [self.clean_text(text) for text in data.Text]

        print("Texts are complete")

        return self.clean_summaries, self.clean_texts

In [None]:
# Clean every summary and review text in `data`; clean() returns the two lists of cleaned strings
clean_obj = Data_cleaning()
clean_summaries, clean_texts = clean_obj.clean(data)

Summaries are complete


In [None]:
# Re-seed so the same five positions are drawn on every run of this cell
np.random.seed(1)
sample_positions = np.random.randint(10, 100, size = 5)

# Show each sampled cleaned summary next to its cleaned review text
for point_number, position in enumerate(sample_positions, start = 1):
    print("_________________________")
    print("Data Point #{0}".format(point_number))
    print("Summary: ")
    print(clean_summaries[position])
    print("Full Text: ")
    print(clean_texts[position])