In [8]:
import pandas as pd
import re
import nltk
import contractions

# Fetch every NLTK resource this notebook relies on so it runs on a fresh
# environment: 'punkt' (tokenizer models), 'wordnet' (used by the
# WordNetLemmatizer) and 'stopwords' (read via nltk.corpus.stopwords when the
# stopword list is built; without it that call raises LookupError).
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


## cleaning function
def utils_preprocess_text(txt, punkt=True, lower=True, slang=True, stopwords=None, stemm=False, lemm=True):
    """Clean a raw text and return it as a single space-separated string.

    Processing order:
      1. drop URLs and e-mail addresses,
      2. insert a space after a '.' that is glued to the next word,
      3. expand contractions (``slang``),
      4. strip punctuation / special characters (``punkt``),
      5. strip digits,
      6. lowercase (``lower``),
      7. split on whitespace, then stem (``stemm``), lemmatize (``lemm``)
         and drop stopwords (``stopwords``).

    URLs, e-mail addresses and contractions are handled *before* punctuation
    is stripped: their patterns rely on characters such as ':', '/', '@' and
    "'" which would otherwise already be gone.

    Parameters
    ----------
    txt : object
        Value to clean; converted with ``str``. ``None`` and float NaN are
        treated as missing and yield an empty string.
    punkt : bool
        If True, remove every character that is not a letter, digit or
        whitespace.
    lower : bool
        If True, lowercase the text.
    slang : bool
        If True, expand contractions with ``contractions.fix``.
    stopwords : iterable of str or None
        Tokens to drop after stemming/lemmatization; None keeps everything.
    stemm : bool
        If True, apply NLTK's Porter stemmer to each token.
    lemm : bool
        If True, apply NLTK's WordNet lemmatizer to each token.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    ### missing values: avoid leaking the literal tokens "none" / "nan"
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ""
    txt = str(txt)
    ### remove urls (while ':' '/' '.' are still present, so the whole url goes)
    txt = re.sub(r'http\S+', '', txt)
    ### remove email addresses (while '@' is still present)
    txt = re.sub(r'\S+@\S+', '', txt)
    ### separate sentences with '. '
    txt = re.sub(r'\.(?=[^ \W\d])', '. ', txt)
    ### slang (needs the apostrophes, so it runs before punctuation removal)
    if slang is True:
        txt = contractions.fix(txt)
    ### remove punctuations and characters
    if punkt is True:
        txt = re.sub(r'[^A-Za-z0-9\s]+', '', txt)
    ### remove numeric characters
    txt = re.sub(r'\d+', '', txt)
    ### lowercase
    if lower is True:
        txt = txt.lower()
    ### tokenize (convert from string to list); split() also collapses extra spaces
    lst_txt = txt.split()
    ### stemming (remove -ing, -ly, ...)
    if stemm is True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_txt = [ps.stem(word) for word in lst_txt]
    ### lemmatization (convert the word into root word)
    if lemm is True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_txt = [lem.lemmatize(word) for word in lst_txt]
    ### remove stopwords (set lookup instead of scanning a list per token)
    if stopwords is not None:
        stop_set = frozenset(stopwords)
        lst_txt = [word for word in lst_txt if word not in stop_set]
    ### back to string
    return " ".join(lst_txt)




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:

# Load the training split; its "article" and "highlights" columns are the
# ones cleaned and tokenized in the following cells.
dtf_train = pd.read_csv(filepath_or_buffer="cnn_dailymail/train.csv")


In [11]:
# Build the stopword list: NLTK's English stopwords plus corpus-specific and
# generic filler words.
stopwords = nltk.corpus.stopwords.words("english")
stopwords += ["cnn", "say", "said", "new", "one", "two", "also"]
stopwords += ["just", "like", "get", "make", "time", "even", "much", "many", "way", "thing", "need", "take", "well", "could", "would", "should", "might", "must", "however", "yet", "still", "rather", "either", "neither", "whether", "meanwhile"]
# Drop repeated entries while keeping the original order.
stopwords = list(dict.fromkeys(stopwords))

# Apply the same cleaning options to both the articles and the summaries.
# A frozenset is passed so each token's stopword check is a hash lookup
# rather than a scan of the whole list.
clean_options = dict(stopwords=frozenset(stopwords), stemm=False, slang=False, lemm=False)
dtf_train["text_clean"] = dtf_train["article"].apply(utils_preprocess_text, **clean_options)
dtf_train["y_clean"] = dtf_train["highlights"].apply(utils_preprocess_text, **clean_options)

In [13]:
# Show the cleaned text of the row labelled 0.
dtf_train.loc[0, "text_clean"]

'associated press published est october updated est october bishop fargo catholic diocese north dakota exposed potentially hundreds church members fargo grand forks jamestown hepatitis virus late september early october state health department issued advisory exposure anyone attended five churches took communion bishop john folda pictured fargo catholic diocese north dakota exposed potentially hundreds church members fargo grand forks jamestown hepatitis state immunization program manager molly howell says risk low officials feel important alert people possible exposure diocese announced monday bishop john folda taking diagnosed hepatitis diocese says contracted infection contaminated food attending conference newly ordained bishops italy last month symptoms hepatitis include fever tiredness loss appetite nausea abdominal discomfort fargo catholic diocese north dakota pictured bishop located'

In [14]:
# Show the raw article of the row labelled 0.
dtf_train.loc[0, "article"]

"By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained 

### Pad the tokenized sequences and define a simple encoder-decoder model.

In [15]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Hyper-parameters shared by the tokenizer and the model
max_features = 10000   # vocabulary size given to the Tokenizer (num_words)
maxlen = 500           # length every sequence is padded/truncated to
latent_dim = 256       # size used for the encoder-decoder model

# Fit the vocabulary on the cleaned articles; unknown words map to "<OOV>"
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_features)
tokenizer.fit_on_texts(dtf_train["text_clean"].values)

# Convert articles and summaries to integer ids with the same tokenizer,
# then pad both to `maxlen`
article_sequences = tokenizer.texts_to_sequences(dtf_train["text_clean"].values)
summary_sequences = tokenizer.texts_to_sequences(dtf_train["y_clean"].values)
X = pad_sequences(article_sequences, maxlen=maxlen)
y = pad_sequences(summary_sequences, maxlen=maxlen)



In [16]:
# Encoder: embed the article token ids and keep only the final LSTM states,
# which are handed to the decoder as its initial state.
encoder_inputs = Input(shape=(maxlen,))
encoder_embedded = Embedding(max_features, latent_dim)(encoder_inputs)
encoder_outputs, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_embedded)
encoder_states = [state_h, state_c]

# Decoder: embed the summary token ids, run an LSTM that starts from the
# encoder states, and predict a distribution over the vocabulary per step.
decoder_inputs = Input(shape=(maxlen,))
decoder_embedded = Embedding(max_features, latent_dim)(decoder_inputs)
decoder_hidden = LSTM(latent_dim, return_sequences=True)(decoder_embedded, initial_state=encoder_states)
decoder_outputs = Dense(max_features, activation="softmax")(decoder_hidden)

# Define the model and compile it.
# The targets are integer token ids (output of pad_sequences), not one-hot
# vectors, so the sparse variant of the cross-entropy loss is required;
# "categorical_crossentropy" would expect targets shaped like the softmax
# output.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")



In [None]:
# Train the model with teacher forcing: at step t the decoder is fed the
# target token of step t-1 and must predict the token of step t. Feeding `y`
# itself as decoder input would let the network copy its input instead of
# learning to generate the summary.
# The decoder input is `y` shifted right by one position; the first position
# is filled with 0, the value pad_sequences uses for padding, so the array
# keeps the (n_samples, maxlen) shape the model expects.
decoder_input_data = y.copy()
decoder_input_data[:, 1:] = y[:, :-1]
decoder_input_data[:, 0] = 0

model.fit([X, decoder_input_data], y, batch_size=64, epochs=10, validation_split=0.2)
