<a href="https://colab.research.google.com/github/arzoozehra/CIND820/blob/main/unsupervised_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import libraries**

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
#!pip install contractions
import contractions
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")
#!pip install pyspellchecker
#from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense, Dropout
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.losses import SparseCategoricalCrossentropy
from tensorflow.python.keras.optimizer_v2.adam import Adam

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


**Load data**

In [10]:
# Both CSV files are fetched straight from the project's GitHub repository.
url = "https://raw.githubusercontent.com/arzoozehra/CIND820/main/data/train.csv"

# Training frame: discard the identifier and selected-span columns, then
# remove every row that still contains a missing value.
train = (
    pd.read_csv(url)
    .drop(columns=["textID", "selected_text"])
    .dropna()
)

# Test frame: only the identifier column is discarded; no rows are filtered.
test = pd.read_csv(
    "https://raw.githubusercontent.com/arzoozehra/CIND820/main/data/test.csv"
).drop(columns=["textID"])

**Clean data**

In [11]:
def clean_data(data):
  """Return a cleaned copy of `data` whose "text" column is normalised.

  The following steps are applied to `data["text"]`, in this order:
    1. lowercase the text (missing values are treated as empty strings);
    2. turn backticks into apostrophes and expand contractions,
       e.g. "gonna" -> "going to", "i've" -> "i have";
    3. blank out @mentions, URLs, a leading "rt", tokens containing digits,
       1-2 letter words, and every character that is not an ASCII letter,
       digit, space or tab;
    4. collapse runs of spaces and strip both ends;
    5. drop English stopwords, then Porter-stem and WordNet-lemmatize
       each remaining token.

  # Arguments
      data: pandas DataFrame with a "text" column of strings.
  # Returns
      A new DataFrame with the cleaned "text" column; the frame passed in
      is left unmodified.
  """
  # Work on a copy so re-running a cell never cleans already-cleaned text
  # through a shared reference to the caller's frame.
  data = data.copy()

  # Convert text to lowercase. A missing value would otherwise reach
  # contractions.fix as NaN, so it is replaced by an empty string first.
  text = data["text"].fillna("").str.lower()

  # Expand contractions e.g "gonna" to "going to" and "i've" to "i have".
  # Backticks are normalised to apostrophes first so "i`ve" is recognised.
  text = text.str.replace("`", "'", regex=False)
  text = text.apply(contractions.fix)

  # Alternatives are tried left to right at each position, so the mention
  # rule must come before the generic "non-alphanumeric" rule (which would
  # otherwise consume only the "@" and leave the handle behind).
  noise = (
      r"(@\w+)"               # @mentions (handle included)
      r"|([^0-9A-Za-z \t])"   # punctuation, emojis, other non-ASCII characters
      r"|(\w+://\S+)"         # URLs with an explicit scheme
      r"|^rt"                 # retweet marker at the start of the text
      r"|http\S*"             # remaining tokens that start with "http"
      r"|\w*\d\w*"            # words containing digits
      r"|\b\w{1,2}\b"         # 1 or 2 letter words
  )
  text = text.str.replace(noise, " ", regex=True)

  # Remove extra whitespaces
  text = text.str.replace(r" +", " ", regex=True).str.strip()

  # A set gives constant-time membership tests for the stopword filter.
  stop = set(stopwords.words("english"))
  stemmer = PorterStemmer()
  lemmatizer = WordNetLemmatizer()

  def _normalise_tokens(sentence):
    # Drop stopwords, then stem and lemmatize each surviving token.
    return " ".join(
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in sentence.split()
        if word not in stop
    )

  data["text"] = text.apply(_normalise_tokens)

  return data

In [12]:
# Run the same cleaning pipeline over the training frame first, then the
# testing frame, and rebind each name to its cleaned result.
train, test = [clean_data(frame) for frame in (train, test)]


**Feature Selection**

In [31]:
# Vectorization parameters

def ngram_vectorize(train_texts, train_labels, val_texts):
  """Vectorizes texts as ngram vectors.
  1 text = 1 tf-idf vector the length of vocabulary of uni-grams + bi-grams.

  The vocabulary and idf weights are learned from `train_texts` only and
  then applied unchanged to `val_texts`.

  # Arguments
      train_texts: list, training text strings.
      train_labels: np.ndarray, training labels. Currently unused: no
          label-based feature selection is applied. The parameter is kept
          so existing callers keep working.
      val_texts: list, validation text strings.
  # Returns
      x_train, x_val: dense arrays holding the vectorized training and
          validation texts (one row per text, one column per n-gram).
  """
  # Range (inclusive) of n-gram sizes for tokenizing text.
  # Use 1-grams + 2-grams.
  NGRAM_RANGE = (1, 2)

  # Whether text should be split into word or character n-grams.
  # Split text into word tokens.
  TOKEN_MODE = 'word'

  # Minimum document/corpus frequency below which a token will be discarded.
  MIN_DOCUMENT_FREQUENCY = 5

  # Tokens that appear in more than this proportion of documents are discarded.
  MAX_DOCUMENT_FREQUENCY = 0.8

  # Create keyword arguments to pass to the 'tf-idf' vectorizer.
  kwargs = {
          "ngram_range": NGRAM_RANGE,
          "analyzer": TOKEN_MODE,
          "min_df": MIN_DOCUMENT_FREQUENCY,
          "max_df": MAX_DOCUMENT_FREQUENCY,
          # Must be a real boolean: the string "True" is rejected by
          # scikit-learn's parameter validation.
          "sublinear_tf": True,
  }
  vectorizer = TfidfVectorizer(**kwargs)

  # Learn vocabulary from training texts and vectorize training texts.
  x_train = vectorizer.fit_transform(train_texts).toarray()

  # Vectorize validation texts.
  x_val = vectorizer.transform(val_texts).toarray()

  return x_train, x_val

In [22]:
# Build the feature matrices in this cell so it also runs on a fresh kernel:
# otherwise x_train / x_val only exist after the training cell further down
# has been executed. The training cell recomputes them, which is redundant
# but harmless.
x_train, x_val = ngram_vectorize(train["text"], train["sentiment"], test["text"])

# (number of texts, number of n-gram features) for each split.
print(np.shape(x_train))
print(np.shape(x_val))

(27480, 5713)
(3534, 5713)


**Supervised modelling using TensorFlow**

In [17]:
def mlp_model(layers, units, dropout_rate, input_shape, op_units=3, op_activation='softmax'):
    """Builds a multi-layer perceptron (the returned model is not compiled).

    Layer stack: input Dropout, then `layers - 1` repetitions of
    [Dense(relu) -> Dropout], then the output Dense layer.

    # Arguments
        layers: int, number of `Dense` layers in the model, counting the
            output layer.
        units: int, output dimension of each hidden `Dense` layer.
        dropout_rate: float, fraction of inputs dropped by every Dropout layer.
        input_shape: tuple, shape of a single input sample.
        op_units: int, number of output classes.
        op_activation: activation of the output layer (softmax for multiclass).

    # Returns
        A `Sequential` MLP model instance.
    """
    network = models.Sequential()

    # Dropout is applied directly to the input features; this first layer
    # also declares the input shape of the network.
    network.add(Dropout(rate=dropout_rate, input_shape=input_shape))

    # The output layer is one of the `layers` Dense layers, hence the -1.
    hidden_count = layers - 1
    for _ in range(hidden_count):
        hidden_pair = (Dense(units=units, activation='relu'),
                       Dropout(rate=dropout_rate))
        for layer in hidden_pair:
            network.add(layer)

    # One output unit per class.
    network.add(Dense(units=op_units, activation=op_activation))
    return network

**Train model**

In [None]:
# Trains the n-gram MLP model on the cleaned `train` / `test` frames.
#
# Hyperparameters:
#   learning_rate: float, learning rate for training model.
#   epochs: int, maximum number of epochs (early stopping may end sooner).
#   batch_size: int, number of samples per batch.
#   layers: int, number of `Dense` layers in the model.
#   units: int, output dimension of Dense layers in the model.
#   dropout_rate: float, fraction of input to drop at Dropout layers.
learning_rate=1e-3
epochs=1000
batch_size=128
layers=2
units=64
dropout_rate=0.2

# Encode train and test labels. The encoder is fitted on the training labels
# only and then reused for the test labels, so both splits share one
# class -> integer mapping (an unseen test label raises instead of being
# silently re-numbered).
le = LabelEncoder()
train_labels = le.fit_transform(train["sentiment"])
test_labels = le.transform(test["sentiment"])

# Vectorize texts.
x_train, x_val = ngram_vectorize(train["text"], train_labels, test["text"])

# Create model instance.
model = mlp_model(layers=layers,
                  units=units,
                  dropout_rate=dropout_rate,
                  input_shape=x_train.shape[1:])

# Compile model with learning parameters.
loss = 'sparse_categorical_crossentropy'
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

# Create callback for early stopping on validation loss. If the loss does
# not decrease in two consecutive tries, stop training.
# NOTE(review): the test split doubles as the validation set here, so the
# reported validation metrics are not an independent test estimate.
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]

# Train and validate model.
training_run = model.fit(
        x_train,
        train_labels,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val, test_labels),
        verbose=2,  # Logs once per epoch.
        batch_size=batch_size
)

# Print results (metrics of the last epoch that was run).
history = training_run.history
print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

# Save model.
model.save('imdb_mlp_model.h5')
print(history['val_acc'][-1], history['val_loss'][-1])

Epoch 1/1000
215/215 - 6s - loss: 0.9643 - acc: 0.5455




Epoch 2/1000
215/215 - 4s - loss: 0.7572 - acc: 0.6857




Epoch 3/1000
215/215 - 5s - loss: 0.6891 - acc: 0.7157




Epoch 4/1000
215/215 - 4s - loss: 0.6589 - acc: 0.7283




Epoch 5/1000
215/215 - 4s - loss: 0.6370 - acc: 0.7379




Epoch 6/1000
215/215 - 4s - loss: 0.6254 - acc: 0.7440




Epoch 7/1000
215/215 - 7s - loss: 0.6041 - acc: 0.7525




Epoch 8/1000
215/215 - 4s - loss: 0.5938 - acc: 0.7559




Epoch 9/1000
215/215 - 4s - loss: 0.5834 - acc: 0.7624




Epoch 10/1000


In [None]:
# def train_ngram_model(train, test,
#                       learning_rate=1e-3,
#                       epochs=100,
#                       batch_size=128,
#                       layers=2,
#                       units=64,
#                       dropout_rate=0.2):
#     """Trains n-gram model on the given dataset.

#     # Arguments
#         train, test: tuples of training and test texts and labels.
#         learning_rate: float, learning rate for training model.
#         epochs: int, number of epochs.
#         batch_size: int, number of samples per batch.
#         layers: int, number of `Dense` layers in the model.
#         units: int, output dimension of Dense layers in the model.
#         dropout_rate: float: percentage of input to drop at Dropout layers.
#     """


#     #(train_texts, train_labels), (val_texts, val_labels) = data

#     # Vectorize texts.
#     x_train, x_val = ngram_vectorize(train["text"], train["sentiment"], test["text"])
    
#     # Create model instance.
#     model = mlp_model(layers=layers,
#                       units=units,
#                       dropout_rate=dropout_rate,
#                       input_shape=x_train.shape[1:])

#     # Compile model with learning parameters.
#     loss = 'sparse_categorical_crossentropy'
#     optimizer = Adam(lr=learning_rate)
#     model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

#     # Create callback for early stopping on validation loss. If the loss does
#     # not decrease in two consecutive tries, stop training.
#     callbacks = [EarlyStopping(monitor='val_loss', patience=2)]

#     # Train and validate model.
#     history = model.fit(
#             x_train,
#             train["sentiment"],
#             epochs=epochs,
#             callbacks=callbacks,
#             validation_data=(x_val, test["sentiment"]),
#             verbose=2,  # Logs once per epoch.
#             batch_size=batch_size)

#     # Print results.
#     history = history.history
#     print('Validation accuracy: {acc}, loss: {loss}'.format(
#             acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

#     # Save model.
#     model.save('imdb_mlp_model.h5')
#     return history['val_acc'][-1], history['val_loss'][-1]

In [None]:
#train_ngram_model(train, test)

In [None]:
#model.get_config()

In [None]:
# from tensorflow.keras.utils import plot_model
# plot_model(model, to_file='tfNN_model.png')