In [2]:
import nltk
import spacy

# Download WordNet data (if not already downloaded), plus the 'omw-1.4' package;
# presumably both are needed by WordNetLemmatizer at lemmatization time.
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem.wordnet import WordNetLemmatizer
# Load spaCy's small English pipeline with the 'parser' and 'ner' components
# switched off; in this notebook `nlp` is only used for its default stop-word set.
nlp = spacy.load("en_core_web_sm", disable = ['parser', 'ner'])


import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, Bidirectional
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [7]:
# Preprocessing function
def tokenize_and_lemmatize(df):
    """
    Lowercase, stop-word-filter and lemmatize the 'Text' column of a DataFrame.

    Each text is split on whitespace. Tokens whose lowercase form is in spaCy's
    default stop-word set are dropped; the remaining lowercase tokens are passed
    through NLTK's WordNetLemmatizer (default part-of-speech setting) and
    re-joined with single spaces. Missing values become empty strings and
    non-string values are converted with ``str`` before splitting.

    The input frame is NOT modified: a new DataFrame is returned, so the raw
    text stays available to the caller and re-running the cell is idempotent.

    :param df: DataFrame with a 'Text' column.
    :return: New DataFrame, same columns and order as ``df``, whose 'Text'
             column holds the processed text.
    :raises KeyError: if ``df`` has no 'Text' column.
    """
    if 'Text' not in df.columns:
        raise KeyError("DataFrame must contain a 'Text' column")

    lemmatizer = WordNetLemmatizer()
    stop_words = nlp.Defaults.stop_words  # Default stop words from spaCy

    def preprocess(text):
        # Lowercase each token once, then filter and lemmatize the lowercase form.
        lowered = (word.lower() for word in text.split())
        return ' '.join(
            lemmatizer.lemmatize(word) for word in lowered if word not in stop_words
        )

    # fillna('') handles missing reviews; astype(str) guards against stray
    # non-string cells that would otherwise fail on .split().
    processed_text = df['Text'].fillna('').astype(str).apply(preprocess)

    # assign() returns a new frame and leaves the caller's DataFrame untouched.
    return df.assign(Text=processed_text)
from google.colab import drive

# Make Google Drive visible to the Colab runtime so the CSV below can be read.
drive.mount('/content/drive')

# Read the reviews file and run the text preprocessing over its 'Text' column.
reviews_csv_path = '/content/drive/MyDrive/nlp_project/reviews.csv'
df = tokenize_and_lemmatize(pd.read_csv(reviews_csv_path))

# Preview the first rows of the processed frame.
df.head()

Mounted at /content/drive


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,bought vitality canned dog food product found ...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,product arrived labeled jumbo salted peanuts.....
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all","confection centuries. light, pillowy citrus ge..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,looking secret ingredient robitussin believe f...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,great taffy great price. wide assortment yummy...


In [9]:
# Pull out the model inputs (processed 'Text') and the labels ('Score').
inputs_text, targets_num = df['Text'], df['Score']

# Sanity check shown as the cell output: both columns must have the same length.
len(inputs_text) == len(targets_num)

True

In [10]:
# Configure the TF-IDF vectorizer: scikit-learn's English stop-word list is
# applied, and the min_df / max_df thresholds prune very rare and very common terms.
# NOTE(review): the vectorizer is fitted on the full corpus, i.e. before the
# train/test split performed later in the notebook, so held-out documents
# contribute to the vocabulary and IDF weights — confirm this is intended.
vectorizer = TfidfVectorizer(
    min_df=50,
    max_df=.80,
    stop_words='english',
)

# Learn the vocabulary and produce the document-term TF-IDF matrix in one pass.
tfidf_matrix = vectorizer.fit_transform(inputs_text)

In [11]:
# `inputs` is simply a second name for the TF-IDF matrix: no conversion,
# reshaping or copy takes place here.
inputs = tfidf_matrix

# Print the shape of the resulting vectors to confirm
print(f"Shape of input vectors: {inputs.shape}")

Shape of input vectors: (568454, 12714)


In [12]:
from sklearn.preprocessing import OneHotEncoder

# Dense (non-sparse) one-hot encoder for the 'Score' labels.
encoder = OneHotEncoder(sparse_output=False)

# The encoder expects a 2-D array, so reshape the scores into a single column.
score_column = np.array(targets_num).reshape(-1, 1)

# Learn the distinct score values and encode every row in one step.
targets = encoder.fit_transform(score_column)

In [13]:
# Spot-check one encoded label: the row at index 1 has Score 1 in the preview
# of `df`, so its one-hot vector is expected to be set in the first position
# (assuming the encoder orders categories ascending — TODO confirm).
print(targets[1])

[1. 0. 0. 0. 0.]


In [14]:
# Hold out 30% of the rows for evaluation. A fixed random_state pins the
# shuffle, so the split (and every metric computed from it) is reproducible
# after a kernel restart instead of changing on each run.
x_train, x_test, y_train, y_test = train_test_split(
    inputs, targets, test_size=.3, random_state=42
)

In [None]:
# Width of one input vector, read from the training matrix instead of being
# hard-coded, so the model keeps matching the data if the TF-IDF vocabulary
# changes (different corpus, min_df/max_df, etc.).
n_features = x_train.shape[1]

# 1-D CNN over the feature vector (treated as a length-n_features sequence
# with one channel), followed by a small dense classifier with a 5-way softmax.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_features, 1)),
    tf.keras.layers.Conv1D(filters=16, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the held-out test split doubles as validation data here, so
# validation metrics are not an independent estimate once used for tuning.
# The recorded run took roughly 4957 s for the first epoch.
history = model.fit(x_train, y_train, epochs = 6, validation_data = (x_test, y_test))

Epoch 1/6
[1m12435/12435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4957s[0m 398ms/step - accuracy: 0.6896 - loss: 0.8570 - val_accuracy: 0.7386 - val_loss: 0.7163
Epoch 2/6
[1m 9074/12435[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m19:56[0m 356ms/step - accuracy: 0.7446 - loss: 0.6961

In [1]:
# Print the layer-by-layer architecture. Requires `model` from the training
# cell to exist in the current kernel session (it raises NameError otherwise).
model.summary()

NameError: name 'model' is not defined

In [2]:
# Tabulate the per-epoch metrics recorded by fit() and draw one line per metric.
training_curves = pd.DataFrame(history.history)
training_curves.plot()

NameError: name 'pd' is not defined

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
# Run the trained model on the held-out split. The final layer is a 5-unit
# softmax, so each prediction row is expected to hold one value per class.
y_test_predict = model.predict(x_test)
# Shown as the cell output to confirm the dimensions of the prediction array.
y_test_predict.shape

In [None]:
# argmax turns the one-hot targets and softmax outputs into class indices
# 0..k-1. Those indices follow the order of the encoder's fitted categories,
# so use the original Score values as tick labels instead of raw indices.
score_labels = encoder.categories_[0]

# `labels` pins the matrix to every known class (even one missing from this
# split) so it always lines up with `display_labels`. normalize='true' scales
# each row by the number of true samples in that class.
ConfusionMatrixDisplay.from_predictions(np.argmax(y_test, axis = -1),
                                       np.argmax(y_test_predict, axis = -1),
                                       labels = np.arange(len(score_labels)),
                                       display_labels = score_labels,
                                       normalize = 'true',
                                       values_format = '.0%')