## Initialize Notebook

In [None]:
!pip install -q -r requirements.txt

## Reading the dataset

In this code-cell, use Amazon CodeWhisperer to load AMAZON-REVIEW-DATA-CLASSIFICATION.csv into a pandas DataFrame and print its first 10 rows.

In [None]:
# Read dataset from AMAZON-REVIEW-DATA-CLASSIFICATION.csv using pandas library & print the first 10 rows
import pandas as pd

df = pd.read_csv(filepath_or_buffer='AMAZON-REVIEW-DATA-CLASSIFICATION.csv')

# Show the first ten rows as plain text
print(df.iloc[:10])

## Data Exploration

In this code-cell, use Amazon CodeWhisperer to print the distribution of the isPositive field.

In [None]:
# Count the total number of isPositive in df
# (one row per distinct label value, with its frequency)
print(df.loc[:, 'isPositive'].value_counts())

In this code-cell, use Amazon CodeWhisperer to print the number of missing values in each column.

In [None]:
# Print the total number of missing values for each column of the dataset
# (isna() flags missing cells; summing down the rows gives a per-column count)
print(df.isna().sum(axis=0))

## Data Transformation

In this code cell, we present a skeletal structure that you need to complete for typical preprocessing tasks.

In [None]:
from nltk.corpus import stopwords
import re
import nltk

# Download the NLTK resources used by this notebook: WordNet (lemmatizer),
# the English stop-word list, and the Punkt tokenizer data.
# Recent NLTK releases load the tokenizer data from 'punkt_tab' rather than
# 'punkt'; both are requested so nltk.word_tokenize works on old and new versions.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Get a list of stop words from the NLTK library and save it in the stop variable
stop = stopwords.words('english')

# Words that carry sentiment information (negations and their contractions),
# so they must NOT be discarded as stop words.
relevant_words =['against', 'not', 'don', "don't",'ain', 'aren', "aren't", 'couldn', "couldn't",
             'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 
             'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
             'needn', "needn't",'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', 
             "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# Remove relevant words from stop (order of the remaining stop words is preserved)
stop = [word for word in stop if word not in relevant_words]

# Function called process_text that takes a list of text sentences (texts) as input
def process_text(texts):
    """Clean, tokenize and lemmatize a collection of raw texts.

    Each text is lowercased, stripped, cleared of HTML tags and punctuation,
    tokenized with NLTK, filtered against the module-level ``stop`` list and
    lemmatized with the WordNet lemmatizer.

    Parameters
    ----------
    texts : iterable
        Raw texts. Items that are ``None`` or a float NaN are treated as empty
        text; any other non-string item is converted with ``str()``.

    Returns
    -------
    list of str
        One processed string per input item, in the same order, with the
        remaining tokens joined by single spaces.
    """
    # Build these once per call instead of once per token.
    stop_words = set(stop)
    lemmatizer = nltk.WordNetLemmatizer()

    # Create a list to store the processed text
    processed_texts = []

    # Loop through each text in the list
    for text in texts:

        # Treat missing values as empty text. A float NaN is the only float
        # that is not equal to itself; without this check it would be turned
        # into the literal token "nan" by str() below.
        if text is None or (isinstance(text, float) and text != text):
            text = ''

        # Convert non-string data type to a string
        if not isinstance(text, str):
            text = str(text)

        # Convert the text to lowercase
        text = text.lower()

        # Remove leading/trailing white space
        text = text.strip()

        # Remove html tags/markups. This must run BEFORE punctuation removal:
        # once '<' and '>' are stripped the pattern can no longer match and the
        # tag names would stay in the text. Tags are replaced by a space so the
        # words on either side of e.g. "<br />" are not glued together.
        text = re.sub(r'<.*?>', ' ', text)

        # Remove punctuation
        text = re.sub(r'[^\w\s]', '', text)

        # Tokenize the text using nltk
        tokens = nltk.word_tokenize(text)

        # Keep only non-stop-word tokens, lemmatizing each one
        filtered_sentence = [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words
        ]

        # Join the filtered tokens into a single string and store it
        processed_texts.append(' '.join(filtered_sentence))

    # Return the processed text list
    return processed_texts

## Train & Validation Split

In this code-cell, use Amazon CodeWhisperer to split our dataset into training (90%) and validation (10%) sets, using the suggested comment below.

In [14]:
# Split our dataset into training (90%) and validation (10%)
# Pass df[["reviewText"]] as features
# Pass df["isPositive"] as labels
# Random State 324
# Use train_test_split library
from sklearn.model_selection import train_test_split

# 10% of the rows are held out for validation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df[["reviewText"]],
    df["isPositive"],
    test_size=0.1,
    random_state=324,
)

In this code-cell, use Amazon CodeWhisperer to process X_train and X_test and print the first five processed reviews of each.

In [None]:
# Process the reviewText column in X_train & X_test
X_train_processed = process_text(X_train.loc[:, 'reviewText'])
X_test_processed = process_text(X_test.loc[:, 'reviewText'])

# Print the first 5 processed reviews (training set first, then validation set)
print(X_train_processed[:5], X_test_processed[:5], sep='\n')

# Data Processing with Pipeline

In this code-cell, use Amazon CodeWhisperer to create a pipeline that extracts 50 binary-valued features from text and uses K-NN to classify documents.

In [None]:
# Create a pipeline with CountVectorizer and K-NNClassifier, where CountVectorizer will return binary values and use 50 max features
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

pipeline = Pipeline(
    steps=[
        # Step 1: binary bag-of-words features limited to 50 terms
        ('vectorizer', CountVectorizer(max_features=50, binary=True)),
        # Step 2: K-nearest-neighbours classifier with default settings
        ('knn', KNeighborsClassifier()),
    ]
)

# Visualize the pipeline
from sklearn import set_config

set_config(display='diagram')
pipeline

# Train the Classifier

In this code-cell, use Amazon CodeWhisperer to train the classifier

In [None]:
# Fit the pipeline to the training data
# (processed review strings as inputs, isPositive labels as targets)
pipeline.fit(X=X_train_processed, y=y_train)

# Test the Classifier

In this code-cell, use Amazon CodeWhisperer to test the classifier

In [None]:
# Use the fitted pipeline to make predictions on the validation dataset
# Evaluate the model using the following tools: confusion_matrix, classification_report, and accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Predicted labels for the held-out validation reviews
y_pred = pipeline.predict(X_test_processed)

print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

# Left as the last expression so the notebook displays the accuracy value
accuracy_score(y_true=y_test, y_pred=y_pred)