In [20]:
# Import required modules
# pandas is a module to work with dataframes
import pandas as pd

# configparser is a module to work with configuration files
import configparser

In [21]:
# config is an object of RawConfigParser class
config = configparser.RawConfigParser()

# Read the configuration file
config.read("app.properties")

['app.properties']

In [22]:
# csv_file_path is a variable that contains the path of the CSV file that the program will create
# to store the result of scraped data
#
# To update the path of the CSV file path, update the value of csv.path in app.properties file
# specificly in the csv.path at CSVSection section
csv_file_path: str = config.get("CSVSection", "csv.path")

In [23]:
# Read the CSV file and store the data in a dataframe
df = pd.read_csv(csv_file_path)

# Print the csv file as a dataframe
df

Unnamed: 0,Review,Label
0,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t2 10 dolla...,negative
1,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tNow with t...,negative
2,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tAt least y...,positive
3,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tFinally I ...,positive
4,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tFinally he...,positive
...,...,...
875,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tAs of 3/18...,negative
876,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tThis game ...,negative
877,\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tadd hitler...,negative
878,\n\nProduk diterima secara gratis\n\t\t\t\t\t\...,negative


In [24]:
# re is a library that allows to work with regular expressions
import re

# emoji is a library that allows to work with emojis
import emoji

# string is a library that allows to work with strings
import string

# nltk.tokenize is a library that allows to tokenize the text
# word_tokenize is a function that tokenizes the text into words
from nltk.tokenize import word_tokenize

# nltk.corpus is a library that allows to work with the corpus
# stopwords is a list of common words that are not useful for analysis
from nltk.corpus import stopwords

# nltk is a library that allows to work with natural language processing
import nltk

In [25]:
# Downloads the required data for natural language processing
nltk.download("punkt_tab")
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\bryan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bryan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
def tokenize(reviews) -> pd.DataFrame:
    """
    tokenize is a function that tokenizes the text into words

    Parameters
    ----------
    - df: The data that the function will tokenize

    Returns
    -------
    The tokenized data
    """

    # Tokenize the word
    tokens = word_tokenize(reviews)

    # Remove stop words
    stop_words = set(stopwords.words("english"))

    # Remove the stop words from the tokens
    tokens_no_stopwords = [word for word in tokens if word not in stop_words]

    return tokens_no_stopwords

In [27]:
# nltk.stem is a library that allows to work with stemming
# WordNetLemmaTizer is a class that lemmatizes the words
from nltk.stem import WordNetLemmatizer

In [28]:
def lemmatizing(reviews):
    """
    lemmatizing is a function to lemmatize the words

    Parameters
    ----------
    - df: The data that the function will lemmatize

    Returns
    -------
    The lemmatized data
    """

    # factory is a variable that contains the factory to create a lemmatizer
    factory = WordNetLemmatizer()

    # Lemmatize the words
    return [factory.lemmatize(word) for word in reviews]

In [29]:
def pre_process(df: pd.DataFrame) -> pd.DataFrame:
    """'
    pre_process is a function that preprocesses the data before saving it to a CSV file

    Parameters
    ----------
    - df: The data that the function will preprocess

    Returns
    -------
    The preprocessed data
    """

    # Convert the "Review" column to lowercase and remove the leading and trailing whitespaces
    df["Review"] = df["Review"].apply(lambda x: x.lower().strip())

    # Remove the extra whitespaces from the "Review" column
    df["Review"] = df["Review"].apply(lambda x: re.sub("\s+", " ", x))

    # Remove the emojis from the "Review" column
    df["Review"] = df["Review"].apply(lambda x: emoji.replace_emoji(x, replace=""))

    # Remove the punctuation from the "Review" column
    df["Review"] = df["Review"].apply(
        lambda x: x.translate(str.maketrans("", "", string.punctuation))
    )

    # Remove the numbers from the "Review" column
    df["Review"] = df["Review"].apply(lambda x: re.sub(r"\d+", "", x))

    # Call the tokenize function to tokenzie the words
    df["Review"] = df["Review"].apply(lambda x: tokenize(x))

    # Call the lemmatizing function to lemmatize the words
    df["Review"] = df["Review"].apply(lambda x: lemmatizing(x))

    # Join the words in the "Review" column
    df["Review"] = df["Review"].apply(lambda x: " ".join(x))
    
    # Remove the rows where the "Review" column is empty
    df = df[df["Review"] != ""]

    return df

In [30]:
# Call the pre_process function to preprocess the data
df = pre_process(df=df)

# Print the result of the preprocessing process
df

Unnamed: 0,Review,Label
0,dollar battle pass per seasontitanfall died slop,negative
1,new battle pas system im deleting game well pl...,negative
2,least dont build apartment complex win apex,positive
3,finally remove origin,positive
4,finally hereit launch without originthank u re...,positive
...,...,...
875,rce remote code execution exploityou prone get...,negative
876,game make want nothing cry,negative
877,add hitler good,negative
878,produk diterima secara gratis super trash woul...,negative


In [31]:
# sklearn is a module to work with machine learning models
# train_test_split is a function to split the data into training and testing sets
from sklearn.model_selection import train_test_split

# CountVectorizer is a function to convert text data into numerical data
from sklearn.feature_extraction.text import CountVectorizer

# SVG is a class to create a Support Vector Machine model
from sklearn.svm import SVC

# accuracy_score is a function to calculate the accuracy of the model
# classification_report is a function to generate a classification report
from sklearn.metrics import accuracy_score, classification_report

In [32]:
# Create a CountVectorizer object
vectorizer = CountVectorizer(max_features=2500)

In [33]:
# x is a variable that contains the vectorized data
x = vectorizer.fit_transform(df["Review"]).toarray()

# y is a variable that contains the labels or rather the result
y = df["Label"]

In [34]:
# Split the data into training and testing sets
# x_train is the input of training data
# x_test is the input of testing data
# y_train is the results of the training labels
# y_test is the results of the testing labels
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [35]:
# Print the number of training and testing data
print(f"Training data: {len(x_train)}")
print(f"Testing data: {len(x_test)}")

Training data: 700
Testing data: 175


In [36]:
# Create the support vector classifier model
model = SVC()

# Train the model using the training data
model.fit(x_train, y_train)

In [37]:
# y_predictions is a variable that contains the predictions of the model
# based on the testing data
y_predictions = model.predict(x_test)


# Calculate the accuracy of the model

accuracy = accuracy_score(y_test, y_predictions)

In [38]:
# Print the model accuracy
print(f"Accuracy: {accuracy}")

# Print the accuracy of the model
print(f"Clasification report: {classification_report(y_test, y_predictions)}")

Accuracy: 0.6628571428571428
Clasification report:               precision    recall  f1-score   support

    negative       0.78      0.38      0.51        81
    positive       0.63      0.90      0.74        94

    accuracy                           0.66       175
   macro avg       0.70      0.64      0.63       175
weighted avg       0.70      0.66      0.64       175

