In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk import ngrams
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split

# word_tokenize relies on the Punkt sentence-tokenizer data. NLTK 3.8.2 and
# later load it from the 'punkt_tab' package rather than the pickled 'punkt'
# one, so fetch both to keep the notebook runnable on old and new releases.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Step 1: Read the review dataset (the cells below use its 'review' and 'polarity' columns)
data = pd.read_csv('../data/google_play_store_apps_reviews.csv')

# Step 2: Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [114]:
# Step 3: Build the n-gram Language Model
def get_ngrams(text, n):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return its n-grams.

    Args:
        text: The string to tokenize.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A list of n-gram tuples, in the order they occur in ``text``.
    """
    return [*ngrams(nltk.word_tokenize(text), n)]

def train_ngram(data, n):
    """Count n-gram occurrences separately for positive and negative reviews.

    Args:
        data: DataFrame with a 'review' text column and a 'polarity' label
            column (1 = positive, 0 = negative). Rows carrying any other
            label are tokenized but contribute to neither distribution.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A ``(positive_freq, negative_freq)`` pair of ``FreqDist`` objects
        mapping each n-gram tuple to its count within that class.
    """
    # One bucket of n-grams per recognised polarity label.
    grams_by_label = {1: [], 0: []}

    for _, row in data.iterrows():
        review_grams = get_ngrams(row['review'], n)
        bucket = grams_by_label.get(row['polarity'])
        if bucket is not None:
            bucket.extend(review_grams)

    return FreqDist(grams_by_label[1]), FreqDist(grams_by_label[0])

# Bigram model: count token pairs per class on the training split.
n = 2
positive_freq, negative_freq = train_ngram(train_data, n)

# Quick sanity check: the ten most frequent bigrams of each class, positive first.
print(positive_freq.most_common(10), negative_freq.most_common(10), sep='\n')

[(('!', '!'), 108), (('.', 'i'), 48), (('it', "'s"), 45), (('this', 'app'), 35), (('it', 'is'), 33), ((',', 'but'), 27), (('i', 'love'), 25), (('of', 'the'), 25), (('.', 'it'), 21), (('this', 'game'), 21)]
[(('.', 'i'), 124), (('ca', "n't"), 91), (('!', '!'), 84), (('i', 'have'), 72), (('does', "n't"), 67), (('do', "n't"), 66), (('the', 'app'), 66), (('it', "'s"), 64), (('.', 'it'), 56), ((',', 'i'), 47)]


In [115]:
import math
from collections import Counter

def test_ngram(data, positive_freq, negative_freq, n):
    """Predict a polarity label for every review in ``data``.

    Each review is scored under both classes by summing add-one-smoothed
    log-probabilities of its distinct n-grams. N-grams seen in neither
    training distribution are skipped, and repeated n-grams inside one
    review are counted once. No class prior is applied.

    Args:
        data: DataFrame with a 'review' text column.
        positive_freq: N-gram counts collected from positive reviews.
        negative_freq: N-gram counts collected from negative reviews.
        n: Number of consecutive tokens per n-gram (should match training).

    Returns:
        A list with one label per row: 1 when the positive score is at least
        the negative score (ties go to positive), otherwise 0.
    """
    # Smoothing denominators are per-class constants, so compute them once.
    vocab_size = len(set(positive_freq.keys()) | set(negative_freq.keys()))
    log_pos_total = math.log(sum(positive_freq.values()) + vocab_size)
    log_neg_total = math.log(sum(negative_freq.values()) + vocab_size)

    def classify(review):
        pos_score = 0
        neg_score = 0
        # dict.fromkeys drops duplicate n-grams while keeping first-seen order.
        for gram in dict.fromkeys(get_ngrams(review, n)):
            if gram not in positive_freq and gram not in negative_freq:
                continue  # out-of-vocabulary n-gram: ignored for both classes
            pos_score += math.log(positive_freq.get(gram, 0) + 1) - log_pos_total
            neg_score += math.log(negative_freq.get(gram, 0) + 1) - log_neg_total
        return 1 if pos_score >= neg_score else 0

    return [classify(row['review']) for _, row in data.iterrows()]

In [117]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Ground-truth labels and model predictions for the held-out split.
true_labels = test_data['polarity'].tolist()
pred_labels = test_ngram(test_data, positive_freq, negative_freq, n)

# Precision, recall and F1 use sklearn's default pos_label=1, i.e. they
# describe how well positive reviews are detected.
accuracy = accuracy_score(true_labels, pred_labels)
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)
# Computed here but not printed in this cell.
cm = confusion_matrix(true_labels, pred_labels)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.8547486033519553
Precision: 0.8461538461538461
Recall: 0.6226415094339622
F1 Score: 0.717391304347826
