In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

##### import data

In [None]:
# The reviews ship as a TSV (tab-separated values) file; read_table splits on
# tabs by default.
# quoting = 3 keeps quote characters as ordinary text instead of treating
# them as field delimiters.
df = pd.read_table('../data/Restaurant_Reviews.tsv', quoting = 3)

In [None]:
# Show the loaded frame to sanity-check the parse; later cells read its
# 'Review' column and take the last column as the label.
df

##### clean texts

In [None]:
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
# Stop-word lists, read below through stopwords.words('english').
nltk.download('stopwords')

# WordNet data for the WordNetLemmatizer created below.
# Lemmatization is generally more accurate than stemming, but slower.
nltk.download('wordnet')
# NOTE(review): presumably the Open Multilingual Wordnet data that the WordNet
# reader expects in recent NLTK releases — confirm against the installed version.
nltk.download('omw-1.4')

In [None]:
# Accumulator for the cleaned reviews; the cleaning loop appends to it.
corpus = list()

# Two interchangeable normalizers: the stemmer (only referenced from the
# commented-out alternative) and the lemmatizer that the cleaning code uses.
ps, lz = PorterStemmer(), WordNetLemmatizer()

# A set gives faster membership tests than a list. 'not' is taken out of the
# stop words so that it is kept in the cleaned reviews.
stop_words = {*stopwords.words('english')}
stop_words.remove('not')

In [None]:
def clean_review(text):
    """Turn one raw review into a cleaned, space-joined string of lemmas.

    Steps, in order: expand contractions, replace every non-letter with a
    space, lowercase, split on whitespace, drop stop words, lemmatize the rest.

    Relies on the notebook-level `lz` (lemmatizer) and `stop_words` (set).
    To use stemming instead, swap `lz.lemmatize` for `ps.stem`.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    text = contractions.fix(text) # can't -> cannot, won't -> will not
    text = re.sub('[^a-zA-Z]', ' ', text) # non-alphabets to space
    tokens = text.lower().split() # tokenize
    return ' '.join(lz.lemmatize(tok) for tok in tokens if tok not in stop_words)


# Rebuilt from scratch on every run: appending to the list created in an
# earlier cell would duplicate every review if this cell were executed twice,
# leaving the corpus longer than the label column taken from df.
corpus = [clean_review(review) for review in df['Review']]

In [None]:
# Show only the first few cleaned reviews; printing the whole corpus floods
# the cell output with every review in the dataset.
corpus[:5]

##### create bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Tuning note: build CountVectorizer() with no arguments first to learn the
# full vocabulary size (the number of columns of x), then set max_features to
# keep only the n most frequent words.
# NOTE(review): the vocabulary is learned from the whole corpus before the
# train/test split, so test-set words influence the features — confirm this
# is acceptable for the evaluation.
cv = CountVectorizer(max_features = 1700)

counts_sparse = cv.fit_transform(corpus)
x = counts_sparse.toarray() # dense nd array instead of a sparse matrix

# labels: the last column of the frame
y = df.iloc[:, -1].to_numpy()

In [None]:
# number of bag-of-words columns, i.e. the vocabulary size that was kept
x.shape[1]

##### split the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.2,
    random_state = 0,
)

##### train svm

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, trained on the
# bag-of-words rows of the training split.
classifier = SVC(kernel = 'linear')

# Kept as the cell's last expression, so its return value is what the cell shows.
classifier.fit(x_train, y_train)

In [None]:
# Predicted labels for the held-out rows; compared against y_test in the
# evaluation cell.
y_pred = classifier.predict(x_test)

##### evaluate model

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# Accuracy is a fair summary when the classes are balanced; f1 is the more
# telling number when they are imbalanced, so both are reported.
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_test, y_pred)

print('confusion matrix: \n', cm)
print('\naccuracy:', accuracy)
print('f1_score:', f1)

___

##### predict on a review

In [None]:
# Sample review to classify; edit the text to try other inputs.
review = "I love this place"

In [None]:
# Apply the same cleaning steps as the training corpus, in the same order:
# expand contractions, blank out non-letters, lowercase, tokenize, drop stop
# words, lemmatize, re-join. Keep this in sync with the corpus cleaning code,
# otherwise the vectorizer sees differently shaped text at prediction time.
review = re.sub('[^a-zA-Z]', ' ', contractions.fix(review)).lower()

# stemming alternative: replace lz.lemmatize with ps.stem
review = ' '.join(
    lz.lemmatize(word)
    for word in review.split()
    if word not in stop_words
)

In [None]:
# Inspect the cleaned text before it is vectorized.
print(review)

In [None]:
# Encode the cleaned review with the already-fitted vectorizer `cv`, so its
# columns line up with the features the classifier was trained on.
# NOTE(review): this rebinds `review` from a string to an array, so the cell
# looks unsafe to run twice without re-running the cleaning cells first.
review = cv.transform([review]).toarray()

In [None]:
# Predicted label for the sample review.
# NOTE(review): presumably 1 = positive and 0 = negative — confirm against the
# label column of the dataset.
print(classifier.predict(review))