# <center>Init</center>

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, WordPunctTokenizer
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import pickle

---
# <center>Prep</center>

In [2]:
def prep():
  """Load the labelled review files and derive the stop-word list.

  Reads the Amazon, IMDb and Yelp tab-separated files under ``./files``,
  stacks them into a single frame with the columns ``Review`` and ``Label``,
  and builds the stop words used by the preprocessing step.

  Returns:
    tuple: ``(reviews_df, stop_words, reviews_list)`` where ``reviews_df`` is
    the concatenated DataFrame, ``stop_words`` is the list of tokens to drop,
    and ``reviews_list`` holds one string per row of ``reviews_df['Review']``
    (its sentences, as split by ``sent_tokenize``, joined by single spaces).
  """
  source_paths = (
    './files/amazon_cells_labelled.txt',
    './files/imdb_labelled.txt',
    './files/yelp_labelled.txt',
  )
  # NOTE(review): on_bad_lines='skip' discards unparsable rows silently --
  # confirm the loaded row counts match the source files.
  frames = [
    pd.read_csv(path, sep='\t', encoding='ISO-8859-1', on_bad_lines='skip', names=['Review', 'Label'])
    for path in source_paths
  ]
  reviews_df = pd.concat(frames, axis=0, ignore_index=True)

  # Punctuation tokens are treated as stop words alongside the NLTK list.
  punctuation = ['.', ',', "'", '"', '?', '!', '-', '/', ':', '(', ')', '\n', '@']
  # These words are excluded from the stop-word list, so they survive
  # filtering (presumably because negations carry sentiment -- confirm).
  keep = {"but", "or", "against", "on", "off", "both", "no", "nor", "not", "only", "same", "don'", "don't", "ain'", "aren'", "aren't", "could'", "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'wouldn', "wouldn't", 'ain', 'aren', 'couldn'}
  candidates = stopwords.words('english') + punctuation
  stop_words = [token for token in candidates if token not in keep]

  # Sentence-split each review, then re-join with single spaces.
  reviews_list = [' '.join(sent_tokenize(text)) for text in reviews_df['Review']]

  return reviews_df, stop_words, reviews_list

---
# <center>Preprocessing</center>

In [3]:
def preprocess_reviews(stop_words, reviews_list):
    """Tokenise every review, drop stop words and normalise what remains.

    Each review is split with ``WordPunctTokenizer`` and every token is
    lower-cased. Tokens found in ``stop_words`` are discarded. Of the rest:

    * tokens ending in ``'ness'`` are stemmed with ``LancasterStemmer``;
    * tokens whose POS tag starts with ``'VB'`` are lemmatised as verbs
      with ``WordNetLemmatizer``;
    * everything else is kept in its lower-cased form.

    Args:
        stop_words: iterable of tokens to drop (compared against the
            lower-cased token).
        reviews_list: iterable of review strings.

    Returns:
        list[str]: one space-joined string of kept tokens per input review,
        in input order.
    """
    # Built once: constructing these per sentence/token was loop-invariant work.
    tokenizer = WordPunctTokenizer()
    stemmer = LancasterStemmer()
    lemmatizer = WordNetLemmatizer()
    # Constant-time membership; the caller's collection is left untouched.
    stop_set = frozenset(stop_words)

    filtered_reviews = []
    for sentence in reviews_list:
        filtered_words = []
        for word in tokenizer.tokenize(sentence):
            word_lower = word.lower()
            if word_lower in stop_set:
                continue

            if word_lower.endswith('ness'):  # Nouns
                filtered_words.append(stemmer.stem(word_lower))
            # The tag is only needed when the '-ness' rule did not fire, so it
            # is computed lazily. Each token is tagged on its own, in its
            # original casing, exactly as before.
            elif pos_tag([word])[0][1].startswith('VB'):  # Verbs
                filtered_words.append(lemmatizer.lemmatize(word_lower, pos='v'))
            else:
                filtered_words.append(word_lower)

        filtered_reviews.append(" ".join(filtered_words))
    return filtered_reviews

---
# <center>Vectorize</center>

In [4]:
def vectorize(reviews):
  """Encode the reviews as a dense TF-IDF feature table.

  Args:
    reviews: iterable of review strings.

  Returns:
    pandas.DataFrame: one row per review and one column per vocabulary term
    reported by the vectorizer's ``get_feature_names_out()``.
  """
  vectorizer = TfidfVectorizer()
  # fit_transform learns the vocabulary and encodes the corpus in one pass,
  # instead of tokenising everything twice with fit() followed by transform().
  tfidf_matrix = vectorizer.fit_transform(reviews)
  return pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

---
# <center>Define | Split X and Y</center>

In [5]:
def define_split_x_y(vectorized_reviews, labels):
    """Pair the features with their labels and split into train/test sets.

    The feature frame is no longer modified. Previously a ``'labels'`` column
    was written into it and then dropped again, which mutated the caller's
    DataFrame and silently discarded any feature column that happened to be
    named ``'labels'``.

    Args:
        vectorized_reviews: DataFrame of features, one row per review.
        labels: target values. A Series is aligned to the feature index;
            any other sequence is matched to the rows by position.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)`` from
        ``train_test_split`` with ``test_size=0.25`` and ``random_state=0``.
        The target Series is named ``'labels'``.
    """
    x = vectorized_reviews

    if isinstance(labels, pd.Series):
        # Same index alignment that column assignment performed before.
        y = labels.reindex(x.index)
    else:
        y = pd.Series(labels, index=x.index)
    # Keep the name the target carried when it was read back from the frame.
    y = y.rename('labels')

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

    return x_train, x_test, y_train, y_test

---
# <center>Train | Predict</center>

In [6]:
def train_predict(x_train, x_test, y_train):
  """Fit a default ``XGBClassifier`` and predict on the held-out features.

  Args:
    x_train: training features.
    x_test: features to predict on.
    y_train: training targets.

  Returns:
    tuple: ``(fitted_model, predictions)`` where ``fitted_model`` is the
    value returned by ``fit`` and ``predictions`` is the output of its
    ``predict`` on ``x_test``.
  """
  classifier = xgb.XGBClassifier()
  fitted_model = classifier.fit(X=x_train, y=y_train)
  predictions = fitted_model.predict(x_test)
  return fitted_model, predictions

---
# <center>Pickle</center>

In [7]:
def pickles(stop_words, xgb_model):
  """Serialise the stop words and the model to pickle files under ``./files``.

  Args:
    stop_words: object written to ``./files/stop_words.pickle``.
    xgb_model: object written to ``./files/xgb_model.pickle``.

  Each file is opened in a ``with`` block so its handle is flushed and
  closed as soon as the dump finishes, instead of being left open.
  """
  with open('./files/stop_words.pickle', 'wb') as stop_words_file:
    pickle.dump(stop_words, stop_words_file)
  with open('./files/xgb_model.pickle', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

---
# <center>Evaluate</center>

In [8]:
def evaluate(y_test, x_test_pred):
  """Return ``roc_auc_score`` for the test targets and the predictions.

  Args:
    y_test: true targets of the test split.
    x_test_pred: predictions for the test split.

  NOTE(review): this notebook passes the output of ``predict`` here, while
  ROC AUC is usually computed from scores or probabilities -- confirm
  whether ``predict_proba`` was intended.
  """
  auc = roc_auc_score(y_test, x_test_pred)
  return auc

---
# <center>Function Calls</center>

In [9]:
# Load the review files and build the stop-word list and per-review strings.
reviews_df, stop_words, reviews_list = prep()

In [10]:
# Drop stop words and normalise the remaining tokens of every review.
filtered_reviews = preprocess_reviews(stop_words, reviews_list)

In [11]:
# Encode the cleaned reviews as a TF-IDF feature table.
vectorized_reviews = vectorize(filtered_reviews)

In [12]:
# Split the features and the 'Label' column into train and test partitions.
x_train, x_test, y_train, y_test = define_split_x_y(vectorized_reviews, reviews_df['Label'])

In [13]:
# Fit the classifier on the training split and predict on the test features.
xgb_model, x_test_pred = train_predict(x_train, x_test, y_train)

In [14]:
# Write the stop words and the fitted model to pickle files under ./files.
pickles(stop_words, xgb_model)

In [15]:
# Score the test predictions against the true test labels with ROC AUC.
eval_score = evaluate(y_test, x_test_pred)

In [16]:
# Display the score computed in the previous cell.
eval_score

0.7907927976806091