# Import packages

In [1]:
import numpy as np  # Work with multi-dimenional data
import pandas as pd # Work with relational data
import matplotlib.pyplot as plt # Visualize data
import seaborn as sns # Visualize data base on matplotlib
import nltk
import pickle
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from scipy import sparse
from preset_function import *

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

nltk.download('punkt')

import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aupho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Dataset

This is the same dataset we used for model selection. In this notebook, we use the entire dataset (the train, validation and test files combined) to train the models.

In [2]:
directory = 'data/dataset/raw/'


def extract_data(files, data_dir=directory):
    """Read ``text;label`` records from every file in ``files``.

    Parameters
    ----------
    files : iterable of str
        File names, resolved relative to ``data_dir``.
    data_dir : str, optional
        Folder containing the files. The default is bound to the value that
        ``directory`` holds when this cell runs, so rebinding the global
        ``directory`` later (it is reused for model output folders) does not
        change where the raw data is read from.

    Returns
    -------
    tuple of (list of str, list of str)
        The texts and their labels, in the order the lines were read.

    Raises
    ------
    ValueError
        If a non-blank line does not contain a ';' separator.
    """
    import os

    data_x_raw = []
    data_y_raw = []
    for file in files:
        path = os.path.join(data_dir, file)
        # Explicit encoding, consistent with how the stop-word files are read.
        with open(path, encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip('\n')
                if not line:
                    # A blank line (e.g. a trailing newline) is not a record.
                    continue
                # Split on the LAST ';' so a text that itself contains ';'
                # is kept whole instead of breaking the two-value unpacking.
                parts = line.rsplit(';', 1)
                if len(parts) != 2:
                    raise ValueError(
                        f"{path}:{line_number}: expected 'text;label', got {line!r}"
                    )
                x_raw, y_raw = parts

                data_x_raw.append(x_raw)
                data_y_raw.append(y_raw)

    return data_x_raw, data_y_raw

In [3]:
# Combine the train, validation and test files into one training set.
X_train, y_train = extract_data(
    files=['train.txt', 'val.txt', 'test.txt'],
)

# Necessary preprocess

In [4]:
# Read the custom English stop-word file, split it on newlines, and print the entry count.
with open("data/stopwords/stop_words_english.txt", 'r', encoding='utf-8') as file:
    stopword_list = file.read().split('\n')
print(len(stopword_list))

850


In [5]:
# Build the combined stop-word list: the custom file plus NLTK's English list.
# The file is read inside a `with` block so its handle is always closed.
with open("data/stopwords/stop_words_english.txt", 'r', encoding='utf-8') as file:
    stop_words = file.read().split('\n')

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
tmp = stopwords.words('english')

# Plain concatenation: no de-duplication is performed, so a word present in
# both sources appears twice in the combined list.
unfiltered_stopwords = stop_words + tmp

print(len(unfiltered_stopwords))

1029


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aupho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
stemmer = PorterStemmer()

# Snapshot of the stop words as a frozenset: membership tests are O(1) instead
# of a linear scan of the list for every token.
_unfiltered_stopword_set = frozenset(unfiltered_stopwords)


def unfiltered_tokenize(text):
    """Tokenize ``text``, drop stop words, and stem what remains.

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list of str
        Porter stems of the tokens produced by ``word_tokenize`` that are not
        in ``unfiltered_stopwords``. Filtering happens before stemming, so it
        is the unstemmed token that is compared against the stop-word list.
    """
    return [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in _unfiltered_stopword_set
    ]

# Train and export

This section is split into 4 subsections, one for each of the 4 ways we process the data.

## Bag of words

First, we recreate the count vectorizer from the original dataset. Then, we vectorize the 2 datasets with that vectorizer.

In [7]:
# Count vectorizer shared by the pipelines below: custom tokenizer, 1- and 2-grams.
count_vector = CountVectorizer(
    tokenizer=unfiltered_tokenize,
    stop_words=unfiltered_stopwords,
    ngram_range=(1, 2),
)
# Folder prefix used when the fitted pipelines below are pickled.
directory = "data/models/BoW/"

### kNN

In [8]:
# kNN on count features: fit on the combined dataset, then pickle the fitted pipeline.
classifier = KNeighborsClassifier(
    n_neighbors=2,
    p=1,
    weights='distance',
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "kNN.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Naive Bayes

In [9]:
# Multinomial Naive Bayes on count features: fit on the combined dataset, then pickle.
classifier = MultinomialNB(alpha=0.5)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "NB.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Decision Tree

In [10]:
# Decision tree on count features: fit on the combined dataset, then pickle.
classifier = DecisionTreeClassifier(
    max_depth=2500,
    min_samples_leaf=2,
    min_samples_split=1000,
    max_leaf_nodes=310,
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "DT.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Support vector machine

In [11]:
# Support vector classifier on count features: fit on the combined dataset, then pickle.
classifier = SVC(
    C=50.0,
    gamma=0.001291549665014884,
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "SVM.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Logistic Regression (OvR)

In [12]:
# One-vs-rest logistic regression (elastic net, saga) on count features: fit, then pickle.
classifier = LogisticRegression(
    C=3.1622776601683795,
    l1_ratio=0.9,
    multi_class='ovr',
    penalty='elasticnet',
    solver='saga',
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "LR.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Random forest

In [13]:
# Random forest on count features: fit on the combined dataset, then pickle.
# NOTE(review): no random_state is passed, so the exported forest is not
# guaranteed to be identical between runs - consider fixing a seed.
classifier = RandomForestClassifier(
    max_depth=1000,
    max_features=200,
    min_samples_split=25,
    n_estimators=512,
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "RF.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Softmax regression

In [14]:
# Softmax (multinomial logistic) regression on count features: fit, then pickle.
classifier = LogisticRegression(
    C=1,
    l1_ratio=0.9,
    multi_class='multinomial',
    penalty='elasticnet',
    solver='saga',
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "SR.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

## BoW/TF-IDF

In [15]:
# TF-IDF vectorizer shared by the pipelines below: custom tokenizer, 1- and 2-grams.
tfidf_vector = TfidfVectorizer(
    tokenizer=unfiltered_tokenize,
    stop_words=unfiltered_stopwords,
    ngram_range=(1, 2),
)
# Folder prefix used when the fitted pipelines below are pickled.
directory = 'data/models/TF-IDF/'

### kNN

In [16]:
# kNN on TF-IDF features: fit on the combined dataset, then pickle the fitted pipeline.
classifier = KNeighborsClassifier(
    n_neighbors=24,
    p=2,
    weights='uniform',
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "kNN.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Naive Bayes

In [17]:
# Multinomial Naive Bayes on TF-IDF features: fit on the combined dataset, then pickle.
classifier = MultinomialNB(alpha=0.1)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "NB.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Decision Tree

In [18]:
# Decision tree on TF-IDF features: fit on the combined dataset, then pickle.
classifier = DecisionTreeClassifier(
    max_depth=2000,
    min_samples_leaf=3,
    min_samples_split=500,
    max_leaf_nodes=250,
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "DT.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Support vector machine

In [19]:
# Linear-kernel support vector classifier on TF-IDF features: fit, then pickle.
classifier = SVC(
    kernel='linear',
    C=1.25,
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "SVM.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Logistic Regression (OvR)

In [20]:
# One-vs-rest logistic regression (elastic net, saga) on TF-IDF features: fit, then pickle.
classifier = LogisticRegression(
    C=25.118864315095795,
    l1_ratio=0.9,
    multi_class='ovr',
    penalty='elasticnet',
    solver='saga',
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "LR.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Random forest

In [21]:
# Random forest on TF-IDF features: fit on the combined dataset, then pickle.
# NOTE(review): no random_state is passed, so the exported forest is not
# guaranteed to be identical between runs - consider fixing a seed.
classifier = RandomForestClassifier(
    max_depth=5000,
    max_features=300,
    min_samples_split=25,
    n_estimators=256,
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "RF.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Softmax regression

In [22]:
# Softmax (multinomial logistic) regression on TF-IDF features: fit, then pickle.
classifier = LogisticRegression(
    C=3.1622776601683795,
    l1_ratio=0.9,
    multi_class='multinomial',
    penalty='elasticnet',
    solver='saga',
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "SR.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

## Bag of words (L1 regularization removal)

Vectorize in the new way:

In [23]:
# Extra stop words: whitespace-separated tokens read from useless.txt.
# NOTE(review): presumably the terms discarded by the L1-regularization step
# named in the section heading - confirm where this file is generated.
# The file is read inside a `with` block so its handle is always closed.
with open("data/stopwords/useless.txt", 'r', encoding='utf-8') as file:
    useless = file.read().split()

L1_stopwords = unfiltered_stopwords + useless
print("Length of new stopwords list:", len(L1_stopwords))

# Frozenset snapshot of the (long) stop-word list so each token is checked in
# O(1) instead of by a linear scan of the list.
_L1_stopword_set = frozenset(L1_stopwords)


def useless_tokenize(text):
    """Tokenize ``text``, drop every word in ``L1_stopwords``, and stem the rest.

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list of str
        Porter stems of the tokens produced by ``word_tokenize`` that are not
        in ``L1_stopwords``. Filtering happens before stemming.
    """
    return [
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in _L1_stopword_set
    ]


# Count vectorizer and output folder prefix for the pipelines that follow.
count_vector = CountVectorizer(stop_words=L1_stopwords, tokenizer=useless_tokenize, ngram_range=(1, 2))
directory = 'data/models/BoW L1/'

Length of new stopwords list: 10622


### kNN

In [24]:
# kNN on count features (extended stop-word list): fit on the combined dataset, then pickle.
classifier = KNeighborsClassifier(
    n_neighbors=3,
    p=1,
    weights='uniform',
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "kNN.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Naive Bayes

In [25]:
# Multinomial Naive Bayes on count features (extended stop-word list): fit, then pickle.
classifier = MultinomialNB(alpha=0.3)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "NB.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Decision Tree

In [26]:
# Decision tree on count features (extended stop-word list): fit, then pickle.
classifier = DecisionTreeClassifier(
    max_depth=300,
    min_samples_split=1000,
    max_leaf_nodes=340,
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "DT.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Support vector machine

In [27]:
# Support vector classifier on count features (extended stop-word list): fit, then pickle.
classifier = SVC(
    C=60.0,
    gamma=0.0016681005372000592,
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "SVM.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Logistic Regression (OvR)

In [28]:
# One-vs-rest logistic regression (elastic net, saga) on count features
# (extended stop-word list): fit on the combined dataset, then pickle.
classifier = LogisticRegression(
    C=3.1622776601683795,
    l1_ratio=0.5,
    multi_class='ovr',
    penalty='elasticnet',
    solver='saga',
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "LR.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Random forest

In [29]:
# Random forest on count features (extended stop-word list): fit, then pickle.
# NOTE(review): no random_state is passed, so the exported forest is not
# guaranteed to be identical between runs - consider fixing a seed.
classifier = RandomForestClassifier(
    max_depth=1000,
    max_features=100,
    min_samples_split=25,
    n_estimators=256,
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "RF.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Softmax regression

In [30]:
# Softmax (multinomial logistic) regression with an L1 penalty on count features
# (extended stop-word list): fit on the combined dataset, then pickle.
classifier = LogisticRegression(
    C=1,
    penalty='l1',
    solver='saga',
    multi_class='multinomial',
)
pipeline = Pipeline(steps=[('vectorizer', count_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "SR.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

## BoW/TF-IDF (L1 regularization removal)

In [31]:
# TF-IDF vectorizer built on the extended stop-word list and its tokenizer, 1- and 2-grams.
tfidf_vector = TfidfVectorizer(
    tokenizer=useless_tokenize,
    stop_words=L1_stopwords,
    ngram_range=(1, 2),
)
# Folder prefix used when the fitted pipelines below are pickled.
directory = 'data/models/TF-IDF L1/'

### kNN

In [32]:
# kNN on TF-IDF features (extended stop-word list): fit on the combined dataset, then pickle.
classifier = KNeighborsClassifier(
    n_neighbors=40,
    p=2,
    weights='distance',
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "kNN.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Naive Bayes

In [33]:
# Multinomial Naive Bayes on TF-IDF features (extended stop-word list): fit, then pickle.
classifier = MultinomialNB(alpha=0.075)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "NB.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Decision Tree

In [34]:
# Decision tree on TF-IDF features (extended stop-word list): fit, then pickle.
classifier = DecisionTreeClassifier(
    max_depth=2000,
    min_samples_leaf=4,
    min_samples_split=500,
    max_leaf_nodes=253,
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "DT.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Support vector machine

In [35]:
# Support vector classifier on TF-IDF features (extended stop-word list): fit, then pickle.
classifier = SVC(
    C=56.0,
    gamma=0.01,
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "SVM.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Logistic Regression (OvR)

In [36]:
# One-vs-rest logistic regression (L1 penalty, liblinear) on TF-IDF features
# (extended stop-word list): fit on the combined dataset, then pickle.
classifier = LogisticRegression(
    C=4.5,
    penalty='l1',
    solver='liblinear',
    multi_class='ovr',
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "LR.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Random forest

In [37]:
# Random forest on TF-IDF features (extended stop-word list): fit, then pickle.
# NOTE(review): no random_state is passed, so the exported forest is not
# guaranteed to be identical between runs - consider fixing a seed.
classifier = RandomForestClassifier(
    max_depth=2000,
    max_features=100,
    min_samples_split=25,
    n_estimators=256,
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "RF.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)

### Softmax regression

In [38]:
# Softmax (multinomial logistic) regression with an L1 penalty on TF-IDF features
# (extended stop-word list): fit on the combined dataset, then pickle.
classifier = LogisticRegression(
    C=2,
    penalty='l1',
    solver='saga',
    multi_class='multinomial',
)
pipeline = Pipeline(steps=[('vectorizer', tfidf_vector), ('clf', classifier)])
pipeline.fit(X_train, y_train)

model_path = directory + "SR.pkl"
with open(model_path, 'wb') as f:
    pickle.dump(pipeline, f)