# IEMS5780 - Assignment 1

> 1155130306 Junru Zhong 鍾鈞儒
>
> Last modified Oct 13, 2019

In [4]:
# Imports
import glob
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

## 1. Data Preparation

The following function does the combining and splitting work. It returns a tuple of pandas dataframes, which correspond to the training and test sets.

Because I am working on a Windows machine, the paths are written with backslashes '\'. If you are using a Unix machine, please change them to forward slashes '/'.

In [2]:
def combine(dataset_path, is_shuffle=False, save_path=None, random_state=None):
    """Combine the train and test dataset, then re-split it 70/30.

    Reads every ``*.txt`` review under ``<dataset_path>/{train,test}/{pos,neg}``
    and labels positive reviews 1 and negative reviews 0.

    :param: dataset_path: str, root directory of the dataset
    :param: is_shuffle: boolean, shuffle the rows before splitting
    :param: save_path: str, prefix to which 'train.csv' / 'test.csv' are
        appended (normally a directory path ending in a separator);
        None for don't save
    :param: random_state: seed forwarded to the shuffle and to the split;
        None (default) keeps them non-deterministic
    :return: (training_dataframe, test_dataframe): tuple
    :raises ValueError: if no review file is found under dataset_path
    """
    print('Date pre-processing...')
    data = []
    # Same read order as before: positive (train, test), then negative
    # (train, test). os.path.join keeps the patterns valid on every OS.
    for sentiment, label in (('pos', 1), ('neg', 0)):
        for split in ('train', 'test'):
            pattern = os.path.join(dataset_path, split, sentiment, '*.txt')
            for filename in glob.glob(pattern):
                with open(filename, 'r', encoding='utf8') as f:
                    data.append([f.read().strip(), label])

    # Fail early with a readable message instead of the generic error that
    # train_test_split raises for an empty input.
    if not data:
        raise ValueError(
            'No review files found under {!r}'.format(dataset_path))

    # Load datalist into DataFrame
    df = pd.DataFrame(data, columns=['comment', 'attitude'])
    # Shuffle
    if is_shuffle:
        df = df.sample(frac=1, random_state=random_state)
    # Split the dataset
    df_train, df_test = train_test_split(
        df, test_size=0.3, random_state=random_state)
    # Save DataFrame to csv file.
    if save_path is not None:
        # newline='' lets the csv writer control line endings; without it a
        # text-mode handle on Windows produces an empty row after each record.
        with open(save_path + 'train.csv', 'w', encoding='utf8', newline='') as f:
            df_train.to_csv(f)
        with open(save_path + 'test.csv', 'w', encoding='utf8', newline='') as f:
            df_test.to_csv(f)
    # Return the dataframe.
    return df_train, df_test

## 2. Using a Naive Bayes Classification

In this section, a pipeline is built that reads the data, vectorizes it with `CountVectorizer` or `TfidfVectorizer`, and then trains a Naive Bayes classifier.

In [3]:
def naive_bayes_count(train, test):
    """Train and evaluate a Naive Bayes classifier with count vectorizer.

    Fits a ``CountVectorizer`` -> ``MultinomialNB`` pipeline on the training
    set, predicts the test set and prints the classification report.

    :param train: training set. pandas Dataframe with 'comment' and
        'attitude' columns.
    :param test: test set. pandas Dataframe with the same columns.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Naive Bayes model with unigram CountVectorize...')
    # Start up a Pipeline
    pipe = Pipeline([
        ('vec', CountVectorizer()),
        ('nb', MultinomialNB())
    ])
    # Train the model on the raw documents and their labels.
    pipe.fit(train['comment'], train['attitude'])
    # Do prediction on the held-out documents.
    y_pred = pipe.predict(test['comment'])
    # Get report.
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

Use `TfidfVectorizer`

In [4]:
def naive_bayes_tfidf(train, test):
    """Train and evaluate a Naive Bayes classifier with Tf-Idf vectorizer.

    Fits a ``TfidfVectorizer`` -> ``MultinomialNB`` pipeline on the training
    set, predicts the test set and prints the classification report.

    :param train: training set. pandas Dataframe with 'comment' and
        'attitude' columns.
    :param test: test set. pandas Dataframe with the same columns.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Naive Bayes model with unigram TfidfVectorize...')
    # Start up a Pipeline
    pipe = Pipeline([
        ('vec', TfidfVectorizer()),
        ('nb', MultinomialNB())
    ])
    # Train the model on the raw documents and their labels.
    pipe.fit(train['comment'], train['attitude'])
    # Do prediction on the held-out documents.
    y_pred = pipe.predict(test['comment'])
    # Get report.
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

## 3. Using Logistic Regression

In this section, a pipeline is built that reads the data, vectorizes it with `CountVectorizer` or `TfidfVectorizer`, and then trains a logistic regression classifier.

In [5]:
def logistic_regression_count(train, test):
    """Train and evaluate a logistic regression classifier with count vectorizer.

    Fits a ``CountVectorizer`` -> ``LogisticRegression`` pipeline on the
    training set, predicts the test set and prints the classification report.

    :param train: training set. pandas Dataframe with 'comment' and
        'attitude' columns.
    :param test: test set. pandas Dataframe with the same columns.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Logistic Regression model with unigram CountVectorize...')
    # Start up a Pipeline
    pipe = Pipeline([
        ('vec', CountVectorizer()),
        ('log', LogisticRegression())
    ])
    # Train the model on the raw documents and their labels.
    pipe.fit(train['comment'], train['attitude'])
    # Do prediction on the held-out documents.
    y_pred = pipe.predict(test['comment'])
    # Get report.
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

Use `TfidfVectorizer`.

In [6]:
def logistic_regression_tfidf(train, test):
    """Train and evaluate a logistic regression classifier with Tf-idf vectorizer.

    Fits a ``TfidfVectorizer`` -> ``LogisticRegression`` pipeline on the
    training set, predicts the test set and prints the classification report.

    :param train: training set. pandas Dataframe with 'comment' and
        'attitude' columns.
    :param test: test set. pandas Dataframe with the same columns.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Logistic Regression model with unigram TfidfVectorize...')
    # Start up a Pipeline
    pipe = Pipeline([
        ('vec', TfidfVectorizer()),
        ('log', LogisticRegression())
    ])
    # Train the model on the raw documents and their labels.
    pipe.fit(train['comment'], train['attitude'])
    # Do prediction on the held-out documents.
    y_pred = pipe.predict(test['comment'])
    # Get report.
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

## 4. Bi-Gram Models
Repeat all experiments with bi-gram.

### Naive Bayes Models

In [7]:
def naive_bayes_count_bigram(train, test):
    """Train and evaluate a Naive Bayes classifier with a bigram count vectorizer.

    Fits a ``CountVectorizer(ngram_range=(1,2))`` -> ``MultinomialNB``
    pipeline (unigram and bigram counts) on the training set, predicts the
    test set and prints the classification report.

    :param train: training set. pandas Dataframe with 'comment' and
        'attitude' columns.
    :param test: test set. pandas Dataframe with the same columns.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Naive Bayes model with bigram CountVectorize...')
    # Start up a Pipeline
    pipe = Pipeline([
        ('vec', CountVectorizer(ngram_range=(1,2))),
        ('nb', MultinomialNB())
    ])
    # Train the model on the raw documents and their labels.
    pipe.fit(train['comment'], train['attitude'])
    # Do prediction on the held-out documents.
    y_pred = pipe.predict(test['comment'])
    # Get report.
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

### Logistic Regression Models

In [8]:
def logistic_regression_count_bigram(train, test):
    """Train and evaluate a logistic regression classifier with a bigram count vectorizer.

    Fits a ``CountVectorizer(ngram_range=(1,2))`` -> ``LogisticRegression``
    pipeline (unigram and bigram counts) on the training set, predicts the
    test set and prints the classification report.

    :param train: training set. pandas Dataframe with 'comment' and
        'attitude' columns.
    :param test: test set. pandas Dataframe with the same columns.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Logistic Regression model with biigram CountVectorize...')
    # Start up a Pipeline
    pipe = Pipeline([
        ('vec', CountVectorizer(ngram_range=(1,2))),
        ('log', LogisticRegression())
    ])
    # Train the model on the raw documents and their labels.
    pipe.fit(train['comment'], train['attitude'])
    # Do prediction on the held-out documents.
    y_pred = pipe.predict(test['comment'])
    # Get report.
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

In [11]:
# Data preprocessing. Please fill in the path of your dataset (and an output
# prefix instead of None if the split should be written to CSV).
train, test = combine('D:\\Datasets\\aclImdb\\', True, None)
# Run all models, in the same order as the comparison table below.
for run_experiment in (
        naive_bayes_count,
        naive_bayes_tfidf,
        logistic_regression_count,
        logistic_regression_tfidf,
        naive_bayes_count_bigram,
        logistic_regression_count_bigram,
):
    run_experiment(train, test)

## Model Comparison

| Model | Accuracy | Precision (pos) | Recall (pos) | Precision (neg) | Recall (neg) |
| :---: | :------: | :-------------: | :----------: | :-------------: | :----------: |
| NB-Unigram | 0.84 | 0.87 | 0.81 | 0.82 | 0.88 |
| NB-Tfidf | 0.86 | 0.88 | 0.83 | 0.84 | 0.88 |
| Logistic-Unigram | 0.88 | 0.88 | 0.89 | 0.89 | 0.88 |
| Logistic-Tfidf | 0.89 | 0.89 | 0.90 | 0.90 | 0.88 |
| NB-Bigram | 0.88 | 0.89 | 0.86 | 0.87 | 0.90 |
| Logistic-Bigram | 0.91 | 0.90 | 0.91 | 0.91 | 0.90 |

From the results above, we can see that the best-performing model was the **Logistic Regression model with bigram CountVectorizer**. So this model will be saved in a later section and used for the Telegram chatbot.


## 5. fastText
Now train a fastText model on the movie comments.

In [11]:
# Preprocess the data by fastText format.
import csv

def pre_process_fasttext(dataset_path, save_path):
    """Dump training set and test set from txt files with labels.

    Each review becomes one line of the form ``__label__<class> <text>``.
    All reviews under ``<dataset_path>/{train,test}/{pos,neg}`` are pooled,
    shuffled and split 70/30 into 'train.txt' and 'test.txt'.

    :param dataset_path: path to dataset. str
    :param save_path: prefix to which 'train.txt' / 'test.txt' are appended
        (normally a directory path ending in a separator). str
    """
    # NOTE(review): the negative label is misspelled ('negaitive'). It is left
    # untouched here because consumers of the saved model may depend on the
    # exact label text - confirm before renaming.
    labelled_dirs = (
        ('pos', '__label__positive '),
        ('neg', '__label__negaitive '),
    )
    lines = []
    # Same read order as before: positive (train, test), then negative
    # (train, test). os.path.join keeps the patterns valid on every OS.
    for sentiment, prefix in labelled_dirs:
        for split in ('train', 'test'):
            pattern = os.path.join(dataset_path, split, sentiment, '*.txt')
            for filename in glob.glob(pattern):
                with open(filename, 'r', encoding='utf8') as f:
                    # One example per line is required, so fold line breaks
                    # and whitespace runs inside a review into single spaces.
                    lines.append(prefix + ' '.join(f.read().split()))

    # Split the dataset (train_test_split shuffles by default).
    lines_train, lines_test = train_test_split(lines, test_size=0.3)

    # Create the output directory when save_path points into one.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write the lines verbatim. Going through a CSV writer with QUOTE_NONE
    # would insert an escape character before every comma in the text.
    for name, subset in (('train.txt', lines_train), ('test.txt', lines_test)):
        with open(save_path + name, 'w', encoding='utf8', newline='\n') as f:
            f.writelines(line + '\n' for line in subset)

# Change both paths to match your machine: the first argument is the dataset
# root, the second is the output prefix that 'train.txt' and 'test.txt' are
# appended to.
pre_process_fasttext('D:\\Datasets\\aclImdb\\', 'D:\\Datasets\\aclImdb\\fastText\\')

In [15]:
# Train a fastText model.
from fasttext import train_supervised

def train_fasttext(train_path, test_path, epoch, learning_rate, n_gram):
    """Train a supervised fastText model and print its test result.

    :param train_path: path of the training file, passed to fastText as ``input``. str
    :param test_path: path of the file handed to ``model.test``. str
    :param epoch: forwarded to fastText as ``epoch``.
    :param learning_rate: forwarded to fastText as ``lr``.
    :param n_gram: forwarded to fastText as ``wordNgrams``.
    :return: the trained fastText model.
    """
    # Collect the hyper-parameters first so the training call stays short.
    hyper_params = {
        'epoch': epoch,
        'lr': learning_rate,
        'wordNgrams': n_gram,
        'verbose': 2,
        'minCount': 1,
    }
    classifier = train_supervised(input=train_path, **hyper_params)
    # Evaluate on the test file and show whatever fastText reports.
    metrics = classifier.test(test_path)
    print(metrics)
    return classifier

# Call the train function. After the two file paths, the positional arguments
# are epoch=25, learning_rate=1 and n_gram=2.
ft_model = train_fasttext('D:\\Datasets\\aclImdb\\fastText\\train.txt', 'D:\\Datasets\\aclImdb\\fastText\\test.txt', 25, 1, 2)
# Save the model to current directory
ft_model.save_model('imdb_comments_ft.bin')

From the test output, we can see that the accuracy is around 0.89.

## 6. Model Saving

From the comparison above, the bigram logistic regression model has the best performance. So let's add a save statement to its training code and then save the model.

In [12]:
from joblib import dump

def logistic_regression_count_bigram_save(train, test, path):
    """Train, evaluate and save a bigram logistic regression classifier.

    Fits a ``CountVectorizer(ngram_range=(1,2))`` -> ``LogisticRegression``
    pipeline on the training set, predicts the test set, prints the
    classification report and dumps the fitted pipeline with joblib.

    :param train: training set. pandas Dataframe with 'comment' and
        'attitude' columns.
    :param test: test set. pandas Dataframe with the same columns.
    :param path: model save path. str. None for don't save.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Logistic Regression model with biigram CountVectorize...')
    # Start up a Pipeline
    pipe = Pipeline([
        ('vec', CountVectorizer(ngram_range=(1,2))),
        ('log', LogisticRegression())
    ])
    # Train the model on the raw documents and their labels.
    pipe.fit(train['comment'], train['attitude'])
    # Do prediction on the held-out documents.
    y_pred = pipe.predict(test['comment'])
    # Get report.
    print(classification_report(test['attitude'], y_pred))
    # Honour the documented "None for don't save" contract; previously the
    # dump was attempted even when no path was given.
    if path is not None:
        dump(pipe, path)
    return pipe

# Retrain the bigram model on the existing train/test split and write the
# fitted pipeline to 'model.joblib' (relative to the working directory).
logistic_regression_count_bigram_save(train, test, 'model.joblib')