# IEMS5780 - Assignment 1

> 1155130306 Junru Zhong 鍾鈞儒
>
> Last modified Oct 6, 2019

In [1]:
# Imports
import glob
import os

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

## 1. Data Preparation

The following function combines the original train and test folders and then splits the pooled data again. It returns a tuple of pandas DataFrames, which correspond to the training set and the test set.

Because I am working on a Windows machine, the paths are written with backslashes ('\'). If you are using a Unix machine, please change them to forward slashes ('/').

In [12]:
def combine(dataset_path, is_shuffle=False, save_path=None, random_state=None):
    """Combine the train and test dataset, then split it again 70/30.

    Every ``*.txt`` file under ``<dataset_path>/{train,test}/{pos,neg}`` is
    read; files from ``pos`` folders are labelled 1 and files from ``neg``
    folders are labelled 0. The pooled rows are then split into a new
    training set (70%) and test set (30%).

    :param: dataset_path: str, root folder of the dataset. Works with or
        without a trailing path separator, on Windows and Unix.
    :param: is_shuffle: boolean, shuffle the pooled rows before splitting.
    :param: save_path: str, prefix prepended to 'train.csv' / 'test.csv'.
        None for don't save.
    :param: random_state: int or None, seed forwarded to the shuffle and to
        the split. None keeps the previous (non-reproducible) behaviour.
    :return: (training_dataframe, test_dataframe): tuple
    :raises ValueError: if no .txt file is found under dataset_path.
    """
    print('Date pre-processing...')
    data = []
    # Same row order as before: train/pos, test/pos, train/neg, test/neg.
    for sentiment, label in (('pos', 1), ('neg', 0)):
        for subset in ('train', 'test'):
            # os.path.join picks the right separator for the current OS.
            pattern = os.path.join(dataset_path, subset, sentiment, '*.txt')
            # glob order is unspecified; sort so a fixed seed is reproducible.
            for filename in sorted(glob.glob(pattern)):
                with open(filename, 'r', encoding='utf8') as f:
                    data.append([f.read().strip(), label])

    if not data:
        # Fail early with a readable message instead of an opaque error
        # from the splitter when the path is wrong.
        raise ValueError('No .txt files found under %r' % (dataset_path,))

    # Load datalist into DataFrame
    df = pd.DataFrame(data, columns=['comment', 'attitude'])
    # Shuffle
    if is_shuffle:
        df = df.sample(frac=1, random_state=random_state)
    # Split the dataset
    df_train, df_test = train_test_split(
        df, test_size=0.3, random_state=random_state)
    # Save DataFrame to csv file. Passing the path lets pandas open the file
    # itself with the correct newline handling (a plain text handle opened
    # without newline='' produces blank rows on Windows).
    if save_path is not None:
        df_train.to_csv(save_path + 'train.csv', encoding='utf8')
        df_test.to_csv(save_path + 'test.csv', encoding='utf8')
    # Return the dataframe.
    return df_train, df_test

Call the function above to get the DataFrame variables. It may take a while to run, depending on your disk speed.

In [3]:
# Build the train/test split. Replace the path below with the location of
# your copy of the dataset; save_path=None means no CSV files are written.
train, test = combine('E:\\Datasets\\aclImdb\\', is_shuffle=True, save_path=None)

## 2. Using a Naive Bayes Classification

In this section, two pipelines are built: each one vectorizes the comments (one with `CountVectorizer`, the other with `TfidfVectorizer`) and then trains a Naive Bayes classifier on the resulting features.

In [7]:
def naive_bayes_count(train, test, save_path=None):
    """Train and evaluate a Naive Bayes classifier with count vectorizer.

    Fits a ``CountVectorizer`` -> ``MultinomialNB`` pipeline on the training
    comments, predicts the test comments and prints a classification report.

    :param train: training set with 'comment' and 'attitude' columns.
    :param test: test set with 'comment' and 'attitude' columns.
    :param save_path: accepted for interface compatibility; this function
        does not write the model to disk.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Naive Bayes model with unigram CountVectorize...')
    # Vectorizer and classifier are chained so the vocabulary is learned
    # from the training documents only.
    pipe = Pipeline([
        ('vec', CountVectorizer()),
        ('nb', MultinomialNB())
    ])
    # Train the model.
    pipe.fit(train['comment'], train['attitude'])
    # Predict the held-out documents and report per-class metrics.
    y_pred = pipe.predict(test['comment'])
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

In [12]:
# Unigram counts + Naive Bayes, evaluated on the test split.
naive_bayes_count(train, test, save_path=None)

              precision    recall  f1-score   support

           0       0.83      0.88      0.85      7487
           1       0.87      0.82      0.84      7513

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



In [8]:
def naive_bayes_tfidf(train, test, save_path=None):
    """Train and evaluate a Naive Bayes classifier with Tf-Idf vectorizer.

    Fits a ``TfidfVectorizer`` -> ``MultinomialNB`` pipeline on the training
    comments, predicts the test comments and prints a classification report.

    :param train: training set with 'comment' and 'attitude' columns.
    :param test: test set with 'comment' and 'attitude' columns.
    :param save_path: accepted for interface compatibility; this function
        does not write the model to disk.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Naive Bayes model with unigram TfidfVectorize...')
    # Vectorizer and classifier are chained so the vocabulary and idf
    # weights are learned from the training documents only.
    pipe = Pipeline([
        ('vec', TfidfVectorizer()),
        ('nb', MultinomialNB())
    ])
    # Train the model.
    pipe.fit(train['comment'], train['attitude'])
    # Predict the held-out documents and report per-class metrics.
    y_pred = pipe.predict(test['comment'])
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

In [5]:
# Unigram Tf-Idf + Naive Bayes, evaluated on the test split.
naive_bayes_tfidf(train, test, save_path=None)

              precision    recall  f1-score   support

           0       0.84      0.89      0.87      7432
           1       0.89      0.84      0.86      7568

    accuracy                           0.86     15000
   macro avg       0.86      0.86      0.86     15000
weighted avg       0.86      0.86      0.86     15000



## 3. Using Logistic Regression

In this section, two pipelines are built: each one vectorizes the comments (one with `CountVectorizer`, the other with `TfidfVectorizer`) and then trains a logistic regression classifier on the resulting features.

In [9]:
def logistic_regression_count(train, test, save_path=None):
    """Train and evaluate a logistic regression classifier with count vectorizer.

    Fits a ``CountVectorizer`` -> ``LogisticRegression`` pipeline on the
    training comments, predicts the test comments and prints a
    classification report.

    :param train: training set with 'comment' and 'attitude' columns.
    :param test: test set with 'comment' and 'attitude' columns.
    :param save_path: accepted for interface compatibility; this function
        does not write the model to disk.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Logistic Regression model with unigram CountVectorize...')
    # Vectorizer and classifier are chained so the vocabulary is learned
    # from the training documents only.
    pipe = Pipeline([
        ('vec', CountVectorizer()),
        ('log', LogisticRegression())
    ])
    # Train the model.
    pipe.fit(train['comment'], train['attitude'])
    # Predict the held-out documents and report per-class metrics.
    y_pred = pipe.predict(test['comment'])
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

In [12]:
# Unigram counts + logistic regression, evaluated on the test split.
logistic_regression_count(train, test, save_path=None)



              precision    recall  f1-score   support

           0       0.89      0.88      0.88      7432
           1       0.88      0.89      0.89      7568

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000



In [10]:
def logistic_regression_tfidf(train, test, save_path=None):
    """Train and evaluate a logistic regression classifier with Tf-idf vectorizer.

    Fits a ``TfidfVectorizer`` -> ``LogisticRegression`` pipeline on the
    training comments, predicts the test comments and prints a
    classification report.

    :param train: training set with 'comment' and 'attitude' columns.
    :param test: test set with 'comment' and 'attitude' columns.
    :param save_path: accepted for interface compatibility; this function
        does not write the model to disk.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Logistic Regression model with unigram TfidfVectorize...')
    # Vectorizer and classifier are chained so the vocabulary and idf
    # weights are learned from the training documents only.
    pipe = Pipeline([
        ('vec', TfidfVectorizer()),
        ('log', LogisticRegression())
    ])
    # Train the model.
    pipe.fit(train['comment'], train['attitude'])
    # Predict the held-out documents and report per-class metrics.
    y_pred = pipe.predict(test['comment'])
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

In [15]:
# Unigram Tf-Idf + logistic regression, evaluated on the test split.
logistic_regression_tfidf(train, test, save_path=None)



              precision    recall  f1-score   support

           0       0.90      0.88      0.89      7432
           1       0.88      0.91      0.90      7568

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



## 4. Bi-Gram Models
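Repeat the experiments with bi-gram features. The vectorizers below use `ngram_range=(1, 2)`, so both unigrams and bigrams are included.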

### Naive Bayes Models

In [15]:
def naive_bayes_count_bigram(train, test, save_path=None):
    """Train and evaluate a Naive Bayes classifier with bigram count vectorizer.

    Fits a ``CountVectorizer(ngram_range=(1, 2))`` -> ``MultinomialNB``
    pipeline (unigram and bigram counts) on the training comments, predicts
    the test comments and prints a classification report.

    :param train: training set with 'comment' and 'attitude' columns.
    :param test: test set with 'comment' and 'attitude' columns.
    :param save_path: accepted for interface compatibility; this function
        does not write the model to disk.
    :return: the fitted sklearn Pipeline.
    """
    print('Training Naive Bayes model with bigram CountVectorize...')
    # ngram_range=(1, 2) keeps the unigrams and adds every adjacent word pair.
    pipe = Pipeline([
        ('vec', CountVectorizer(ngram_range=(1, 2))),
        ('nb', MultinomialNB())
    ])
    # Train the model.
    pipe.fit(train['comment'], train['attitude'])
    # Predict the held-out documents and report per-class metrics.
    y_pred = pipe.predict(test['comment'])
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

### Logistic Regression Models

In [16]:
def logistic_regression_count_bigram(train, test, save_path=None):
    """Train and evaluate a logistic regression classifier with bigram count vectorizer.

    Fits a ``CountVectorizer(ngram_range=(1, 2))`` -> ``LogisticRegression``
    pipeline (unigram and bigram counts) on the training comments, predicts
    the test comments and prints a classification report.

    :param train: training set with 'comment' and 'attitude' columns.
    :param test: test set with 'comment' and 'attitude' columns.
    :param save_path: accepted for interface compatibility; this function
        does not write the model to disk.
    :return: the fitted sklearn Pipeline.
    """
    # Progress message spelling fixed ('biigram' -> 'bigram') to match the
    # Naive Bayes bigram function.
    print('Training Logistic Regression model with bigram CountVectorize...')
    # ngram_range=(1, 2) keeps the unigrams and adds every adjacent word pair.
    pipe = Pipeline([
        ('vec', CountVectorizer(ngram_range=(1, 2))),
        ('log', LogisticRegression())
    ])
    # Train the model.
    pipe.fit(train['comment'], train['attitude'])
    # Predict the held-out documents and report per-class metrics.
    y_pred = pipe.predict(test['comment'])
    print(classification_report(test['attitude'], y_pred))
    # Hand the fitted model back, as the docstring has always promised.
    return pipe

In [17]:
# Data preprocessing. Replace the path below with the location of your
# copy of the dataset.
train, test = combine('E:\\Datasets\\aclImdb\\', True, None)
# Run all models on the same split. save_path is documented as "str, or None
# for don't save", so pass None rather than False.
naive_bayes_count(train, test, None)
naive_bayes_tfidf(train, test, None)
logistic_regression_count(train, test, None)
logistic_regression_tfidf(train, test, None)
naive_bayes_count_bigram(train, test, None)
logistic_regression_count_bigram(train, test, None)

Date pre-processing...
Training Naive Bayes model with unigram CountVectorize...
              precision    recall  f1-score   support

           0       0.83      0.88      0.85      7518
           1       0.87      0.81      0.84      7482

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000

Training Naive Bayes model with unigram TfidfVectorize...
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      7518
           1       0.88      0.84      0.86      7482

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.86     15000
weighted avg       0.87      0.87      0.86     15000

Training Logistic Regression model with unigram CountVectorize...




              precision    recall  f1-score   support

           0       0.90      0.89      0.89      7518
           1       0.89      0.90      0.89      7482

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000

Training Logistic Regression model with unigram TfidfVectorize...
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      7518
           1       0.89      0.91      0.90      7482

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000

Training Naive Bayes model with bigram CountVectorize...
              precision    recall  f1-score   support

           0       0.87      0.90      0.88      7518
           1       0.90      0.86      0.88      7482

    accuracy                           0.88     15000
   ma



              precision    recall  f1-score   support

           0       0.91      0.91      0.91      7518
           1       0.91      0.91      0.91      7482

    accuracy                           0.91     15000
   macro avg       0.91      0.91      0.91     15000
weighted avg       0.91      0.91      0.91     15000



## 5. fastText
Now train a fastText model on the movie comments.

In [18]:
# Import fastText module.
import fasttext