In [1]:
# Yongqing LIAO 1155161159

import glob
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

from joblib import dump

from pre_process import split

def logistic_regression_count_bigram(train, test):
    """Train a logistic regression classifier with count vectorizer.
    :param training set. pandas Dataframe.
    :param test set. pandas Dataframe.
    :param model save path. str. None for don't save.
    :return sklearn model.
    """
    print('Training Logistic Regression model with bigram CountVectorize...')
    # Extract documents and labels.
    docs_train = train['text']
    labels_train = train['label']
    docs_test = test['text']
    labels_test = test['label']
    # Start up a Pipeline
    pipe = Pipeline([
        ('vec', CountVectorizer(ngram_range=(1,2))),
        ('log', LogisticRegression())
    ])
    # Train the model.
    pipe.fit(docs_train, labels_train)
    # Do prediction.
    y_pred = pipe.predict(docs_test)
    # Get report.
    print(classification_report(labels_test, y_pred))
    dump(pipe, "count_model.pkl")

def logistic_regression_tfidf_bigram(train, test):
    """Train a logistic regression classifier with count vectorizer.
    :param training set. pandas Dataframe.
    :param test set. pandas Dataframe.
    :param model save path. str. None for don't save.
    :return sklearn model.
    """
    print('Training Logistic Regression model with bigram TfidfVectorizer...')
    # Extract documents and labels.
    docs_train = train['text']
    labels_train = train['label']
    docs_test = test['text']
    labels_test = test['label']
    # Start up a Pipeline
    pipe = Pipeline([
        ('vec', TfidfVectorizer(ngram_range=(1,2))),
        ('log', LogisticRegression())
    ])
    # Train the model.
    pipe.fit(docs_train, labels_train)
    # Do prediction.
    y_pred = pipe.predict(docs_test)
    # Get report.
    print(classification_report(labels_test, y_pred))
    dump(pipe, "tfidf_model.pkl")

if __name__ == '__main__':
    train, test = split('G:\\machine learning\\Assign2\\', True, 'G:\\machine learning\\Assign2\\')
    logistic_regression_count_bigram(train, test)
    logistic_regression_tfidf_bigram(train, test)

Training Logistic Regression model with bigram CountVectorize...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

        FAKE       0.91      0.93      0.92       627
        REAL       0.93      0.91      0.92       640

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267

Training Logistic Regression model with bigram TfidfVectorizer...
              precision    recall  f1-score   support

        FAKE       0.89      0.91      0.90       627
        REAL       0.91      0.89      0.90       640

    accuracy                           0.90      1267
   macro avg       0.90      0.90      0.90      1267
weighted avg       0.90      0.90      0.90      1267

