# 1. Baseline model

In [1]:
# Imports
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

# this class builds a pipeline given a transformer and a classifier
from get_pipeline import BuildPipeline

# save the scores to a json file
from helpers import separate_X_y, write_to_json, get_scores_dict, save_model

## Read data

In [19]:
# Load the train and test splits from CSV (paths are relative to the notebook's folder)
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [20]:
# (rows, columns) of the training frame
train.shape

(37450, 5)

In [21]:
# (rows, columns) of the test frame
test.shape

(12484, 5)

## Baseline

In [5]:
# Share of the most frequent 'is_worldnews' class in train, i.e. the accuracy
# obtained by always predicting that class
train['is_worldnews'].value_counts(normalize=True).max()

0.5007476635514019

In [6]:
# Relative frequency of each 'is_worldnews' class in the training split
class_shares = train['is_worldnews'].value_counts(normalize=True)

# The baseline score is the share of the most frequent class
scores = {"Baseline": class_shares.max()}

# Persist the baseline via the project helper
write_to_json(scores)

## Separate `X` and `y`

In [22]:
# Split each frame into inputs and labels with the project helper.
# Assumes the arguments are (frame, feature column, target column), judging by
# the X/y unpacking — TODO confirm in helpers.py
X_train, y_train = separate_X_y(train, 'title', 'is_worldnews')
X_test, y_test = separate_X_y(test, 'title', 'is_worldnews')

## CountVectorizer + Logistic Regression

In [8]:
# Hand the training data to the project's pipeline builder
bp = BuildPipeline(X_train, y_train)
# build pipeline: 'cvec' selects CountVectorizer; max_iter=1000 is presumably
# forwarded to LogisticRegression — TODO confirm in get_pipeline.py
bp.get_pipeline('cvec', LogisticRegression, max_iter=1000);
# fit pipeline (called without a parameter grid; presumably nothing is tuned
# here — verify against BuildPipeline.grid_search)
bp.grid_search();

In [9]:
# Score the fitted pipeline on the data it was trained on
# (assumes bp.pipe is a scikit-learn estimator whose .score is accuracy — confirm)
train_acc = bp.pipe.score(X_train, y_train)
train_acc

0.9454472630173565

In [10]:
# Score the same fitted pipeline on the held-out test split
test_acc = bp.pipe.score(X_test, y_test)
test_acc

0.8588593399551426

In [11]:
# Record the train/test scores under the key "cvec_lr" via the project helpers
write_to_json(get_scores_dict("cvec_lr", train_acc, test_acc))

## TfidfVectorizer + Logistic Regression

In [12]:
# Fresh builder on the same training data (rebinds `bp` from the previous section)
bp = BuildPipeline(X_train, y_train)
# build pipeline: 'tvec' selects the TF-IDF vectoriser named in this section's heading.
# NOTE(review): unlike the CountVectorizer cell, no max_iter is passed here, so
# LogisticRegression keeps its default — confirm this difference is intended.
bp.get_pipeline('tvec', LogisticRegression);
# fit pipeline (called without a parameter grid)
bp.grid_search();

In [13]:
# Score the TF-IDF pipeline on the training split (overwrites the earlier train_acc)
train_acc = bp.pipe.score(X_train, y_train)

In [14]:
# Score the TF-IDF pipeline on the test split (overwrites the earlier test_acc)
test_acc = bp.pipe.score(X_test, y_test)

In [15]:
# Record the train/test scores under the key "tvec_lr" via the project helpers
write_to_json(get_scores_dict("tvec_lr", train_acc, test_acc))

# 2. Grid Search

In [23]:
# New builder for the hyper-parameter search on the raw titles
bp = BuildPipeline(X_train, y_train)
# build pipeline (CountVectorizer + LogisticRegression, no extra keyword arguments)
bp.get_pipeline('cvec', LogisticRegression);
# List the tunable parameter names; the `<step>__<param>` keys shown here are
# the ones a search grid must use
bp.get_params()

{'memory': None,
 'steps': [('cvec', CountVectorizer()), ('lr', LogisticRegression())],
 'verbose': False,
 'cvec': CountVectorizer(),
 'lr': LogisticRegression(),
 'cvec__analyzer': 'word',
 'cvec__binary': False,
 'cvec__decode_error': 'strict',
 'cvec__dtype': numpy.int64,
 'cvec__encoding': 'utf-8',
 'cvec__input': 'content',
 'cvec__lowercase': True,
 'cvec__max_df': 1.0,
 'cvec__max_features': None,
 'cvec__min_df': 1,
 'cvec__ngram_range': (1, 1),
 'cvec__preprocessor': None,
 'cvec__stop_words': None,
 'cvec__strip_accents': None,
 'cvec__token_pattern': '(?u)\\b\\w\\w+\\b',
 'cvec__tokenizer': None,
 'cvec__vocabulary': None,
 'lr__C': 1.0,
 'lr__class_weight': None,
 'lr__dual': False,
 'lr__fit_intercept': True,
 'lr__intercept_scaling': 1,
 'lr__l1_ratio': None,
 'lr__max_iter': 100,
 'lr__multi_class': 'auto',
 'lr__n_jobs': None,
 'lr__penalty': 'l2',
 'lr__random_state': None,
 'lr__solver': 'lbfgs',
 'lr__tol': 0.0001,
 'lr__verbose': 0,
 'lr__warm_start': False}

In [24]:
%%time
# fit pipeline: search over vectoriser and classifier settings.
# (%%time must stay the first line of the cell.)


# 2 * 3 * 2 * 2 * 50 = 1200 possible combinations in total
param_grid = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__min_df': [1, 2, 3], 
    'cvec__max_df': [0.9, 0.95],
    'lr__penalty': ['l1', 'l2'],
    # single solver for both penalties
    'lr__solver': ['liblinear'],
    'lr__max_iter': [1000],
    # 50 values of C, log-spaced from 1e-4 to 1e1
    'lr__C': np.logspace(-4, 1, 50)
}

# random=True with n_iter=50 presumably samples 50 candidates instead of trying
# the full grid, each evaluated with 10-fold CV — TODO confirm in get_pipeline.py.
# NOTE(review): no seed is passed here, so the sampled candidates may differ
# between runs unless grid_search fixes one internally.
gs = bp.grid_search(param_grid, cv=10, random=True, n_iter=50)

CPU times: total: 21min 10s
Wall time: 18min 9s


In [25]:
# Parameter combination selected by the search
gs.best_params_

{'lr__solver': 'liblinear',
 'lr__penalty': 'l2',
 'lr__max_iter': 1000,
 'lr__C': 1.5264179671752334,
 'cvec__ngram_range': (1, 2),
 'cvec__min_df': 1,
 'cvec__max_df': 0.9}

In [26]:
# Score of the search object on the training split
# (assumes gs.score delegates to the refit best estimator — confirm)
train_acc = gs.score(X_train, y_train)
train_acc

0.9899599465954606

In [27]:
# Score of the search object on the held-out test split
test_acc = gs.score(X_test, y_test)
test_acc

0.8671900032041012

### Grid Search: Tfidf

In [28]:
# New builder for the TF-IDF hyper-parameter search on the raw titles
bp = BuildPipeline(X_train, y_train)
# build pipeline: 'tvec' vectoriser + LogisticRegression; max_iter=1000 is
# presumably forwarded to the classifier — TODO confirm in get_pipeline.py
bp.get_pipeline('tvec', LogisticRegression, max_iter=1000);

In [None]:
%%time
# fit pipeline


param_grid = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__min_df': [1, 2, 3], 
    'cvec__max_df': [0.9, 0.95],
    'lr__penalty': ['l1', 'l2'],
    'lr__solver': ['liblinear'],
    'lr__max_iter': [1000],
    'lr__C': np.logspace(-4, 1, 50)
}

gs = bp.grid_search(param_grid, cv=10, random=True, n_iter=50)

In [None]:
# Parameter combination selected by the TF-IDF search
gs.best_params_

In [None]:
# Score of the TF-IDF search object on the training split
train_acc = gs.score(X_train, y_train)
train_acc

In [None]:
# Score of the TF-IDF search object on the held-out test split
test_acc = gs.score(X_test, y_test)
test_acc

# 3. Train on lemmatized data

In [5]:
# Load the lemmatized versions of the train and test splits
train_lm = pd.read_csv("../data/train_lemmatized.csv")
test_lm = pd.read_csv("../data/test_lemmatized.csv")

In [6]:
# Split the lemmatized frames into inputs ('new_title') and labels.
# NOTE(review): this rebinds y_train / y_test, replacing the labels taken from
# the raw-title frames earlier in the notebook; re-running an earlier cell
# afterwards would pair raw X with these labels.
X_train_lm, y_train = separate_X_y(train_lm, 'new_title', 'is_worldnews')
X_test_lm, y_test = separate_X_y(test_lm, 'new_title', 'is_worldnews')

In [8]:
%%time
# fit pipeline: CountVectorizer + LogisticRegression search on the lemmatized titles.
# (%%time must stay the first line of the cell.)
bp = BuildPipeline(X_train_lm, y_train)
bp.get_pipeline('cvec', LogisticRegression)

# Same search space as the raw-title CountVectorizer search
param_grid = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__min_df': [1, 2, 3], 
    'cvec__max_df': [0.9, 0.95],
    'lr__penalty': ['l1', 'l2'],
    'lr__solver': ['liblinear'],
    'lr__max_iter': [1000],
    # 50 values of C, log-spaced from 1e-4 to 1e1
    'lr__C': np.logspace(-4, 1, 50)
}

# random=True with n_iter=50 presumably samples 50 candidates, each evaluated
# with 10-fold CV — TODO confirm in get_pipeline.py.
# NOTE(review): no seed is passed here, so the sampled candidates may differ
# between runs unless grid_search fixes one internally.
gs = bp.grid_search(param_grid, cv=10, random=True, n_iter=50)



CPU times: total: 38min 39s
Wall time: 39min 16s


In [10]:
# Parameter combination selected by the search on lemmatized titles
gs.best_params_

{'lr__solver': 'liblinear',
 'lr__penalty': 'l2',
 'lr__max_iter': 1000,
 'lr__C': 1.5264179671752334,
 'cvec__ngram_range': (1, 2),
 'cvec__min_df': 1,
 'cvec__max_df': 0.95}

In [11]:
# Score of the search object on the lemmatized training split
train_acc_lm = gs.score(X_train_lm, y_train)
train_acc_lm

0.9889434889434889

In [12]:
# Score of the search object on the lemmatized test split
test_acc_lm = gs.score(X_test_lm, y_test)
test_acc_lm

0.8675692997917

In [16]:
# Record the lemmatized-data scores under the key "lm_cvec_lr"
write_to_json(get_scores_dict("lm_cvec_lr", train_acc_lm, test_acc_lm))

In [15]:
# Persist the current search object (fitted on the lemmatized titles) under the
# name "logreg"; storage location and format are decided by helpers.save_model
save_model("logreg", gs)