# Baseline model

In [1]:
# Imports
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier

# this class builds a pipeline given a transformer and a classifier
from get_pipeline import BuildPipeline

# helpers to split a frame into X and y, and to collect the scores and save them to a json file
from helpers import separate_X_y, write_to_json, get_scores_dict

## Read data

In [2]:
# Load the train and test CSVs in one pass; the generator reads them in
# order (train first, then test) and the results are unpacked into two frames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

## Separate `X` and `y`

In [3]:
# Apply the same feature/target split to both frames: 'title' is passed as
# the feature column and 'is_worldnews' as the target column.
(X_train, y_train), (X_test, y_test) = (
    separate_X_y(split, 'title', 'is_worldnews')
    for split in (train, test)
)

## CountVectorizer + GradientBoostingClassifier

In [4]:
# Hand the training features and labels to the project's pipeline builder.
bp = BuildPipeline(X_train, y_train)
# build pipeline: 'cvec' is the transformer key (CountVectorizer, per this
# section's heading); the classifier is passed as a class, not an instance
bp.get_pipeline('cvec', GradientBoostingClassifier);
# fit pipeline: no search settings are passed here, so they presumably live
# inside BuildPipeline — confirm there. The trailing `;` hides the cell output.
bp.grid_search();

In [5]:
# Score the pipeline held by the builder on the training data
# (presumably accuracy, going by the variable name — confirm what
# `bp.pipe.score` returns).
train_acc = bp.pipe.score(X_train, y_train)
# bare last expression so the notebook displays the value
train_acc

0.7655273698264352

In [6]:
# Score the same pipeline on the test data (presumably accuracy, going by
# the variable name — confirm what `bp.pipe.score` returns).
test_acc = bp.pipe.score(X_test, y_test)
# bare last expression so the notebook displays the value
test_acc

0.768423582185197

In [7]:
# Bundle the current train/test scores under the "cvec_gb" label and save
# them with the project helper; no path is passed here, so the output
# location is decided inside `write_to_json`.
write_to_json(get_scores_dict("cvec_gb", train_acc, test_acc))

## TfidfVectorizer + GradientBoostingClassifier

In [8]:
# Create a fresh builder on the same training data. This rebinds `bp`, so the
# CountVectorizer builder from the previous section is no longer reachable.
bp = BuildPipeline(X_train, y_train)
# build pipeline: 'tvec' is the transformer key (TfidfVectorizer, per this
# section's heading); the classifier is passed as a class, not an instance
bp.get_pipeline('tvec', GradientBoostingClassifier);
# fit pipeline: no search settings are passed here, so they presumably live
# inside BuildPipeline — confirm there. The trailing `;` hides the cell output.
bp.grid_search();

In [9]:
# Score the TfidfVectorizer pipeline on the training data. This overwrites
# the `train_acc` value from the CountVectorizer section, which was already
# passed to `write_to_json` before this point.
train_acc = bp.pipe.score(X_train, y_train)
# bare last expression so the notebook displays the value
train_acc

0.7658744993324432

In [10]:
# Score the TfidfVectorizer pipeline on the test data. This overwrites the
# `test_acc` value from the CountVectorizer section.
test_acc = bp.pipe.score(X_test, y_test)
# bare last expression so the notebook displays the value
test_acc

0.7673822492790772

In [11]:
# Bundle the current train/test scores under the "tvec_gb" label and save
# them with the project helper; no path is passed here, so the output
# location is decided inside `write_to_json`.
write_to_json(get_scores_dict("tvec_gb", train_acc, test_acc))