# 1. Baseline model

In [1]:
# Imports
import pandas as pd

from sklearn.ensemble import RandomForestClassifier

# this class builds a pipeline given a transformer and a classifier
from get_pipeline import BuildPipeline

# project helpers: split X / y, collect and save the scores to a json file, save the fitted model
from helpers import separate_X_y, write_to_json, get_scores_dict, save_model

## Read data

In [2]:
# Load the train and test splits from CSV in one unpacking assignment.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

## Separate `X` and `y`

In [3]:
# Split each frame into X and y; 'title' and 'is_worldnews' are presumably the
# feature and target columns (argument order as expected by separate_X_y).
(X_train, y_train), (X_test, y_test) = (
    separate_X_y(frame, 'title', 'is_worldnews')
    for frame in (train, test)
)

## CountVectorizer + RandomForest

In [4]:
# CountVectorizer ('cvec') features feeding a RandomForestClassifier.
bp = BuildPipeline(X_train, y_train)
bp.get_pipeline(
    'cvec',
    RandomForestClassifier,
    n_jobs=-1,
)
# Fit the pipeline through grid_search() called with its default arguments.
# The trailing ';' keeps the call's return value out of the cell output.
bp.grid_search();

In [5]:
# Score the pipeline held by `bp` on the training data; the bare name on the
# last line echoes the value as the cell output.
train_acc = bp.pipe.score(X_train, y_train)
train_acc

0.9910814419225634

In [6]:
# Score the same pipeline on the held-out test data and echo the value.
test_acc = bp.pipe.score(X_test, y_test)
test_acc

0.8535725728933035

In [7]:
# Save the CountVectorizer + RandomForest scores under the label "cvec_rf".
# NOTE(review): output path and record layout are decided inside helpers — not visible here.
write_to_json(get_scores_dict("cvec_rf", train_acc, test_acc))

## TfidfVectorizer + RandomForest

In [8]:
# TfidfVectorizer ('tvec') features feeding a RandomForestClassifier.
# Rebinding `bp` replaces the CountVectorizer pipeline built earlier.
bp = BuildPipeline(X_train, y_train)
bp.get_pipeline(
    'tvec',
    RandomForestClassifier,
    n_jobs=-1,
)
# Fit the pipeline through grid_search() called with its default arguments.
# The trailing ';' keeps the call's return value out of the cell output.
bp.grid_search();

In [9]:
# Score the tvec pipeline on the training data. This overwrites the earlier
# `train_acc`, and the value is stored without being echoed.
train_acc = bp.pipe.score(X_train, y_train)

In [10]:
# Score the tvec pipeline on the test data. This overwrites the earlier
# `test_acc`, and the value is stored without being echoed.
test_acc = bp.pipe.score(X_test, y_test)

In [11]:
# Save the TfidfVectorizer + RandomForest scores under the label "tvec_rf".
# NOTE(review): output path and record layout are decided inside helpers — not visible here.
write_to_json(get_scores_dict("tvec_rf", train_acc, test_acc))

---


# 2. Train on lemmatized data

In [2]:
# Load the lemmatized train and test splits in one unpacking assignment.
train_lm, test_lm = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_lemmatized.csv", "../data/test_lemmatized.csv")
)

In [3]:
# Split each lemmatized frame into X and y; 'new_title' and 'is_worldnews' are
# presumably the feature and target columns (argument order as expected by separate_X_y).
(X_train_lm, y_train), (X_test_lm, y_test) = (
    separate_X_y(frame, 'new_title', 'is_worldnews')
    for frame in (train_lm, test_lm)
)

In [6]:
%%time
# fit pipeline
bp = BuildPipeline(X_train_lm, y_train)
bp.get_pipeline('cvec', RandomForestClassifier, n_jobs=-1)

param_grid = {
    'cvec__ngram_range': [(1,1), (1,2)],
    'cvec__min_df': [0, 0.01, 0.05, 0.1], 
    'cvec__max_df': [0.9, 0.95],
    'rfc__max_depth': [None, *range(1, 5)],
    'rfc__n_estimators': [100, 300, 500]
}

gs = bp.grid_search(param_grid, cv=10, random=True, n_iter=50, n_jobs=-1)

CPU times: total: 31min 3s
Wall time: 2h 5min 9s


In [7]:
# Show the parameter combination selected by the search.
gs.best_params_

{'rfc__n_estimators': 100,
 'rfc__max_depth': None,
 'cvec__ngram_range': (1, 2),
 'cvec__min_df': 0,
 'cvec__max_df': 0.95}

In [8]:
# Score the search result on the lemmatized training data and echo the value.
train_acc_lm = gs.score(X_train_lm, y_train)
train_acc_lm

0.9908930669800236

In [9]:
# Score the search result on the lemmatized test data and echo the value.
test_acc_lm = gs.score(X_test_lm, y_test)
test_acc_lm

0.8561128024355071

In [14]:
# Save the lemmatized CountVectorizer + RandomForest scores under the label "lm_cvec_rf".
# NOTE(review): output path and record layout are decided inside helpers — not visible here.
write_to_json(get_scores_dict("lm_cvec_rf", train_acc_lm, test_acc_lm))

In [11]:
# Persist the search object `gs` under the name 'random_forest'.
# NOTE(review): storage location and format are decided by helpers.save_model — not visible here.
save_model('random_forest', gs)