# Libraries

In [11]:
# Data-handling libraries
import pandas as pd

# Text-handling libraries
from nltk.corpus import stopwords

# Reshaping imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Modeling imports
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC

# Other imports
import matplotlib.pyplot as plt

# Options
# Shared random seed; passed as `random_state` to train_test_split so the
# train/test partition can be reproduced from run to run.
seed = 5777

# Model data

## Import data

In [2]:
# Turn off pandas' automatic NA detection (na_filter=False): with it enabled,
# a dish literally named "nan" (naan bread) would be read as a missing value
# instead of as text.
df = pd.read_csv("data/Dishes-by-year.csv.bz2", na_filter=False)

## Split data

In [48]:
# Features come from the `name` column; the target is the `decade` column.
X, y = df["name"], df["decade"]

# Stratify on the target so each decade keeps its share in both partitions;
# the shared seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=seed,
)

In [51]:
# Exploratory pass: fit a default TF-IDF vectorizer on the training names only
# and keep the resulting document-term matrix for inspection.
v = TfidfVectorizer()
words = v.fit_transform(X_train)

In [54]:
# Dimensions of the TF-IDF matrix: (training documents, vocabulary terms)
words.shape

(955070, 51876)

## Define baseline

In [47]:
# Baseline: relative frequency of each decade across the whole dataset. The
# share of the most common decade is the accuracy obtained by always guessing
# that decade, i.e. the floor a model has to beat.
y.value_counts(normalize=True)

1900    0.316122
1910    0.298786
1930    0.096658
1950    0.064290
1940    0.058035
1960    0.049328
1890    0.044186
1970    0.019835
1920    0.017144
1980    0.016135
1880    0.010011
1990    0.003024
1860    0.002271
1850    0.001762
2000    0.001164
2010    0.001112
1870    0.000137
Name: decade, dtype: float64

## Options for all models

In [71]:
# Cross-validation fold count and log verbosity shared by every grid search
cv_folds = 3
verbosity = 2

# Common TF-IDF step: drop English stop words and fold accented characters to ASCII
tfidf = TfidfVectorizer(stop_words="english", strip_accents="ascii")

## Naïve Bayes classifier

In [66]:
# Multinomial naive Bayes on top of the shared TF-IDF step
pipe1 = Pipeline(steps=[
    ("tfidf", tfidf),
    ("mbayes", MultinomialNB()),
])

# Only the vocabulary cap is tuned for this model
grid1 = GridSearchCV(
    estimator=pipe1,
    param_grid={"tfidf__max_features": [10_000, 50_000]},
    cv=cv_folds,
    verbose=verbosity,
)

In [67]:
# Run the naive Bayes grid search on the training split, then report the best
# mean cross-validated score together with the parameters that produced it.
grid1.fit(X_train, y_train)
print(grid1.best_score_, "with", grid1.best_params_)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] tfidf__max_features=10000 .......................................
[CV]  tfidf__max_features=10000, score=0.45199317755629614, total=  10.3s
[CV] tfidf__max_features=10000 .......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.0s remaining:    0.0s


[CV]  tfidf__max_features=10000, score=0.4509263156572161, total=  10.0s
[CV] tfidf__max_features=10000 .......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   34.2s remaining:    0.0s


[CV]  tfidf__max_features=10000, score=0.45201051669383796, total=  10.5s
[CV] tfidf__max_features=50000 .......................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   50.8s remaining:    0.0s


[CV]  tfidf__max_features=50000, score=0.45217221958719817, total=  10.0s
[CV] tfidf__max_features=50000 .......................................
[CV]  tfidf__max_features=50000, score=0.45018815295987535, total=  12.0s
[CV] tfidf__max_features=50000 .......................................
[CV]  tfidf__max_features=50000, score=0.4509833485680899, total=   9.4s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.7min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...,
        vocabulary=None)), ('mbayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tfidf__max_features': [10000, 50000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=4)

## Decision tree classifier

In [72]:
# Decision tree on top of the shared TF-IDF step.
# The tree is seeded with the notebook-wide `seed`: scikit-learn permutes the
# candidate features at every split, so an unseeded tree can give different
# scores on each run.
pipe2 = Pipeline([
        ("tfidf", tfidf),
        ("tree", DecisionTreeClassifier(random_state=seed))
    ])

# 3 x 3 x 3 = 27 candidates: vocabulary cap, tree depth and minimum split size
grid2 = GridSearchCV(pipe2, {
    "tfidf__max_features": [100, 1000, 10_000],
    "tree__max_depth": [10, 50, 100],
    "tree__min_samples_split": [10, 50, 100],
}, cv=cv_folds, verbose=verbosity)

In [73]:
# Run the decision-tree grid search and report the best CV score and settings.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# grid2 may be left unfitted; later cells that use grid2 need a completed run.
grid2.fit(X_train, y_train)
print(grid2.best_score_, "with", grid2.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=10 
[CV]  tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=10, total=  12.4s
[CV] tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.5s remaining:    0.0s


[CV]  tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=10, total=  13.8s
[CV] tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=10 
[CV]  tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=10, total=  12.5s
[CV] tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=50 
[CV]  tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=50, total=  13.0s
[CV] tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=50 
[CV]  tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=50, total=  13.7s
[CV] tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=50 
[CV]  tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=50, total=  14.0s
[CV] tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=100 
[CV]  tfidf__max_features=100, tree__max_depth=10, tree__min_samples_split=100, total=  13.4s
[CV] tfidf__max_features=100, tree__max_depth=10, 

KeyboardInterrupt: 

## Random forest classifier

In [9]:
# Random forest on top of the shared TF-IDF step.
# `cv_folds` and `verbosity` come from the shared options cell; they are no
# longer redefined here, so every grid search uses the same settings.
# The forest is seeded with the notebook-wide `seed` because bootstrap sampling
# and feature selection are random and would otherwise change between runs.
pipe3 = Pipeline([
        ("tfidf", tfidf),
        ("rf", RandomForestClassifier(random_state=seed))
    ])

# 3 x 2 = 6 candidates: vocabulary cap and number of trees
grid3 = GridSearchCV(pipe3, {
    "tfidf__max_features": [100, 1000, 10_000],
    "rf__n_estimators": [5, 10],
}, cv=cv_folds, verbose=verbosity)

In [10]:
# Run the random-forest grid search and report the best CV score and settings.
# NOTE(review): the saved output for this cell lists tfidf__max_df / tfidf__min_df
# settings that the current parameter grid does not contain; it looks stale and
# the cell should be re-run.
grid3.fit(X_train, y_train)
print(grid3.best_score_, "with", grid3.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__min_df=0.05 
[CV]  rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__min_df=0.05, score=0.3228253460693992, total=  10.4s
[CV] rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__min_df=0.05 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.2s remaining:    0.0s


[CV]  rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__min_df=0.05, score=0.3221593300623826, total=  10.5s
[CV] rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__min_df=0.05 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   36.4s remaining:    0.0s


[CV]  rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__min_df=0.05, score=0.3231810171791513, total=  14.2s
[CV] rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=20000, tfidf__min_df=0.05 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   59.4s remaining:    0.0s


[CV]  rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=20000, tfidf__min_df=0.05, score=0.3228253460693992, total=  10.9s
[CV] rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=20000, tfidf__min_df=0.05 
[CV]  rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=20000, tfidf__min_df=0.05, score=0.3221593300623826, total=  10.9s
[CV] rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=20000, tfidf__min_df=0.05 
[CV]  rf__n_estimators=5, tfidf__max_df=0.95, tfidf__max_features=20000, tfidf__min_df=0.05, score=0.3231810171791513, total=  10.7s
[CV] rf__n_estimators=10, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__min_df=0.05 
[CV]  rf__n_estimators=10, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__min_df=0.05, score=0.3228253460693992, total=  13.3s
[CV] rf__n_estimators=10, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__min_df=0.05 
[CV]  rf__n_estimators=10, tfidf__max_df=0.95, tfidf__max_features=10000, tfidf__min_df=0.05, sco

[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  4.0min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tfidf__max_features': [10000, 20000], 'tfidf__min_df': [0.05], 'tfidf__max_df': [0.95], 'rf__n_estimators': [5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=4)

# Evaluate models

In [None]:
def predict_decade(term, models=None):
    """Plot each fitted model's predicted decade probabilities for one dish name.

    One bar chart is drawn per model, side by side, with decades on the x-axis
    and the predicted probability of each decade on the y-axis.

    Parameters
    ----------
    term : str
        Dish name to classify, e.g. ``"tofu"``.
    models : dict, optional
        Mapping of panel title -> fitted classifier exposing ``predict_proba``
        and ``classes_``. When omitted, the notebook's three grid searches
        (``grid1``, ``grid2``, ``grid3``) are used.

    Returns
    -------
    None
        The figure is rendered as a side effect.
    """
    if models is None:
        # Looked up at call time so the most recently fitted searches are used.
        models = {
            "Mult. naïve Bayes": grid1,
            "Decision tree": grid2,
            "Random Forest": grid3,
        }

    n_panels = len(models)
    # 8 inches per panel reproduces the original 24x5 figure for three models;
    # squeeze=False keeps `axes` an array even for a single panel.
    fig, axes = plt.subplots(1, n_panels, figsize=(8 * n_panels, 5), squeeze=False)

    for (name, model), axis in zip(models.items(), axes.ravel()):
        # predict_proba columns follow model.classes_, so take the x positions
        # from there instead of assuming a fixed 1850-2010 decade range.
        decades = model.classes_
        probabilities = model.predict_proba([term])[0]

        axis.set_title(f"{name} prediction for \"{term}\"", fontsize=20)
        # Decades are 10 apart; the default bar width (0.8) would draw slivers.
        axis.bar(decades, probabilities, width=8)
        axis.set_xlabel("Decade")
        axis.set_ylabel("Predicted probability")

# Compare the fitted models on a couple of example dish names
for example_term in ("jell-o", "tofu"):
    predict_decade(example_term)