# Libaries

In [29]:
# Data-handling libraries
import pandas as pd

# Reshaping imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Modeling imports
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC

# Options
seed = 5777

# Model data

## Import data

In [None]:
# Don't "intelligently" (in BIG scare quotes) filter out NA values
# or else we end up dropping naan bread (spelled "nan"). :facepalm:
df = pd.read_csv("data/Dishes-by-year.csv.bz2", na_filter=False)

## Split data

In [26]:
X = df["name"]
y = df["decade"]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=seed)

## Define models

In [32]:
cv_folds = 3

pipe = Pipeline([
        ("cvec", CountVectorizer()),
        #("ss", StandardScaler()),
        ("bayes", MultinomialNB())
    ])
params = {
     "cvec__max_features": [100, 500, 1000],
     "cvec__ngram_range": [(1,1), (1,2), (1, 3), (2, 3), (1, 4)],
     "cvec__min_df": [0.01],
     "cvec__max_df": [0.99],
     #"cvec__stop_words": [None, stopwords.words("english")],
    #"rf__n_estimators": [5, 10],
}

grid = GridSearchCV(pipe, params, cv=cv_folds, verbose=3)

## Run models

In [33]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] cvec__max_df=0.99, cvec__max_features=100, cvec__min_df=0.01, cvec__ngram_range=(1, 1) 
[CV]  cvec__max_df=0.99, cvec__max_features=100, cvec__min_df=0.01, cvec__ngram_range=(1, 1), score=0.3100831440839545, total=   7.5s
[CV] cvec__max_df=0.99, cvec__max_features=100, cvec__min_df=0.01, cvec__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   12.8s remaining:    0.0s


[CV]  cvec__max_df=0.99, cvec__max_features=100, cvec__min_df=0.01, cvec__ngram_range=(1, 1), score=0.30884630244472433, total=   6.7s
[CV] cvec__max_df=0.99, cvec__max_features=100, cvec__min_df=0.01, cvec__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   24.5s remaining:    0.0s


[CV]  cvec__max_df=0.99, cvec__max_features=100, cvec__min_df=0.01, cvec__ngram_range=(1, 1), score=0.3098150451706937, total=   7.3s
[CV] cvec__max_df=0.99, cvec__max_features=100, cvec__min_df=0.01, cvec__ngram_range=(1, 2) 
[CV]  cvec__max_df=0.99, cvec__max_features=100, cvec__min_df=0.01, cvec__ngram_range=(1, 2), score=0.3095742909823064, total=  12.7s
[CV] cvec__max_df=0.99, cvec__max_features=100, cvec__min_df=0.01, cvec__ngram_range=(1, 2) 


KeyboardInterrupt: 

# Evaluate models