Take the work we did in the lessons further:

- What other types of models (i.e. different classification algorithms) could you use?
- How do the models compare when trained on term frequency data alone, instead of TF-IDF values?

In [26]:
import acquire
import prepare

from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

import nltk

from env import get_db_url

In [2]:
# Load the full `spam` table from the `spam_db` database; the connection URL comes from env.get_db_url.
df = pd.read_sql("SELECT * FROM spam", get_db_url("spam_db"))

In [24]:
# Preview the first rows of the raw spam data.
df.head()

Unnamed: 0,id,label,text
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Split the spam messages into train/validate/test with the project helper, using `label` as the target.
# (No preview call here: an expression that is not the last line of a cell displays nothing.)
train, validate, test = prepare.train_validate_test_split(df, target = 'label')

In [4]:
# TF-IDF features + logistic regression on the raw spam text.
# The vectorizer is fit on the training split only and then applied to the other splits.
tfidf = TfidfVectorizer()

X_train = tfidf.fit_transform(train['text'])
X_validate, X_test = tfidf.transform(validate['text']), tfidf.transform(test['text'])
y_train, y_validate, y_test = train['label'], validate['label'], test['label']

# One results frame per split; each starts with the true labels in `actual`.
train_results_tfidf = pd.DataFrame({'actual': y_train})
validate_results_tfidf = pd.DataFrame({'actual': y_validate})
test_results_tfidf = pd.DataFrame({'actual': y_test})

lm = LogisticRegression()
lm.fit(X_train, y_train)

train_results_tfidf['predicted_log_reg'] = lm.predict(X_train)
validate_results_tfidf['predicted_log_reg'] = lm.predict(X_validate)
# The test split is left unscored at this stage.

# Train metrics: accuracy, confusion matrix (rows = predicted, columns = actual), per-class report.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(train_results_tfidf['actual'], train_results_tfidf['predicted_log_reg'])
    ),
    '---',
    'Train Confusion Matrix',
    pd.crosstab(train_results_tfidf['predicted_log_reg'], train_results_tfidf['actual']),
    '---',
    classification_report(train_results_tfidf['actual'], train_results_tfidf['predicted_log_reg']),
    sep='\n',
)

# The same three metrics on the validate split.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(validate_results_tfidf['actual'], validate_results_tfidf['predicted_log_reg'])
    ),
    '---',
    'Validate Confusion Matrix',
    pd.crosstab(validate_results_tfidf['predicted_log_reg'], validate_results_tfidf['actual']),
    '---',
    classification_report(validate_results_tfidf['actual'], validate_results_tfidf['predicted_log_reg']),
    sep='\n',
)


Accuracy: 96.95%
---
Train Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                2700    94
spam                  1   324
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      2701
        spam       1.00      0.78      0.87       418

    accuracy                           0.97      3119
   macro avg       0.98      0.89      0.93      3119
weighted avg       0.97      0.97      0.97      3119

Accuracy: 95.67%
---
Validate Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                1158    58
spam                  0   122
---
              precision    recall  f1-score   support

         ham       0.95      1.00      0.98      1158
        spam       1.00      0.68      0.81       180

    accuracy                           0.96      1338
   macro avg       0.98      0.84      0.89      1338
weighted avg       0.96      0.96      0.95      1338



In [5]:
# Random forest on the TF-IDF features built in the previous cell.
# random_state pins the forest's internal randomness so repeated runs give the same metrics
# (same seed the model-comparison loop at the end of the notebook uses).
rf = RandomForestClassifier(max_depth = 40, random_state = 123).fit(X_train, y_train)

# Store the forest's predictions next to the logistic-regression ones in the same results frames.
train_results_tfidf['predicted_rf'] = rf.predict(X_train)
validate_results_tfidf['predicted_rf'] = rf.predict(X_validate)
# test_results_tfidf['predicted_rf'] = rf.predict(X_test)  # test split intentionally left unscored

In [6]:
# Train metrics for the random forest: accuracy, confusion matrix
# (rows = predicted, columns = actual) and the per-class report.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(train_results_tfidf['actual'], train_results_tfidf['predicted_rf'])
    ),
    '---',
    'Train Confusion Matrix',
    pd.crosstab(train_results_tfidf['predicted_rf'], train_results_tfidf['actual']),
    '---',
    classification_report(train_results_tfidf['actual'], train_results_tfidf['predicted_rf']),
    sep='\n',
)

# The same three metrics on the validate split.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(validate_results_tfidf['actual'], validate_results_tfidf['predicted_rf'])
    ),
    '---',
    'Validate Confusion Matrix',
    pd.crosstab(validate_results_tfidf['predicted_rf'], validate_results_tfidf['actual']),
    '---',
    classification_report(validate_results_tfidf['actual'], validate_results_tfidf['predicted_rf']),
    sep='\n',
)

Accuracy: 99.01%
---
Train Confusion Matrix
actual         ham  spam
predicted_rf            
ham           2701    31
spam             0   387
---
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      2701
        spam       1.00      0.93      0.96       418

    accuracy                           0.99      3119
   macro avg       0.99      0.96      0.98      3119
weighted avg       0.99      0.99      0.99      3119

Accuracy: 96.26%
---
Validate Confusion Matrix
actual         ham  spam
predicted_rf            
ham           1158    50
spam             0   130
---
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1158
        spam       1.00      0.72      0.84       180

    accuracy                           0.96      1338
   macro avg       0.98      0.86      0.91      1338
weighted avg       0.96      0.96      0.96      1338



**Takeaway**
 - Random forest performs similarly to logistic regression on the same TF-IDF features (96.26% vs. 95.67% validate accuracy).

### Trying CountVectorizer

In [7]:
# Raw term counts (no IDF weighting) + logistic regression on the spam text.
# The vocabulary is learned from the training split only.
cv = CountVectorizer()

X_train = cv.fit_transform(train['text'])
X_validate, X_test = cv.transform(validate['text']), cv.transform(test['text'])
y_train, y_validate, y_test = train['label'], validate['label'], test['label']

# One results frame per split; each starts with the true labels in `actual`.
train_results_cv = pd.DataFrame({'actual': y_train})
validate_results_cv = pd.DataFrame({'actual': y_validate})
test_results_cv = pd.DataFrame({'actual': y_test})

lm = LogisticRegression()
lm.fit(X_train, y_train)

train_results_cv['predicted_log_reg'] = lm.predict(X_train)
validate_results_cv['predicted_log_reg'] = lm.predict(X_validate)
# The test split is left unscored at this stage.

# Train metrics: accuracy, confusion matrix (rows = predicted, columns = actual), per-class report.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(train_results_cv['actual'], train_results_cv['predicted_log_reg'])
    ),
    '---',
    'Train Confusion Matrix',
    pd.crosstab(train_results_cv['predicted_log_reg'], train_results_cv['actual']),
    '---',
    classification_report(train_results_cv['actual'], train_results_cv['predicted_log_reg']),
    sep='\n',
)

# The same three metrics on the validate split.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(validate_results_cv['actual'], validate_results_cv['predicted_log_reg'])
    ),
    '---',
    'Validate Confusion Matrix',
    pd.crosstab(validate_results_cv['predicted_log_reg'], validate_results_cv['actual']),
    '---',
    classification_report(validate_results_cv['actual'], validate_results_cv['predicted_log_reg']),
    sep='\n',
)

Accuracy: 99.71%
---
Train Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                2701     9
spam                  0   409
---
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      2701
        spam       1.00      0.98      0.99       418

    accuracy                           1.00      3119
   macro avg       1.00      0.99      0.99      3119
weighted avg       1.00      1.00      1.00      3119

Accuracy: 97.38%
---
Validate Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                1157    34
spam                  1   146
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1158
        spam       0.99      0.81      0.89       180

    accuracy                           0.97      1338
   macro avg       0.98      0.91      0.94      1338
weighted avg       0.97      0.97      0.97      1338



## Trying RF with Count Vectorizer

In [8]:
# Raw term counts + random forest on the spam text.
# The vectorizer is re-fit on the training split so this cell does not depend on the previous cell's matrices.
cv = CountVectorizer()

X_train = cv.fit_transform(train.text)
X_validate = cv.transform(validate.text)
X_test = cv.transform(test.text)
y_train = train.label
y_validate = validate.label
y_test = test.label

# The results frames are rebuilt here, so the logistic-regression predictions from the
# previous cell are not carried over into them.
train_results_cv=pd.DataFrame(dict(actual = y_train))
validate_results_cv = pd.DataFrame(dict(actual = y_validate))
test_results_cv = pd.DataFrame(dict(actual = y_test))

# random_state pins the forest's internal randomness so repeated runs give the same metrics
# (same seed the model-comparison loop at the end of the notebook uses).
rf = RandomForestClassifier(max_depth=40, random_state=123).fit(X_train, y_train)

train_results_cv['predicted_rf'] = rf.predict(X_train)
validate_results_cv['predicted_rf'] = rf.predict(X_validate)
# test_results_cv['predicted_rf'] = rf.predict(X_test)  # test split intentionally left unscored

# Train metrics: accuracy, confusion matrix (rows = predicted, columns = actual), per-class report.
print('Accuracy: {:.2%}'.format(accuracy_score(train_results_cv.actual, train_results_cv.predicted_rf)))
print('---')
print('Train Confusion Matrix')
print(pd.crosstab(train_results_cv.predicted_rf, train_results_cv.actual))
print('---')
print(classification_report(train_results_cv.actual, train_results_cv.predicted_rf))


# The same three metrics on the validate split.
print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_cv.actual, validate_results_cv.predicted_rf)))
print('---')
print('Validate Confusion Matrix')
print(pd.crosstab(validate_results_cv.predicted_rf, validate_results_cv.actual))
print('---')
print(classification_report(validate_results_cv.actual, validate_results_cv.predicted_rf))

Accuracy: 98.78%
---
Train Confusion Matrix
actual         ham  spam
predicted_rf            
ham           2701    38
spam             0   380
---
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      2701
        spam       1.00      0.91      0.95       418

    accuracy                           0.99      3119
   macro avg       0.99      0.95      0.97      3119
weighted avg       0.99      0.99      0.99      3119

Accuracy: 95.89%
---
Validate Confusion Matrix
actual         ham  spam
predicted_rf            
ham           1158    55
spam             0   125
---
              precision    recall  f1-score   support

         ham       0.95      1.00      0.98      1158
        spam       1.00      0.69      0.82       180

    accuracy                           0.96      1338
   macro avg       0.98      0.85      0.90      1338
weighted avg       0.96      0.96      0.96      1338



### Clean Data Update

In [9]:
# Build a copy of the spam frame with an extra `lem` column: each message is passed through
# prepare.basic_clean -> tokenize -> lemmatize -> remove_stopwords (no extra or excluded words).
# `assign` returns a new frame, so `df` itself is left untouched.
clean_and_lem_df = df.assign(
    lem=(
        df['text']
        .apply(prepare.basic_clean)
        .apply(prepare.tokenize)
        .apply(prepare.lemmatize)
        .apply(prepare.remove_stopwords, extra_words = [], exclude_words = [])
    )
)

In [10]:
# Check the original frame: it should still have no `lem` column after building the cleaned copy.
df.head()

Unnamed: 0,id,label,text
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Preview the cleaned copy; the new `lem` column holds the output of the clean/tokenize/lemmatize/stopword pipeline.
clean_and_lem_df.head()

Unnamed: 0,id,label,text,lem
0,0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think go usf life around though


In [12]:
# Re-split from the frame that carries the `lem` column, with the same helper and target as before.
train, validate, test = prepare.train_validate_test_split(clean_and_lem_df, target = 'label')

In [13]:
# Raw term counts + logistic regression, this time on the cleaned/lemmatized `lem` column.
# The vocabulary is learned from the training split only.
cv = CountVectorizer()

X_train = cv.fit_transform(train['lem'])
X_validate, X_test = cv.transform(validate['lem']), cv.transform(test['lem'])
y_train, y_validate, y_test = train['label'], validate['label'], test['label']

# One results frame per split; each starts with the true labels in `actual`.
train_results_cv = pd.DataFrame({'actual': y_train})
validate_results_cv = pd.DataFrame({'actual': y_validate})
test_results_cv = pd.DataFrame({'actual': y_test})

lm = LogisticRegression()
lm.fit(X_train, y_train)

train_results_cv['predicted_log_reg'] = lm.predict(X_train)
validate_results_cv['predicted_log_reg'] = lm.predict(X_validate)
# The test split is left unscored at this stage.

# Train metrics: accuracy, confusion matrix (rows = predicted, columns = actual), per-class report.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(train_results_cv['actual'], train_results_cv['predicted_log_reg'])
    ),
    '---',
    'Train Confusion Matrix',
    pd.crosstab(train_results_cv['predicted_log_reg'], train_results_cv['actual']),
    '---',
    classification_report(train_results_cv['actual'], train_results_cv['predicted_log_reg']),
    sep='\n',
)

# The same three metrics on the validate split.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(validate_results_cv['actual'], validate_results_cv['predicted_log_reg'])
    ),
    '---',
    'Validate Confusion Matrix',
    pd.crosstab(validate_results_cv['predicted_log_reg'], validate_results_cv['actual']),
    '---',
    classification_report(validate_results_cv['actual'], validate_results_cv['predicted_log_reg']),
    sep='\n',
)

Accuracy: 99.36%
---
Train Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                2700    19
spam                  1   399
---
              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      2701
        spam       1.00      0.95      0.98       418

    accuracy                           0.99      3119
   macro avg       1.00      0.98      0.99      3119
weighted avg       0.99      0.99      0.99      3119

Accuracy: 97.01%
---
Validate Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                1157    39
spam                  1   141
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1158
        spam       0.99      0.78      0.88       180

    accuracy                           0.97      1338
   macro avg       0.98      0.89      0.93      1338
weighted avg       0.97      0.97      0.97      1338



# Final takeaways:
 - I'm not seeing a large difference between models.  
 - Hoping News data will make model differences clear


# News Data

In [14]:
# import News
# Load the prepared news articles via the project helper and name the label column used by the cells below.
news = prepare.create_prepared_news_df()
target = 'category'

Importing from csv


In [15]:
# Split the news articles into train/validate/test on the label column named by `target`,
# the same variable every modeling cell below reads, so the column name lives in one place.
train, validate, test = prepare.train_validate_test_split(news, target = target)

In [16]:
# Count vector model
# Raw term counts + logistic regression on the lemmatized news articles.
# The vocabulary is learned from the training split only.
cv = CountVectorizer()

X_train = cv.fit_transform(train['lemmatized'])
X_validate, X_test = cv.transform(validate['lemmatized']), cv.transform(test['lemmatized'])
y_train, y_validate, y_test = train[target], validate[target], test[target]

# One results frame per split; each starts with the true labels in `actual`.
train_results_cv = pd.DataFrame({'actual': y_train})
validate_results_cv = pd.DataFrame({'actual': y_validate})
test_results_cv = pd.DataFrame({'actual': y_test})

lm = LogisticRegression()
lm.fit(X_train, y_train)

train_results_cv['predicted_log_reg'] = lm.predict(X_train)
validate_results_cv['predicted_log_reg'] = lm.predict(X_validate)
# The test split is left unscored at this stage.

# Train metrics: accuracy and per-class report (the confusion-matrix printout is disabled here).
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(train_results_cv['actual'], train_results_cv['predicted_log_reg'])
    ),
    '---',
    classification_report(train_results_cv['actual'], train_results_cv['predicted_log_reg']),
    sep='\n',
)

# The same metrics on the validate split.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(validate_results_cv['actual'], validate_results_cv['predicted_log_reg'])
    ),
    '---',
    classification_report(validate_results_cv['actual'], validate_results_cv['predicted_log_reg']),
    sep='\n',
)

Accuracy: 100.00%
---
               precision    recall  f1-score   support

     business       1.00      1.00      1.00        14
entertainment       1.00      1.00      1.00        14
       sports       1.00      1.00      1.00        14
   technology       1.00      1.00      1.00        14

     accuracy                           1.00        56
    macro avg       1.00      1.00      1.00        56
 weighted avg       1.00      1.00      1.00        56

Accuracy: 54.17%
---
               precision    recall  f1-score   support

     business       0.00      0.00      0.00         6
entertainment       1.00      0.83      0.91         6
       sports       1.00      1.00      1.00         6
   technology       0.25      0.33      0.29         6

     accuracy                           0.54        24
    macro avg       0.56      0.54      0.55        24
 weighted avg       0.56      0.54      0.55        24



In [17]:
# Using Bigrams
cv = CountVectorizer(ngram_range=(2,2))

X_train = cv.fit_transform(train.lemmatized)
X_validate = cv.transform(validate.lemmatized)
X_test = cv.transform(test.lemmatized)
y_train = train[target]
y_validate = validate[target]
y_test = test[target]

train_results_cv=pd.DataFrame(dict(actual = y_train))
validate_results_cv = pd.DataFrame(dict(actual = y_validate))
test_results_cv = pd.DataFrame(dict(actual = y_test))

lm = LogisticRegression().fit(X_train, y_train)

train_results_cv['predicted_log_reg_bigrams'] = lm.predict(X_train)
validate_results_cv['predicted_log_reg_bigrams'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_cv.actual, train_results_cv.predicted_log_reg_bigrams)))
# print('---')
# print('Train Confusion Matrix')
# print(pd.crosstab(train_results_cv.predicted_log_reg_bigrams, train_results_cv.actual))
print('---')
print(classification_report(train_results_cv.actual, train_results_cv.predicted_log_reg_bigrams))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_cv.actual, validate_results_cv.predicted_log_reg_bigrams)))
# print('---')
# print('Validate Confusion Matrix')
# print(pd.crosstab(validate_results_cv.predicted_log_reg_bigrams, validate_results_cv.actual))
print('---')
print(classification_report(validate_results_cv.actual, validate_results_cv.predicted_log_reg_bigrams))


Accuracy: 100.00%
---
               precision    recall  f1-score   support

     business       1.00      1.00      1.00        14
entertainment       1.00      1.00      1.00        14
       sports       1.00      1.00      1.00        14
   technology       1.00      1.00      1.00        14

     accuracy                           1.00        56
    macro avg       1.00      1.00      1.00        56
 weighted avg       1.00      1.00      1.00        56

Accuracy: 41.67%
---
               precision    recall  f1-score   support

     business       0.00      0.00      0.00         6
entertainment       0.42      0.83      0.56         6
       sports       0.83      0.83      0.83         6
   technology       0.00      0.00      0.00         6

     accuracy                           0.42        24
    macro avg       0.31      0.42      0.35        24
 weighted avg       0.31      0.42      0.35        24



In [18]:
# Bayes?
cv = CountVectorizer()

X_train = cv.fit_transform(train.lemmatized)
X_validate = cv.transform(validate.lemmatized)
X_test = cv.transform(test.lemmatized)
y_train = train[target]
y_validate = validate[target]
y_test = test[target]

train_results_cv=pd.DataFrame(dict(actual = y_train))
validate_results_cv = pd.DataFrame(dict(actual = y_validate))
test_results_cv = pd.DataFrame(dict(actual = y_test))

lm = MultinomialNB().fit(X_train, y_train)

train_results_cv['predicted_nb'] = lm.predict(X_train)
validate_results_cv['predicted_nb'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_cv.actual, train_results_cv.predicted_nb)))
print('---')
# print('Train Confusion Matrix')
# print(pd.crosstab(train_results_cv.predicted_nb, train_results_cv.actual))
print('---')
print(classification_report(train_results_cv.actual, train_results_cv.predicted_nb))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_cv.actual, validate_results_cv.predicted_nb)))
print('---')
# print('Validate Confusion Matrix')
# print(pd.crosstab(validate_results_cv.predicted_nb, validate_results_cv.actual))
print('---')
print(classification_report(validate_results_cv.actual, validate_results_cv.predicted_nb))

Accuracy: 100.00%
---
---
               precision    recall  f1-score   support

     business       1.00      1.00      1.00        14
entertainment       1.00      1.00      1.00        14
       sports       1.00      1.00      1.00        14
   technology       1.00      1.00      1.00        14

     accuracy                           1.00        56
    macro avg       1.00      1.00      1.00        56
 weighted avg       1.00      1.00      1.00        56

Accuracy: 58.33%
---
---
               precision    recall  f1-score   support

     business       0.00      0.00      0.00         6
entertainment       1.00      1.00      1.00         6
       sports       0.86      1.00      0.92         6
   technology       0.29      0.33      0.31         6

     accuracy                           0.58        24
    macro avg       0.54      0.58      0.56        24
 weighted avg       0.54      0.58      0.56        24



In [19]:
# tfidf
# TF-IDF features + multinomial naive Bayes on the lemmatized news articles.
tfidf = TfidfVectorizer()

X_train = tfidf.fit_transform(train['lemmatized'])
X_validate, X_test = tfidf.transform(validate['lemmatized']), tfidf.transform(test['lemmatized'])
y_train, y_validate, y_test = train[target], validate[target], test[target]

# One results frame per split; each starts with the true labels in `actual`.
train_results_tfidf = pd.DataFrame({'actual': y_train})
validate_results_tfidf = pd.DataFrame({'actual': y_validate})
test_results_tfidf = pd.DataFrame({'actual': y_test})

# The fitted naive Bayes model is kept under the name `lm`, as in the earlier cells.
lm = MultinomialNB()
lm.fit(X_train, y_train)

train_results_tfidf['predicted_nb'] = lm.predict(X_train)
validate_results_tfidf['predicted_nb'] = lm.predict(X_validate)
# The test split is left unscored at this stage.

# Train metrics: accuracy, two separator lines (the confusion matrix between them is disabled), report.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(train_results_tfidf['actual'], train_results_tfidf['predicted_nb'])
    ),
    '---',
    '---',
    classification_report(train_results_tfidf['actual'], train_results_tfidf['predicted_nb']),
    sep='\n',
)

# The same metrics on the validate split.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(validate_results_tfidf['actual'], validate_results_tfidf['predicted_nb'])
    ),
    '---',
    '---',
    classification_report(validate_results_tfidf['actual'], validate_results_tfidf['predicted_nb']),
    sep='\n',
)

Accuracy: 100.00%
---
---
               precision    recall  f1-score   support

     business       1.00      1.00      1.00        14
entertainment       1.00      1.00      1.00        14
       sports       1.00      1.00      1.00        14
   technology       1.00      1.00      1.00        14

     accuracy                           1.00        56
    macro avg       1.00      1.00      1.00        56
 weighted avg       1.00      1.00      1.00        56

Accuracy: 58.33%
---
---
               precision    recall  f1-score   support

     business       0.00      0.00      0.00         6
entertainment       1.00      1.00      1.00         6
       sports       0.86      1.00      0.92         6
   technology       0.29      0.33      0.31         6

     accuracy                           0.58        24
    macro avg       0.54      0.58      0.56        24
 weighted avg       0.54      0.58      0.56        24



In [20]:
# W/ Bigrams
# Bigram-only TF-IDF features (ngram_range=(2,2)) + multinomial naive Bayes on the lemmatized news articles.
tfidf = TfidfVectorizer(ngram_range=(2,2))

X_train = tfidf.fit_transform(train['lemmatized'])
X_validate, X_test = tfidf.transform(validate['lemmatized']), tfidf.transform(test['lemmatized'])
y_train, y_validate, y_test = train[target], validate[target], test[target]

# One results frame per split; each starts with the true labels in `actual`.
train_results_tfidf = pd.DataFrame({'actual': y_train})
validate_results_tfidf = pd.DataFrame({'actual': y_validate})
test_results_tfidf = pd.DataFrame({'actual': y_test})

# The fitted naive Bayes model is kept under the name `lm`, as in the earlier cells.
lm = MultinomialNB()
lm.fit(X_train, y_train)

train_results_tfidf['predicted_nb'] = lm.predict(X_train)
validate_results_tfidf['predicted_nb'] = lm.predict(X_validate)
# The test split is left unscored at this stage.

# Train metrics: accuracy, two separator lines (the confusion matrix between them is disabled), report.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(train_results_tfidf['actual'], train_results_tfidf['predicted_nb'])
    ),
    '---',
    '---',
    classification_report(train_results_tfidf['actual'], train_results_tfidf['predicted_nb']),
    sep='\n',
)

# The same metrics on the validate split.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(validate_results_tfidf['actual'], validate_results_tfidf['predicted_nb'])
    ),
    '---',
    '---',
    classification_report(validate_results_tfidf['actual'], validate_results_tfidf['predicted_nb']),
    sep='\n',
)

Accuracy: 100.00%
---
---
               precision    recall  f1-score   support

     business       1.00      1.00      1.00        14
entertainment       1.00      1.00      1.00        14
       sports       1.00      1.00      1.00        14
   technology       1.00      1.00      1.00        14

     accuracy                           1.00        56
    macro avg       1.00      1.00      1.00        56
 weighted avg       1.00      1.00      1.00        56

Accuracy: 41.67%
---
---
               precision    recall  f1-score   support

     business       0.12      0.17      0.14         6
entertainment       0.75      0.50      0.60         6
       sports       0.86      1.00      0.92         6
   technology       0.00      0.00      0.00         6

     accuracy                           0.42        24
    macro avg       0.43      0.42      0.42        24
 weighted avg       0.43      0.42      0.42        24



In [21]:
# Words w/ bigrams
# Unigram + bigram TF-IDF features (ngram_range=(1,2)) + multinomial naive Bayes on the lemmatized news articles.
tfidf = TfidfVectorizer(ngram_range=(1,2))

X_train = tfidf.fit_transform(train['lemmatized'])
X_validate, X_test = tfidf.transform(validate['lemmatized']), tfidf.transform(test['lemmatized'])
y_train, y_validate, y_test = train[target], validate[target], test[target]

# One results frame per split; each starts with the true labels in `actual`.
train_results_tfidf = pd.DataFrame({'actual': y_train})
validate_results_tfidf = pd.DataFrame({'actual': y_validate})
test_results_tfidf = pd.DataFrame({'actual': y_test})

# The fitted naive Bayes model is kept under the name `lm`, as in the earlier cells.
lm = MultinomialNB()
lm.fit(X_train, y_train)

train_results_tfidf['predicted_nb'] = lm.predict(X_train)
validate_results_tfidf['predicted_nb'] = lm.predict(X_validate)
# The test split is left unscored at this stage.

# Train metrics: accuracy, two separator lines (the confusion matrix between them is disabled), report.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(train_results_tfidf['actual'], train_results_tfidf['predicted_nb'])
    ),
    '---',
    '---',
    classification_report(train_results_tfidf['actual'], train_results_tfidf['predicted_nb']),
    sep='\n',
)

# The same metrics on the validate split.
print(
    'Accuracy: {:.2%}'.format(
        accuracy_score(validate_results_tfidf['actual'], validate_results_tfidf['predicted_nb'])
    ),
    '---',
    '---',
    classification_report(validate_results_tfidf['actual'], validate_results_tfidf['predicted_nb']),
    sep='\n',
)

Accuracy: 100.00%
---
---
               precision    recall  f1-score   support

     business       1.00      1.00      1.00        14
entertainment       1.00      1.00      1.00        14
       sports       1.00      1.00      1.00        14
   technology       1.00      1.00      1.00        14

     accuracy                           1.00        56
    macro avg       1.00      1.00      1.00        56
 weighted avg       1.00      1.00      1.00        56

Accuracy: 58.33%
---
---
               precision    recall  f1-score   support

     business       0.00      0.00      0.00         6
entertainment       1.00      1.00      1.00         6
       sports       1.00      1.00      1.00         6
   technology       0.25      0.33      0.29         6

     accuracy                           0.58        24
    macro avg       0.56      0.58      0.57        24
 weighted avg       0.56      0.58      0.57        24



## Takeaway:
- Train accuracy is always 100%, but validate accuracy is low (42–58%), which indicates the models are overfitting the 56 training articles.

In [22]:
def model_words(vectorizer, class_model, ngrams_range_value, train, validate, target, print_results = True, text_col = 'lemmatized'):
    """Fit a text classifier on `train` and report its metrics on train and validate.

    vectorizer: the feature-extraction class (not an instance), such as CountVectorizer or
        TfidfVectorizer; it is instantiated here with ngram_range=ngrams_range_value
    class_model: a classifier instance exposing fit/predict; it is fitted in place
    ngrams_range_value: tuple passed as ngram_range, e.g. (1,1) for unigrams or (2,2) for bigrams
    train, validate: DataFrames that contain both the text column and the target column
    target: name of the label column
    print_results: when True, print accuracy and the classification report for both splits
    text_col: name of the text column to vectorize (defaults to 'lemmatized')

    Returns (train_class_report, validate_class_report), the dicts produced by
    classification_report(..., output_dict=True).

    Only the objects passed in are used; no `test` split is read, so the function does not
    depend on notebook-level variables.
    """

    feature_extraction_method = vectorizer(ngram_range=ngrams_range_value)

    # Learn the vocabulary from train only, then reuse it for validate.
    X_train = feature_extraction_method.fit_transform(train[text_col])
    X_validate = feature_extraction_method.transform(validate[text_col])
    y_train = train[target]
    y_validate = validate[target]

    train_results = pd.DataFrame(dict(actual = y_train))
    validate_results = pd.DataFrame(dict(actual = y_validate))

    model_to_use = class_model.fit(X_train, y_train)

    train_results['predicted'] = model_to_use.predict(X_train)
    validate_results['predicted'] = model_to_use.predict(X_validate)

    train_class_report = classification_report(train_results.actual, train_results.predicted, output_dict = True)
    validate_class_report = classification_report(validate_results.actual, validate_results.predicted, output_dict = True)

    if print_results:
        # Train block first, then validate; each report dict is shown as a DataFrame.
        print('Accuracy: {:.2%}'.format(accuracy_score(train_results.actual, train_results.predicted)))
        print('---')
        print('---')
        print(pd.DataFrame(train_class_report))


        print('Accuracy: {:.2%}'.format(accuracy_score(validate_results.actual, validate_results.predicted)))
        print('---')
        print('---')
        print(pd.DataFrame(validate_class_report))

    return train_class_report, validate_class_report

In [23]:
# Compare every vectorizer/classifier pairing on unigram features of the news data.
vectorizers = [CountVectorizer, TfidfVectorizer]
# Both tree-based models are seeded so repeated runs give the same metrics.
class_models = [RandomForestClassifier(random_state=123), LogisticRegression(), DecisionTreeClassifier(random_state=123)]
for v in vectorizers:
    for m in class_models:
        print("----")
        print(m, v)

        # model_words prints the reports itself; only the last pairing's return values are kept.
        train_class_report, validate_class_report = model_words(v, m, (1,1), train, validate, target, True)

----
RandomForestClassifier(random_state=123) <class 'sklearn.feature_extraction.text.CountVectorizer'>
Accuracy: 100.00%
---
---
           business  entertainment  sports  technology  accuracy  macro avg  \
precision       1.0            1.0     1.0         1.0       1.0        1.0   
recall          1.0            1.0     1.0         1.0       1.0        1.0   
f1-score        1.0            1.0     1.0         1.0       1.0        1.0   
support        14.0           14.0    14.0        14.0       1.0       56.0   

           weighted avg  
precision           1.0  
recall              1.0  
f1-score            1.0  
support            56.0  
Accuracy: 58.33%
---
---
           business  entertainment    sports  technology  accuracy  macro avg  \
precision  0.285714       0.833333  1.000000    0.333333  0.583333   0.613095   
recall     0.333333       0.833333  0.833333    0.333333  0.583333   0.583333   
f1-score   0.307692       0.833333  0.909091    0.333333  0.583333   0.59586

## Take away:
- I think Accuracy may not be the metric worth looking at.  
- I think my limited ability to interpret the results may be part of the problem.
- I can run code... be happy with that, Codeup... I just don't know what it means.