# Real vs. Satire Model

* Multinomial NB
* Random Forest Classifier
* Logistic Regression Classifier

In [1]:
pip install scattertext 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scattertext
  Downloading scattertext-0.1.10-py3-none-any.whl (7.4 MB)
[K     |████████████████████████████████| 7.4 MB 5.6 MB/s 
Collecting flashtext
  Downloading flashtext-2.7.tar.gz (14 kB)
Collecting gensim>=4.0.0
  Downloading gensim-4.2.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 84 kB/s 
Building wheels for collected packages: flashtext
  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
  Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9307 sha256=3feb9dbf52a0ffd7c2a981503f6c956ca7829d2acedeb7fc58849541364f69bd
  Stored in directory: /root/.cache/pip/wheels/8d/62/8b/71813348245ae1bcbae179193bbc72db819e8057e89298a6ac
Successfully built flashtext
Installing collected packages: gensim, flashtext, scattertext
  Attempting uninstall: gensim
    Found existin

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt 
import scattertext as st
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics._plot.confusion_matrix import confusion_matrix
from yellowbrick.model_selection import FeatureImportances

from google.colab import drive

In [3]:
# Uncomment if connecting to Google Drive
# Run this cell and select your UMich Google account in the pop-up

# drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Read files

In [4]:
# read files
train_df = pd.read_csv('data\processed\real_satire\updated_train_df.csv')
test_df = pd.read_csv('data\processed\real_satire\updated_test_df.csv')

In [5]:
X_train = train_df['clean_content']
y_train = train_df['label']
X_test = test_df['clean_content']
y_test = test_df['label']

In [6]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(47319,)
(47319,)
(33543,)
(33543,)


## Vectorize and transform

In [7]:
def vect_transform(X_train, X_test):
    # vectorize
    vect = TfidfVectorizer(max_features=2000, stop_words='english', ngram_range=(1,2), min_df=2, max_df=0.5)
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    tfidf_transformer = TfidfTransformer()
    tfidf_transformer.fit(X_train_dtm)
    tfidf_transformer.transform(X_train_dtm)
    return vect, X_train_dtm, X_test_dtm

In [8]:
vect, X_train_dtm, X_test_dtm = vect_transform(X_train, X_test)

## Models Training

### Dummy Classifier

In [None]:
# Initialize and train dummy classifier
# the predict_proba method randomly samples one-hot vectors from a multinomial 
# distribution parametrized by the empirical class prior probabilities. 
dummy_clf = DummyClassifier(strategy="stratified",
                            random_state=42
                            ).fit(X_train, y_train)
# Get predictions - Logistic Regression
y_pred_dummy = dummy_clf.predict(X_test)
# Get model quality scores
print(metrics.classification_report(y_test, y_pred_dummy))
# Confusion Matrix
cm = metrics.confusion_matrix(y_test, y_pred_dummy)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', cmap="Purples")
ax.set_title("Confusion Matrix | Dummy Classifier")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Initialize and train dummy classifier
dummy_clf = DummyClassifier(strategy="most_frequent",
                            random_state=42
                            ).fit(X_train, y_train)
# Get predictions - Logistic Regression
y_pred_dummy = dummy_clf.predict(X_test)
# Get model quality scores
print(metrics.classification_report(y_test, y_pred_dummy))
# Confusion Matrix
cm = metrics.confusion_matrix(y_test, y_pred_dummy)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', cmap="Purples")
ax.set_title("Confusion Matrix | Dummy Classifier")
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

### Multinomial Naive Bayes

In [None]:
def mul_nb(X_train_dtm, y_train, X_test_dtm, y_test):
    # Naive Bayes
    nb = MultinomialNB()
    # train the model using X_train_dtm
    nb.fit(X_train_dtm, y_train)
    # make class predictions for X_test_dtm
    y_pred_class = nb.predict(X_test_dtm)
    print('Accuracy of Naive Bayes classifier on test set: {:.2f}'.format(nb.score(X_test_dtm, y_test)))
    print(metrics.confusion_matrix(y_test, y_pred_class))
    print(metrics.classification_report(y_test, y_pred_class))
    print("---------------------------------------")
    # calculate predicted probabilities for X_test_dtm
    y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
    print('predicted probabilities for X_test_dtm: {}'.format(y_pred_prob))
    print("---------------------------------------")
    # calculate AUC
    print('AUC: {}'.format(metrics.roc_auc_score(y_test, y_pred_prob)))
    return y_pred_class

In [None]:
y_pred_class = mul_nb(X_train_dtm, y_train, X_test_dtm, y_test)

In [None]:
# example of false negative 
X_test[1000]

In [None]:
def mul_nb_pipeline():
    pipe = Pipeline([('bow', CountVectorizer()), 
                 ('tfid', TfidfTransformer()),  
                 ('model', MultinomialNB())])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    confusion_matrix = metrics.confusion_matrix(y_test, y_pred, normalize='all')
    print('Accuracy of Multi nomialNB classifier on test set: {:.2f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print("---------------------------------------")
    print(metrics.confusion_matrix(y_test, y_pred))
    print(metrics.classification_report(y_test, y_pred))
    print("---------------------------------------")
    domain_color = ["real", "satire"]
    ax = plt.subplot()
    sns.heatmap(confusion_matrix, annot=True, fmt='.2f', ax=ax, cmap="Purples")
    ax.set_title("Confusion Matrix | Multinomial NB")
    plt.xlabel('Predicted')
    plt.ylabel('True')
    ax.set_xticklabels(["real", "satire"])
    ax.set_yticklabels(["real", "satire"])
    plt.show()
    return y_pred, confusion_matrix

In [None]:
mul_nb_pred, mul_nb_confusion_matrix = mul_nb_pipeline()

### Logistic Regression

In [None]:
def logistic_reg():
    logreg = LogisticRegression(solver='liblinear')
    logreg.fit(X_train_dtm, y_train)
    # make class predictions for X_test_dtm
    y_pred = logreg.predict(X_test_dtm)
    # calculate predicted probabilities for X_test_dtm
    y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
    cm = metrics.confusion_matrix(y_test, y_pred, normalize='all')
    print('Accuracy of Logistic Regression classifier on test set: {:.2f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('AUC: {}'.format(metrics.roc_auc_score(y_test, y_pred_prob)))
    print("---------------------------------------")
    print(cm)
    print(metrics.classification_report(y_test, y_pred))
    print("---------------------------------------")   
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, fmt='.2f', ax=ax, cmap="Purples")
    ax.set_title("Normalized Confusion Matrix | Logistic Regression Classifier")
    ax.set_xticklabels(["real", "satire"])
    ax.set_yticklabels(["real", "satire"])
    plt.xlabel('Predicted')
    plt.ylabel('True')

    plt.show()
    return y_pred_prob, confusion_matrix

In [None]:
lr_y_pred_prob, lr_matrix_confusion = logistic_reg()

### Random Forest Classifier

In [None]:
def random_forest():
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=65, random_state=42)
    rf_clf.fit(X_train_dtm, y_train)
    y_pred = rf_clf.predict(X_test_dtm)
    y_pred_prob = rf_clf.predict_proba(X_test_dtm)[:, 1]
    cm = metrics.confusion_matrix(y_test, y_pred, normalize='all')
    print('Accuracy of Random Forest Classifier on test set: {:.2f}'.format(metrics.accuracy_score(y_test, y_pred)))
    print('AUC: {}'.format(metrics.roc_auc_score(y_test, y_pred_prob)))
    print("---------------------------------------")  
    print(cm)
    print(metrics.classification_report(y_test, y_pred))
    print("---------------------------------------")  
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, fmt='.2f', ax=ax, cmap="Purples")
    ax.set_title("Normalized Confusion Matrix | Random Forest Classifier")
    plt.xlabel('Predicted')
    plt.ylabel('True')
    ax.set_xticklabels(["real", "satire"])
    ax.set_yticklabels(["real", "satire"])
    plt.show()
    return y_pred_prob, confusion_matrix

In [None]:
rd_y_pred_prob, rd_matrix_confusion = random_forest()

Logistic Regression model works the best.

## Feature Importances with Visulizations

* Random Forest Classifier
* Logistic Regression Classifier

In [None]:
# nb = MultinomialNB()
vect = TfidfVectorizer(max_features=2000, stop_words='english', ngram_range=(1,2), min_df=2, max_df=0.5)
vect.fit(X_train)

In [None]:
colors = ['#1F77B4']
# Title case the feature for better display and create the visualizer
visualizer = FeatureImportances(MultinomialNB(), labels=vect.get_feature_names(), relative=False, absolute=False, topn=20, colors=colors)
# Fit and show the feature importances
visualizer.fit(X_train_dtm, y_train)
visualizer.show()

In [13]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=65, random_state=42)
rf_clf.fit(X_train_dtm, y_train)
y_pred = rf_clf.predict(X_test_dtm)

feature_scores = pd.Series(rf_clf.feature_importances_, index=vect.get_feature_names()).sort_values(ascending=False)
rf_df = pd.DataFrame(feature_scores)
# get the top 20 features since all features have positive coef.
final_rf_df = rf_df[:20]

final_rf_df = final_rf_df.reset_index()
final_rf_df.columns = ['word', 'score']



In [22]:
rf_chart = alt.Chart(final_rf_df).mark_bar().encode(
    x=alt.X('score:Q', title='RF Feature Importance'),
    y=alt.Y('word:N', title='Features', sort='-x'),
    color=alt.value("#4DAF4A")
).properties(
    width=400,
    height=400)

In [25]:
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
lr_coef = logreg.coef_[0]
norm_lr_coef = np.std(X_train_dtm.toarray(),0) * lr_coef
# print(len(lr_coef))
df_mapping = vect.vocabulary_
df_map = [(x, df_mapping[x]) for x in df_mapping]
df_map.sort(key=lambda x: x[1], reverse=True)
vocab_map = [x[0] for x in df_map]

fitted_sel = SelectPercentile(chi2, percentile=60).fit(X_train_dtm, y_train)
mask = fitted_sel.get_support()
final_vocab = np.array(vocab_map)[mask]
# print(len(final_vocab))
lr_coef_df = pd.DataFrame(list(zip(final_vocab, norm_lr_coef)), columns=['word', 'coef'])
lr_coef_df = lr_coef_df.reindex(lr_coef_df.coef.abs().sort_values(ascending=False).index)
final_lr_df = lr_coef_df[:20]
final_lr_df['label'] = None

final_lr_df.loc[final_lr_df['coef'] > 0, 'label'] = 'real'
final_lr_df.loc[final_lr_df['coef'] <=0, 'label'] = 'satire'
domainColor = ["satire", "real"]
rangeColor = ["#1F77B4", "#4DAF4A"]
lr_chart = alt.Chart(final_lr_df).mark_bar().encode(
    x=alt.X('coef:Q', title='LR Weights'),
    y=alt.Y('word:N', title='Features', sort=None),
    color=alt.Color('label:N', scale=alt.Scale(domain=domainColor, range=rangeColor))).properties(
        width=400,
        height=400)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_lr_df['label'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [None]:
top_lr_df = final_lr_df[:10]
lr_chart = alt.Chart(top_lr_df).mark_bar().encode(
    x=alt.X('coef:Q', title='LR Weights'),
    y=alt.Y('word:N', title='Features', sort=None),
    color=alt.condition(
        alt.datum.coef > 0, 
        alt.value("#4DAF4A"),
        alt.value("#1F77B4"))
    ).properties(
        width=800,
        height=500)
final_chart = (lr_chart).configure_axis(
            labelFontSize=24, #10
            titleFontSize=12
        ).configure_title(
            fontSize=15
        )
# final_chart

Visulizations

In [28]:
combined_chart = (lr_chart | rf_chart).properties(
    title='Feature Importance'
        ).configure_axis(
            labelFontSize=10,
            titleFontSize=12
        ).configure_axis(
    labelFontSize=15
  )
combined_chart

## Calculate and compare accuracy rates and f1 scores

In [None]:
def models():
    return [
            RandomForestClassifier(n_estimators=100, max_depth=65, random_state=42),
            LogisticRegression(solver='liblinear'),
            MultinomialNB()
        ]
models_list = models()

In [None]:
def calculate_accuracy(models, cv=5):
    cv_df = pd.DataFrame(index=range(cv * len(models)))
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, X_train_dtm, y_train, scoring='accuracy', cv=cv)
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((model_name, fold_idx, accuracy))
    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
    mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
    std_accuracy = cv_df.groupby('model_name').accuracy.std()

    acc = pd.concat([mean_accuracy, std_accuracy], axis=1, ignore_index=True)
    acc.columns = ['Mean Accuracy', 'Standard Deviation']
    return cv_df, acc

In [None]:
def accuracy_plot(cv_df):
    plt.figure(figsize=(8,5))
    sns.boxplot(x='model_name', y='accuracy', 
                data=cv_df, 
                color='lightblue', 
                showmeans=True)
    plt.title("MEAN ACCURACY (cv = 5)")

In [None]:
cv_df, acc = calculate_accuracy(models=models_list, cv=5)
accuracy_plot(cv_df)
acc

In [None]:
def calculate_f1(models, cv=5):
    # models = models()
    cv_df = pd.DataFrame(index=range(cv * len(models)))
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, X_train_dtm, y_train, scoring='f1', cv=cv)
        for fold_idx, accuracy in enumerate(accuracies):
            entries.append((model_name, fold_idx, accuracy))
    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'f1'])
    mean_f1 = cv_df.groupby('model_name').f1.mean()
    std_f1 = cv_df.groupby('model_name').f1.std()

    acc = pd.concat([mean_f1, std_f1], axis=1, ignore_index=True)
    acc.columns = ['Mean F1', 'Standard Deviation']
    return cv_df, acc

In [None]:
def f1_plot(cv_df):
    plt.figure(figsize=(8,5))
    sns.boxplot(x='model_name', y='f1', 
                data=cv_df, 
                color='lightblue', 
                showmeans=True)
    plt.title("MEAN F1 (cv = 5)\n", size=14)

In [None]:
cv_df, acc = calculate_f1(models=models_list, cv=5)
f1_plot(cv_df)
acc

## ScatterText

In [None]:
scatter_df = pd.concat([train_df[:1500], train_df[-1500:]])
scatter_df = scatter_df.assign(
    parse=lambda scatter_df: scatter_df.content.apply(st.whitespace_nlp_with_sentences)
    )
scatter_df["label_text"] = "real"
scatter_df.loc[scatter_df["label"] == 0, "label_text"] = "satire"
scatter_df.drop(columns=["label"], inplace=True)
scatter_df.rename(columns={"label_text": "label"}, inplace=True)
# scatter_df.describe()

In [None]:
corpus = st.CorpusFromParsedDocuments(
    scatter_df,
    category_col="label",
    parsed_col="parse"
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

term_list = ["pick", "van gaal", "removed", "women", "provided", "impact", "king", 
             "worked", "woman", "legislation", "words", "head", "bush", "supposed",
             "costs", "flight", "majority", "using", "like", "added", "stated",
             "reportedly", "pictured", "continued", "asked", "press", "know",
             "little", "oh", "wednesday", "actually", "thing", "mean", "fact",
             "really", "sources", "finally", "thursday", "guess", "replied",
             "resident", "totally", "suddenly", "liberal", "baseball",
             "forget", "dinner", "cool", "unfortunately", "planet", "numerous", "dozen",
             "conference", "explain", "movies", "sure", "known"]

scatter_data = st.produce_scattertext_explorer(
    corpus,
    category='real', 
    category_name='Real', 
    not_category_name='Satire',
    minimum_term_frequency=0, 
    pmi_threshold_coefficient=0,
    return_data=True,
    transform=st.Scalers.dense_rank
)
scatter_plot_df = pd.DataFrame(scatter_data['data'])
scatter_plot_df['plot_term'] = scatter_plot_df['term'].apply(lambda x: x if x in term_list else " ")

scatterPlot = alt.Chart(scatter_plot_df).mark_circle().encode(
    x = alt.X('x:Q', title='\u21e6 less with satire | more with satire \u21e8', axis=alt.Axis(tickMinStep=0.1)),
    y = alt.Y('y:Q', title='\u21e6 less with real | more with real \u21e8', axis=alt.Axis(tickMinStep=0.1)),
    color = alt.Color('s', scale=alt.Scale(scheme='redyellowgreen', reverse=True),
                      legend=alt.Legend(title=["Real vs Satire", "frequency"])),
    tooltip=['term']
).properties(
    width = 500,
    height = 500
)

text = alt.Chart(scatter_plot_df).mark_text(
    align='left',
    baseline="middle",
    dx=7,
    color="black"
).encode(
    x=alt.X("x:Q"),
    y=alt.Y("y:Q"),
    text='plot_term'
).properties(
    title="How frequently are terms used with each type of news? Real vs. Satire",
    width=500,
    height=500
)

(scatterPlot + text).configure_axis(
    labelFontSize=10,
    titleFontSize=12
).configure_title(
    fontSize=15,
    anchor='start'
).configure_legend(
    titleFontSize=12
)

In [None]:
scatter_sub_plot_df = scatter_plot_df[(scatter_plot_df['x'] <= 0.2) & (scatter_plot_df['y'] <= 0.2)]

scatter_sub_plot = alt.Chart(scatter_sub_plot_df).mark_circle().encode(
    x = alt.X('x:Q', title='\u21e6 less with satire | more with satire \u21e8', axis=alt.Axis(tickMinStep=0.1)),
    y = alt.Y('y:Q', title='\u21e6 less with real | more with real \u21e8', axis=alt.Axis(tickMinStep=0.1)),
    color = alt.Color('s', scale=alt.Scale(scheme='redyellowgreen', reverse=True),
                      legend=alt.Legend(title=["Real vs Satire", "frequency"])),
    tooltip=['term']
).properties(
    width = 500,
    height = 500
).interactive()

text_sub = alt.Chart(scatter_sub_plot_df).mark_text(
    align='left',
    baseline='middle',
    dx=7,
    color='black'
).encode(
    x = alt.X('x:Q'),
    y = alt.Y('y:Q'),
    text='plot_term'
).properties(
    title = {'text':['How frequently are terms used with each type of news? Real vs Satire'],
             'subtitle': ['Frequency <= 0.2']},
    width = 500,
    height = 500
)

(scatter_sub_plot + text_sub).configure_axis(
    labelFontSize=10,
    titleFontSize=12
).configure_title(
    fontSize=15,
    subtitleFontSize=12,
    anchor='start'
).configure_legend(
    titleFontSize=12
)