# Models

Trying out word embeddings, and comparing them to one-hot embeddings for visualization and modeling

In [None]:
import os
import sys
import sklearn
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
import plotly.graph_objects as go
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models.fasttext import FastText
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import BaggingClassifier

# Global matplotlib defaults: larger fonts for axis labels and tick labels.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
# IPython magic (notebook only): sets the inline backend's figure format to 'retina'.
%config InlineBackend.figure_format = 'retina'
# from google.colab import drive
# drive.mount('/content/drive')

# Import Preprocessed Dataset

Data is generated from PREPROCESSING.ipynb

In [None]:
# Load the preprocessed comments table (generated by PREPROCESSING.ipynb).
# The rest of this notebook reads its 'body_text' and 'label' columns.
# NOTE: unpickling can execute arbitrary code -- only load a DATASET.pkl that
# you produced yourself or otherwise trust.
comments = pd.read_pickle("reddit_data/DATASET.pkl")
# Display the first rows as a quick sanity check.
comments.head()

# Embed Words Into Vectors

In [None]:
def generate_FastText_embeddings(df, vector_size=100, window=3, min_count=1):
    """Train FastText on the comments in ``df`` and embed every comment.

    Each comment is tokenised with gensim's ``preprocess_string``; a FastText
    model is trained on those token lists, and a comment's embedding is the
    mean of the vectors of its tokens.

    Args:
        df: DataFrame with a ``body_text`` column holding the comment strings.
        vector_size: Dimensionality of the word (and comment) vectors.
        window: Context window size passed to FastText.
        min_count: Minimum word count passed to FastText.

    Returns:
        A ``(len(df), vector_size)`` float array, one row per comment in the
        order of ``df``. Comments with no tokens left after preprocessing
        keep an all-zero row.
    """
    # Keep the tokenised comments as a plain list of token lists. Wrapping
    # them in np.array() creates a ragged object array (rejected by recent
    # NumPy releases) or, if every comment happens to have the same number of
    # tokens, a 2-D string array for which a `!= []` check is no longer an
    # emptiness test.
    rows = [preprocess_string(text) for text in df['body_text']]
    # if you want to remove stopwords explicitly first, you can try:
    # rows = [preprocess_string(remove_stopwords(text)) for text in df['body_text']]

    # train the model
    # NOTE(review): `size=` is the keyword this notebook was written against;
    # newer gensim releases appear to call it `vector_size` -- confirm against
    # the installed version.
    model = FastText(size=vector_size, window=window, min_count=min_count)
    model.build_vocab(rows)
    model.train(sentences=rows, total_examples=len(rows), epochs=10)

    # average the word vectors to get one vector that represents a whole comment
    vecs = np.zeros((len(rows), vector_size))

    for i, tokens in enumerate(rows):
        # some comments are empty after preprocessing; leave their row at zero
        if tokens:
            vecs[i, :] = sum(model.wv.get_vector(word) for word in tokens) / len(tokens)

    return vecs

In [None]:
def generate_onehot_embeddings(df, max_features=2**12):
    """Return a dense TF-IDF matrix for the ``body_text`` column of ``df``.

    Args:
        df: DataFrame with a ``body_text`` column holding the comment strings.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns:
        The fitted-and-transformed TF-IDF matrix converted to a dense array,
        one row per comment.
    """
    texts = df['body_text'].values
    tfidf = TfidfVectorizer(max_features=max_features)
    # fit_transform returns a sparse matrix; callers expect a dense array.
    return tfidf.fit_transform(texts).toarray()

Experiment with different vector sizes. So far I've tried 100 and 16. I think 100 performs better. Also, if the vector size is small enough you can feed it directly into t-SNE instead of doing PCA.

In [None]:
# One 50-dimensional averaged FastText vector per comment.
X_fasttext = generate_FastText_embeddings(df=comments, vector_size=50, window=3, min_count=1)

In [None]:
# Dense TF-IDF features per comment, limited via max_features=2**12 (4096).
X_onehot = generate_onehot_embeddings(df=comments, max_features=2**12)

# Compare Visualizations

For plotting, use PCA to get both datasets down to a reasonable number of dimensions, then use t-SNE.

### Note for our report

The plots below should be able to give us a lot of material to include in the report, since we get a bird's-eye view of the two different subreddits. Try finding distinct clusters of points in the t-SNE plot, and hover over the comments to see what the similarities are.

An important thing to look for in the plots is the distinct clusters of points that are all the same color. From what I've seen so far, these represent comments that have certain words or small phrases that are unique to the "liberal" or "conservative" subreddit. For example, I noticed there are a bunch of comments in r/Conservative that mention "sjw" (social justice warrior), so you might find all those comments clumped together. 

These distinct words or phrases are probably a big part of what any machine learning model will use for predictions, so that is something to keep in mind when analyzing model performance as well. 

### Note on t-SNE

If you're not familiar with t-SNE, check out this link


https://distill.pub/2016/misread-tsne/



From my experience, the only parameter that really matters is "perplexity". I have no idea what it means, but on big datasets like ours it generally has the effect of changing the "clumpiness" of the datapoints. Lower values of perplexity tend to produce larger "clumps" of points, whereas higher perplexity gives lots of smaller clumps. 

In [None]:
# Reduce the TF-IDF features to 20 principal components before running t-SNE.
# NOTE(review): no random_state is passed; if a randomized SVD solver gets
# selected, the components may differ slightly between runs -- confirm.
pca = PCA(n_components=20) 

pca_result_onehot = pca.fit_transform(X_onehot)

In [None]:
# Shared t-SNE configuration used for both 2-D projections that follow.
tsne = TSNE(
    verbose=0,
    n_components=2,
    perplexity=30,  # good values are 10-50. 30 is default.
    # t-SNE is stochastic: fix the seed (same value as the train/validation
    # split) so the plots discussed in the report are reproducible.
    random_state=42,
)

#### The following two code blocks might not work in Colab, and if they do, they'll take about an hour. They work on my local machine, though.

In [None]:
# 2-D t-SNE projection of the FastText vectors (fed in directly, no PCA step).
X_fasttext_2D = tsne.fit_transform(X_fasttext)

In [None]:
# 2-D t-SNE projection of the PCA-reduced TF-IDF features.
X_onehot_2D = tsne.fit_transform(pca_result_onehot)

In [None]:
def format_df_for_plotting(df):
    """Return a copy of ``df`` whose ``body_text`` is wrapped for hover labels.

    Each comment is wrapped at 30 characters and the resulting line breaks are
    replaced by ``<br>`` so the text can be used in the Plotly hover template.
    The input DataFrame is not modified.

    Args:
        df: DataFrame with a ``body_text`` string column.

    Returns:
        A deep copy of ``df`` with the reformatted ``body_text`` column.
        Missing ``body_text`` values stay missing instead of raising.
    """
    df_formatted = df.copy(deep=True)
    wrapped = df_formatted['body_text'].str.wrap(30)
    # Use the vectorised .str accessor rather than apply(str.replace): it
    # passes missing values through, whereas calling .replace on NaN raises.
    # regex=False keeps this a literal newline substitution.
    df_formatted['body_text'] = wrapped.str.replace('\n', '<br>', regex=False)
    return df_formatted

In [None]:
# Copy of the comments with body_text wrapped and <br>-separated for hover text.
comments_formatted = format_df_for_plotting(df=comments)

In [None]:
# Interactive scatter of the 2-D t-SNE projection of the FastText vectors.
# Points are coloured by the comment's label; hovering shows the comment text.
marker_style = dict(
    size=4,
    opacity=0.7,
    color=comments_formatted["label"],
    # colorscale="jet"
)
scatter = go.Scattergl(
    name="",
    x=X_fasttext_2D[:, 0],
    y=X_fasttext_2D[:, 1],
    mode='markers',
    marker=marker_style,
    text=comments_formatted['body_text'],
    hovertemplate="</br> %{text}",
)
fig = go.Figure(data=scatter)

# Large white hover labels, hidden axes.
hover_style = dict(bgcolor="white", font_size=24)
fig.update_layout(
    title=dict(text='t-SNE r/Conservative and r/politics Comments', x=0.5, font_size=30),
    template="plotly_white",
    height=800,
    hoverlabel=hover_style,
    xaxis=dict(visible=False),
    yaxis=dict(visible=False),
)
fig.show()

In [None]:
# Interactive scatter of the 2-D t-SNE projection of the PCA-reduced TF-IDF
# features. Points are coloured by the comment's label; hovering shows the
# comment text.
marker_style = dict(
    size=4,
    opacity=0.7,
    color=comments_formatted["label"],
    # colorscale="jet"
)
scatter = go.Scattergl(
    name="",
    x=X_onehot_2D[:, 0],
    y=X_onehot_2D[:, 1],
    mode='markers',
    marker=marker_style,
    text=comments_formatted['body_text'],
    hovertemplate="</br> %{text}",
)
fig = go.Figure(data=scatter)

# Large white hover labels, hidden axes.
hover_style = dict(bgcolor="white", font_size=24)
fig.update_layout(
    title=dict(text='t-SNE r/Conservative and r/politics Comments', x=0.5, font_size=30),
    template="plotly_white",
    height=800,
    hoverlabel=hover_style,
    xaxis=dict(visible=False),
    yaxis=dict(visible=False),
)
fig.show()

# Compare Model Performance

In [None]:
# Target vector: the 'label' column as a NumPy array.
labels = np.asarray(comments["label"].tolist())

# Both feature sets are split 80/20 with the same test_size and random_state.
split_options = dict(test_size=0.2, random_state=42)

X_ft_train, X_ft_valid, y_ft_train, y_ft_valid = train_test_split(
    X_fasttext, labels, **split_options
)

X_oh_train, X_oh_valid, y_oh_train, y_oh_valid = train_test_split(
    X_onehot, labels, **split_options
)

In [None]:
# Classifiers to evaluate on the FastText features, keyed by display name.
# Uncomment an entry to include that model in the evaluation loop.
fasttext_models = {
    "Gaussian Naive Bayes": GaussianNB(),
    #"Decision Tree": DecisionTreeClassifier(),
    #"Bagging Ensemble": BaggingClassifier(),
    #"KNN": KNeighborsClassifier(),
    #"SVM": SVC(),
}

In [None]:
# Classifiers to evaluate on the TF-IDF features, keyed by display name.
# Uncomment an entry to include that model in the evaluation loop.
onehot_models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    #"Decision Tree": DecisionTreeClassifier(),
    #"Bagging Ensemble": BaggingClassifier(),
    #"KNN": KNeighborsClassifier(), 
    #"SVM": SVC(), takes at least 2 hours.
}

### Model Results Using FastText Embeddings

In [None]:
# Fit every configured model on the FastText training split and report its
# accuracy, precision, recall and F1 on the validation split.
for key, model in fasttext_models.items():
    print(key)
    model.fit(X_ft_train, y_ft_train)
    y_pred = model.predict(X_ft_valid)
    for template, metric in (
        ('Accuracy score: {:.4f}', accuracy_score),
        ('Precision score: {:.4f}', precision_score),
        ('Recall score: {:.4f}', recall_score),
        ('F1 score: {:.4f} \n', f1_score),
    ):
        print(template.format(metric(y_ft_valid, y_pred)))

### Model Results Using Tf-Idf OneHot Embeddings

In [None]:
# Fit every configured model on the TF-IDF training split and report its
# accuracy, precision, recall and F1 on the validation split.
for key, model in onehot_models.items():
    print(key)
    model.fit(X_oh_train, y_oh_train)
    y_pred = model.predict(X_oh_valid)
    for template, metric in (
        ('Accuracy score: {:.4f}', accuracy_score),
        ('Precision score: {:.4f}', precision_score),
        ('Recall score: {:.4f}', recall_score),
        ('F1 score: {:.4f} \n', f1_score),
    ):
        print(template.format(metric(y_oh_valid, y_pred)))