# Part 2: Preprocessing & Modeling

## Imports

In [None]:
import nltk
import pandas                        as pd
import numpy                         as np
import seaborn                       as sns
import matplotlib.pyplot             as plt
from sklearn.ensemble                import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model            import LogisticRegression
from sklearn.metrics                 import confusion_matrix
from sklearn.metrics                 import roc_auc_score
from sklearn.metrics                 import accuracy_score
from sklearn.metrics                 import precision_score
from sklearn.metrics                 import recall_score
from sklearn.metrics                 import f1_score
from sklearn.metrics                 import balanced_accuracy_score
from sklearn.model_selection         import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline                import Pipeline
from sklearn.tree                    import DecisionTreeClassifier
from sklearn.svm                     import SVC
from nltk.corpus                     import stopwords
from nltk.stem                       import WordNetLemmatizer
from nltk.tokenize                   import RegexpTokenizer 
from xgboost                         import XGBClassifier
from IPython.core.display            import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
sns.set(style = "white", palette = "deep")
%matplotlib inline

## Table Of Contents

-------

1. [Reading In The Data](#Reading-In-The-Data)
    - [Overview](#Overview)
    - [Visuals](#Visuals)
    
    
2. [Lemmatizing](#Lemmatizing)


3. [Establishing The Baseline](#Establishing-The-Baseline)


4. [Modeling](#Modeling)
    - [Setting The X and y variables](#Setting-The-X-and-y-variables)
    - [Running A Train-Test Split](#Running-A-Train-Test-Split)
    - [Evaluation Formulae](#Evaluation-Formulae)
    - [Logistic Regression](#Logistic-Regression)
    - [Random Forest Classifier](#Random-Forest-Classifier)
    - [Support Vector Classifier](#Support-Vector-Classifier)
    - [XGBoost Classifier](#XGBoost-Classifier)
    
    
5. [Model Evaluation](#Model-Evaluation)

## Reading In The Data

### Overview

In [None]:
model_data = pd.read_csv("../Data/model_data.csv")

In [None]:
# Checking the data's head

model_data.head()

In [None]:
# Checking for null values

model_data.isnull().sum()

In [None]:
# Checking data types

model_data.info()

### Visuals

#### Functions

In [None]:
def plot_text_length_dist(text_list, tick_step = 1500):
    """Plot a histogram of text lengths.

    Parameters
    ----------
    text_list : iterable of numbers
        One length per text.
    tick_step : int, default 1500
        Spacing between the x-axis ticks.

    The x-axis ticks run from 0 up to the longest text instead of a
    fixed upper bound, so the plot adapts to whatever data is passed in.
    """
    # default = 0 keeps max() from raising on an empty input
    longest = max(text_list, default = 0)

    plt.figure(figsize = (18,6))
    # NOTE(review): distplot is deprecated in newer seaborn releases
    # (histplot is the replacement) - confirm the installed version
    # before switching
    sns.distplot(text_list, kde = False, color = "black",
                 bins = 60)
    plt.title("Distribution Of Text Lengths", size = 18)
    plt.xlabel("Length", size = 16)
    plt.ylabel("Frequency", size = 16)
    # Adding one step to the upper bound keeps the last tick at or
    # beyond the longest text
    plt.xticks(np.arange(0, longest + tick_step, tick_step), size = 14)
    plt.yticks(size = 14)
    plt.tight_layout()
    plt.show()

In [None]:
def plot_most_frequent_authors(df, col):
    """Draw a bar chart with one bar per row of `df`.

    Parameters
    ----------
    df  : dataframe whose index supplies the x-axis categories
    col : name of the column in `df` that holds the bar heights
    """
    axis_label_size = 16

    plt.figure(figsize = (20,6))
    sns.barplot(data = df,
                x    = df.index,
                y    = col)

    # Title and axis labels
    plt.title("Most Common Posters", size = 18)
    plt.xlabel("Reddit User", size = axis_label_size)
    plt.ylabel("Number Of Posts", size = axis_label_size)

    # Tick label sizes
    plt.xticks(size = 13)
    plt.yticks(size = 14)

#### Text Length

In [None]:
# Generating a list of text lengths
# len() is applied to each entry of the text column, so (assuming the
# entries are strings) these are character counts, not word counts

length_list = [len(text) for text in model_data["text"]]

plot_text_length_dist(length_list)

Most of the posts are relatively short (<2000 characters), but there are a few that are extremely long (>20,000 characters).  We expected that most posts would be less than a few thousand characters, which is true for the majority.

####  Most Frequent Authors

In [None]:
# Ten most frequent posters and their post counts
# The counts are explicitly named "author" and the index name is cleared:
# older pandas names the value_counts() result after the source column,
# newer pandas names it "count" and moves "author" to the index name,
# which would break the column lookup in the plot below

top_posters  = model_data["author"].value_counts().head(10)
author_count = top_posters.rename("author").rename_axis(None).to_frame()

plot_most_frequent_authors(df  = author_count, 
                           col = "author")

We did not really know what to expect when we plotted this graph, because it is generally the case that a few users post most frequently and most barely post at all.  We would have liked to look at the number of comments by each user in both subreddits as a measure of activity, but that is beyond the scope of this project.

#### Subreddit Of Origin

In [None]:
# NOTE(review): the tick labels are applied by position, so they are only
# correct if the first bar drawn is r/Cooking - confirm against the
# values stored in the source column
tick_labels = ["r/Cooking", "r/AskCulinary"]


plt.figure(figsize = (10,5))
# The column is passed as x by keyword: newer seaborn releases no longer
# treat the first positional argument as x
sns.countplot(x = model_data["source"])
plt.title("Post Origin", size = 18)
plt.xlabel("Source", size = 16)
plt.ylabel("Number Of Posts", size = 16)
plt.xticks(np.arange(0,2,1), 
           labels = tick_labels, 
           size = 14)
plt.yticks(size = 14);

We were a little surprised that there are more r/AskCulinary posts because we had roughly equal numbers of pulls from each subreddit.

#### Visualizing Most Common Words

Before we start modeling, we need to know what the most frequent words in each subreddit are, because it might be harder for our model to predict with those words in the dataframe.

We will subset the data frame into posts from r/Cooking and r/AskCulinary and use count vectorizer to determine the most frequent words.  We will also remove stop words from the outset.

In [None]:
def plot_most_frequent_words(dataframes, titles):
    """Draw one horizontal bar chart of word counts per dataframe.

    Parameters
    ----------
    dataframes : list of dataframes
        Each is indexed by word and holds the counts in a column
        labelled 0.
    titles : list of str
        titles[i] is the name shown in the title of the i-th chart.

    The charts are laid out two per row.  The grid has at least two rows
    (the original 2x2 layout) and grows when more than four dataframes
    are passed.
    """
    n_cols = 2
    # Ceiling division without importing math; never fewer than 2 rows
    n_rows = max(2, -(-len(dataframes) // n_cols))
    fig    = plt.figure(figsize = (24, 10 * n_rows))

    for d, dataframe in enumerate(dataframes):
        # Subplot positions are 1-based
        ax = fig.add_subplot(n_rows, n_cols, d + 1)
        sns.barplot(x       = 0,
                    y       = dataframe.index,
                    data    = dataframe,
                    palette = "deep",
                    ax      = ax)
        ax.set_title(f"Most Common Words From {titles[d]}", size = 20)
        ax.set_xlabel("Number Of Occurences", size = 18)
        ax.set_ylabel("Word", size = 18)
        ax.tick_params(axis = "both", labelsize = 16)

In [None]:
# Instantiating the count vectorizer
# NOTE(review): this default vectorizer is not referenced again in the
# notebook; the two vectorizers below are the ones that get fitted

vectorizer = CountVectorizer()

# One vectorizer per subreddit, each dropping sklearn's built-in
# English stop words

cvec_cooking     = CountVectorizer(stop_words = "english")
cvec_askculinary = CountVectorizer(stop_words = "english")

# Subsetting the dataframe
# target == 1 is r/Cooking and target == 0 is r/AskCulinary

cooking     = model_data[model_data["target"] == 1]
askculinary = model_data[model_data["target"] == 0]

# Fit-transforming the vectorizer
# Each vectorizer learns its vocabulary from its own subreddit only

vec_cooking     = cvec_cooking.fit_transform(cooking["text"])
vec_askculinary = cvec_askculinary.fit_transform(askculinary["text"])

In [None]:
# Saving the vectorized dfs to a new dataframe

def _vocabulary(fitted_vectorizer):
    # Newer scikit-learn exposes get_feature_names_out() and has removed
    # get_feature_names(); prefer the new name and fall back to the old
    if hasattr(fitted_vectorizer, "get_feature_names_out"):
        return fitted_vectorizer.get_feature_names_out()
    return fitted_vectorizer.get_feature_names()

cooking_vectorized = pd.DataFrame(vec_cooking.toarray(), 
                                  columns = _vocabulary(cvec_cooking))

askculinary_vectorized = pd.DataFrame(vec_askculinary.toarray(), 
                                      columns = _vocabulary(cvec_askculinary))

# Getting the 15 most frequent words from each
# Summing a column gives the total number of occurrences of that word

vectorized_cooking     = pd.DataFrame(cooking_vectorized.sum().sort_values(ascending = False).head(15))
vectorized_askculinary = pd.DataFrame(askculinary_vectorized.sum().sort_values(ascending = False).head(15))

# Plotting the most common words

plot_most_frequent_words(dataframes = [vectorized_cooking, vectorized_askculinary],
                         titles     = ["r/Cooking", "r/AskCulinary"])

We can see that there are a lot of words that occur in both subreddits.  We decided that because of that, we should create a list of customized stop words.  Furthermore, we noticed that we have to lemmatize or stem the text columns because there are multiple forms of the same word among the most frequent words, such as 'make' & 'making' or 'recipe' & 'recipes'.

In [None]:
# Downloading the default stopwords

nltk.download("stopwords");

# Adding our stopwords to the English set

new_stopwords = ["like", "just", "make", "cook",
                 "use", "chicken", "recipe", "sauce"]

# The corpus reader is imported under its own name here because the list
# built below is called `stopwords`; without this, re-running the cell
# would call .words() on the list and fail

from nltk.corpus import stopwords as stopwords_corpus

# Building the list in one expression (instead of extending in place)
# keeps the custom words from being appended again on every re-run

stopwords = stopwords_corpus.words('english') + new_stopwords

## Lemmatizing

We felt that lemmatizing is a better option than stemming because the lemma form of a word is more likely to result in an actual word of English than trying to find a word's stem: there are so many irregularities in English that it is not always easy to find the stem.

In [None]:
# Instantiating the lemmatizer and tokenizer
# The tokenizer keeps runs of word characters and drops everything else
# The lemmatizer looks words up in the WordNet corpus, so it is downloaded
# the same way the stopwords corpus is

nltk.download("wordnet");

lemmatizer = WordNetLemmatizer()
tokenizer  = RegexpTokenizer(r'\w+')

# Removing URLs before tokenizing
# The tokenizer splits on punctuation, so once a post is tokenized a URL
# is already scattered into separate words and can no longer be matched
# as a whole

url_free_text = model_data["text"].str.replace(r"http\S+", "", regex = True)

# Setting up the lemmatizer

lemmatized_posts = []

for post in url_free_text:
    tokens = tokenizer.tokenize(post)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_posts.append(" ".join(lemmas))
    
# Appending the lemmatized posts to the dataframe

model_data["lemmatized_text"] = lemmatized_posts

# Checking the head of the dataframe

model_data.head()

While checking the results from the cell above, we noticed that in `lemmatized_text` there are some URLs which need to be removed.  We used a regular expression to remove all URLs.

In [None]:
# The pattern is a raw string so that \S reaches the regex engine intact,
# and regex = True is explicit because newer pandas treats the pattern
# as a literal string by default
# NOTE(review): lemmatized_text is built from tokens split on non-word
# characters, so a URL is already broken into separate words at this
# point; this only removes words that start with "http" followed by
# more characters
model_data["lemmatized_text"] = model_data["lemmatized_text"].str.replace(r"http\S+", "", regex = True)

## Establishing The Baseline

A baseline in classification gives us an idea of how exactly the model is performing.  The baseline is simply the percentage of occurrences of our target in the data as a whole.  In this case it will be what percentage of posts are from r/Cooking.

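Posts from r/Cooking make up 41.44% of the data, so always guessing the majority class (r/AskCulinary) would be correct 58.56% of the time.  If our model has an accuracy of >58.56% we know that it is better than simply guessing the class of a post.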

In [None]:
round(model_data["target"].value_counts(normalize = True)*100, 2)

## Modeling

Now that our text is in the format we want, we can begin the process of modeling.

There are a few steps we have to do before we start running models: we have to define the X and y variables and run a train-test split on the data.

### Setting The X and y variables

In [None]:
X = model_data["lemmatized_text"]
y = model_data["target"]

### Running A Train-Test Split

A train-test split is important because it allows us to reserve a portion of our data for test so that the model does not see all data before predicting.  In this case we want to preserve the class split, so we will stratify the data to match the distribution of the classes.

In [None]:
# The stratify argument preserves the distribution of classes
# random_state fixes the shuffle so the same split is produced on
# every run
# No test_size is passed, so sklearn's default split size is used

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state = 42,
                                                    stratify     = y)

Each of the four models we will use will be gridsearched so that we can experiment with different combinations of hyperparameters (parameters we have to define).  Additionally, each model will be fit with a count vectorizer and a TFIDF (Term Frequency-Inverse Document Frequency) vectorizer.

### Evaluation Formulae

A confusion matrix allows us to look at how our model actually classified our data.  It plots the true y values and the predicted y values so that we can have an idea of how the model performs with each class.

In [None]:
# Creating a confusion matrix
# We convert the confusion matrix to a dataframe
# to make it easier to read

def create_confusion_matrix(y, y_preds):
    """Return the confusion matrix as a labelled dataframe.

    Parameters
    ----------
    y       : true class labels (0 = r/AskCulinary, 1 = r/Cooking)
    y_preds : predicted class labels

    Returns
    -------
    A 2x2 dataframe with the actual classes as rows and the predicted
    classes as columns.
    """
    # labels pins the row/column order to class 0 first, then class 1,
    # and guarantees a 2x2 result even if one class is missing
    cm     = confusion_matrix(y, y_preds, labels = [0, 1])
    # Class 0 is r/AskCulinary and class 1 is r/Cooking, so the
    # r/AskCulinary labels come first
    matrix = pd.DataFrame(cm, 
                          columns = ["Predicted r/AskCulinary", "Predicted r/Cooking"], 
                          index = ["Actual r/AskCulinary", "Actual r/Cooking"])
    return matrix

One of the evaluation metrics we want to use is specificity (the share of actual negatives that are correctly identified), which is not a function we can import from sklearn.  In order to calculate this score, we will create a function based off of the confusion matrices.

In [None]:
# Specificity is the true negative rate: TN / (TN + FP)
# The function counts the true negatives and divides them by
# the number of actual negatives (class 0)

def calc_specificity(y, y_hat):
    """Return the specificity (true negative rate) of binary predictions.

    Parameters
    ----------
    y     : true class labels, with 0 as the negative class
    y_hat : predicted class labels

    Returns
    -------
    float : TN / (TN + FP), or nan when `y` contains no negatives.
    """
    actual    = np.asarray(y)
    predicted = np.asarray(y_hat)

    # Every actual negative is either a true negative or a false positive,
    # so the count of actual negatives is TN + FP
    is_negative      = actual == 0
    actual_negatives = np.sum(is_negative)
    true_negatives   = np.sum(is_negative & (predicted == 0))

    # Specificity is undefined when there is nothing negative to find
    if actual_negatives == 0:
        return float("nan")

    return float(true_negatives / actual_negatives)

For each model we will calculate an ROC-AUC score.  The ROC (receiver operating characteristic) shows us a binary classification model's ability to distinguish between two classes.  The curve, which will be plotted for our best model, shows how well the two classes are separated at every classification threshold.  The AUC (area under the curve) is how we actually measure that separation: 0.5 means the model does no better than random guessing and 1.0 means the classes are separated perfectly.

This image from [GreyAtom](https://medium.com/greyatom/lets-learn-about-auc-roc-curve-4a94b4d88152) illustrates the AUC-ROC well:

<img src = "../Images/ROC_AUC 0.8 0.9.png" alt = "high auc_roc scores" height = "350" width = "350">

<img src = "../Images/ROC_AUC 0.5 0.7.png" alt = "low auc_roc scores" height = "350" width = "350">

Accuracy is not the most informative metric because we want to know how well the model is actually performing on both classes.  In order to do that we decided to look at four metrics in addition to general accuracy.  The four extra are:

- **Balanced Accuracy**: the average of the recall on each class


- **Precision**: how many of the predicted positives are actually correct


- **Sensitivity**: how many of the actual positives are correctly identified (also known as recall)


- **F1 Score**: a measure of accuracy that takes into account the precision and recall

In [None]:
# Generating the "classification" report

def generate_model_eval(y, y_hat, y_score = None):
    """Print the evaluation metrics for one set of predictions.

    Parameters
    ----------
    y       : true class labels
    y_hat   : predicted class labels
    y_score : optional scores or probabilities for the positive class.
              When given, the ROC-AUC is computed from them; otherwise it
              is computed from the hard labels in `y_hat`, as before.

    Returns
    -------
    None; the scores are printed, each rounded to five decimal places.
    """
    # ROC-AUC is only informative across thresholds when it gets scores;
    # with hard 0/1 predictions it falls back to the original behaviour
    auc_input = y_hat if y_score is None else y_score

    print(f"The accuracy score is         : {round(accuracy_score(y, y_hat), 5)}")
    print(f"The balanced accuracy score is: {round(balanced_accuracy_score(y, y_hat), 5)}")
    print(f"The specificity score is      : {round(calc_specificity(y, y_hat), 5)}")
    # Precision is one of the metrics described above but was never printed
    print(f"The precision score is        : {round(precision_score(y, y_hat), 5)}")
    print(f"The recall score is           : {round(recall_score(y, y_hat), 5)}")
    print(f"The F1 score is               : {round(f1_score(y, y_hat), 5)}")
    print(f"The ROC-AUC score is          : {round(roc_auc_score(y, auc_input), 5)}")

### Logistic Regression

The logistic regression is very similar to the linear regression, but it uses a logit function to bend the line so that it can predict either 0 or 1.


The gridsearch will be searching hyperparameters for the count vectorizer and the TFIDF, not the logistic regression.

#### Count Vectorizer

In [None]:
# Setting up the pipeline
# The model's best parameters are shown
# The pipeline vectorizes the text and then fits the logistic regression

cvec_lr_pipe = Pipeline([("cvec", CountVectorizer()), 
                         ("log_reg", LogisticRegression())])

# Setting the pipeline hyperparameters
# Each list holds a single value, so the grid search evaluates only
# this one combination
# NOTE: cvec_pipe_params is reassigned by the later count vectorizer cells

cvec_pipe_params = {"cvec__max_features": [125], 
                    "cvec__ngram_range" : [(1,2)], 
                    "cvec__stop_words"  : [None]}

# Instantiating the grid search
# cv = 5 runs five-fold cross-validation on the training data

cvec_lr_gs = GridSearchCV(cvec_lr_pipe, 
                          param_grid = cvec_pipe_params, 
                          cv         = 5)

# Fitting the training data to the pipeline model

cvec_lr_gs.fit(X_train, y_train);

In [None]:
# Generating training predictions

cvec_lr_train_preds = cvec_lr_gs.predict(X_train)

# Generating testing predictions

cvec_lr_preds       = cvec_lr_gs.predict(X_test) 

In [None]:
# Training metrics

generate_model_eval(y_train, cvec_lr_train_preds)

In [None]:
# Test metrics

generate_model_eval(y_test, cvec_lr_preds)

#### TFIDF Vectorization

In [None]:
# Setting up the pipeline
# The model's best parameters are shown
# The pipeline vectorizes the text and then fits the logistic regression

tvec_lr_pipe = Pipeline([("tvec", TfidfVectorizer()), 
                         ("log_reg", LogisticRegression())])

# Setting TFIDF pipe parameters
# Each list holds a single value, so the grid search evaluates only
# this one combination
# NOTE: tvec_pipe_params is reassigned by the later TFIDF cells

tvec_pipe_params = {"tvec__max_features": [650], 
                    "tvec__ngram_range" : [(1,1)], 
                    "tvec__stop_words"  : [None]}
                    
# Instantiating the grid search
# cv = 5 runs five-fold cross-validation on the training data

tvec_lr_gs = GridSearchCV(tvec_lr_pipe, 
                          param_grid = tvec_pipe_params, 
                          cv         = 5)

# Fitting the training data to the pipeline model

tvec_lr_gs.fit(X_train, y_train);

In [None]:
# Generating training predictions

tvec_lr_train_preds = tvec_lr_gs.predict(X_train)

# Generating testing predictions

tvec_lr_preds       = tvec_lr_gs.predict(X_test) 

In [None]:
# Training metrics

generate_model_eval(y_train, tvec_lr_train_preds)

In [None]:
# Test metrics

generate_model_eval(y_test, tvec_lr_preds)

### Support Vector Classifier

A support vector machine (in this case a classifier) is at its core a linear model.  However, instead of running like a logistic regression, it seeks to linearly separate the data.  To do that, it uses a kernel to raise the data into _n_-dimensional space.  It then uses a line (in 2 dimensions), a plane (in 3 dimensions), or a hyperplane (in more than 3 dimensions) to delineate the data.

#### Count Vectorizer

In [None]:
# Setting up the pipeline
# The model's best parameters are shown
# The pipeline vectorizes the text and then fits the support vector classifier

cvec_svc_pipe = Pipeline([("cvec", CountVectorizer()), 
                         ("svc", SVC())])

# Setting the pipeline hyperparameters
# The cvec__ keys configure the count vectorizer and the svc__ keys
# configure the classifier; each list holds a single value, so the
# grid search evaluates only this one combination

cvec_pipe_params = {"cvec__max_features": [319], 
                    "cvec__ngram_range" : [(1,2)], 
                    "cvec__stop_words"  : [None],
                    "svc__C"            : [1.0],
                    "svc__kernel"       : ["rbf"],
                    "svc__gamma"        : ["auto"]}
                    
# Instantiating the grid search
# cv = 5 runs five-fold cross-validation on the training data

cvec_svc_gs = GridSearchCV(cvec_svc_pipe, 
                           param_grid = cvec_pipe_params, 
                           cv         = 5)

# Fitting the training data to the pipeline model

cvec_svc_gs.fit(X_train, y_train);

In [None]:
# Generating training predictions

cvec_svc_train_preds = cvec_svc_gs.predict(X_train)

# Generating testing predictions

cvec_svc_preds       = cvec_svc_gs.predict(X_test) 

In [None]:
# Training metrics

generate_model_eval(y_train, cvec_svc_train_preds)

In [None]:
# Test metrics

generate_model_eval(y_test, cvec_svc_preds)

#### TFIDF Vectorizer

In [None]:
# Setting up the pipeline
# The model's best parameters are shown
# The pipeline vectorizes the text and then fits the support vector classifier

tvec_svc_pipe = Pipeline([("tvec", TfidfVectorizer()), 
                         ("svc", SVC())])

# Setting TFIDF pipe parameters
# The tvec__ keys configure the TFIDF vectorizer and the svc__ keys
# configure the classifier; each list holds a single value, so the
# grid search evaluates only this one combination
# NOTE(review): max_features = 1 gives the classifier a single term
# to work with - confirm this really was the best value found

tvec_pipe_params = {"tvec__max_features": [1], 
                    "tvec__ngram_range" : [(1,1)], 
                    "tvec__stop_words"  : [None],
                    "svc__C"            : [1.0],
                    "svc__kernel"       : ["rbf"],
                    "svc__gamma"        : ["auto"]}
                    
# Instantiating the grid search
# cv = 5 runs five-fold cross-validation on the training data

tvec_svc_gs = GridSearchCV(tvec_svc_pipe, 
                           param_grid = tvec_pipe_params, 
                           cv         = 5)

# Fitting the training data to the pipeline model

tvec_svc_gs.fit(X_train, y_train);

In [None]:
# Generating training predictions

tvec_svc_train_preds = tvec_svc_gs.predict(X_train)

# Generating testing predictions

tvec_svc_preds       = tvec_svc_gs.predict(X_test) 

In [None]:
# Training metrics

generate_model_eval(y_train, tvec_svc_train_preds)

In [None]:
# Test metrics

generate_model_eval(y_test, tvec_svc_preds)

### Random Forest Classifier

A random forest classifier is a decision tree based classification method.  However, it has advantages over other tree based models.  Firstly, it bootstraps the dataframe to have a random subset of the data, but it also takes a random subset of the features.  Having two levels of randomness in the model reduces the likelihood of the model being overfit on training data, and it also allows the model to be less prone to variance caused by a large number of features.

#### Count Vectorizer

In [None]:
# Creating the pipeline
# The model's best parameters are shown
# random_state = 42 makes the forest reproducible between runs

cvec_rf_pipe = Pipeline([("cvec", CountVectorizer()), 
                         ("rf", RandomForestClassifier(random_state = 42))])

# Setting the pipeline hyperparameters
# The cvec__ keys configure the count vectorizer and the rf__ keys
# configure the forest; each list holds a single value, so the
# grid search evaluates only this one combination

cvec_pipe_params = {"cvec__max_features"   : [1000], 
                    "cvec__ngram_range"    : [(1,1)], 
                    "cvec__stop_words"     : [None],
                    "rf__n_estimators"     : [72],
                    "rf__min_samples_split": [6],
                    "rf__min_samples_leaf" : [2],
                    "rf__max_depth"        : [20]}

# Instantiating the grid search
# cv = 5 runs five-fold cross-validation on the training data
# n_jobs = 6 runs up to six fits in parallel

cvec_rf_gs = GridSearchCV(cvec_rf_pipe, 
                          param_grid = cvec_pipe_params, 
                          cv         = 5,
                          n_jobs     = 6)

# Fitting the model to the training data

cvec_rf_gs.fit(X_train, y_train);

In [None]:
# Generating training predictions

cvec_rf_train_preds = cvec_rf_gs.predict(X_train)

# Generating testing predictions

cvec_rf_preds       = cvec_rf_gs.predict(X_test) 

In [None]:
# Training metrics

generate_model_eval(y_train, cvec_rf_train_preds)

In [None]:
# Test metrics

generate_model_eval(y_test, cvec_rf_preds)

#### TFIDF Vectorizer

In [None]:
# Creating the pipeline
# The model's best parameters are shown
# random_state = 42 makes the forest reproducible between runs

tvec_rf_pipe = Pipeline([("tvec", TfidfVectorizer()), 
                         ("rf", RandomForestClassifier(random_state = 42))])

# Setting the pipeline hyperparameters
# The tvec__ keys configure the TFIDF vectorizer and the rf__ keys
# configure the forest; each list holds a single value, so the
# grid search evaluates only this one combination

tvec_pipe_params = {"tvec__max_features"   : [250], 
                    "tvec__ngram_range"    : [(1,2)], 
                    "tvec__stop_words"     : [None],
                    "rf__n_estimators"     : [30],
                    "rf__min_samples_split": [6],
                    "rf__min_samples_leaf" : [2],
                    "rf__max_depth"        : [12]}

# Instantiating the grid search
# cv = 5 runs five-fold cross-validation on the training data
# n_jobs = 6 runs up to six fits in parallel

tvec_rf_gs = GridSearchCV(tvec_rf_pipe, 
                          param_grid = tvec_pipe_params, 
                          cv         = 5,
                          n_jobs     = 6)

# Fitting the model to the training data

tvec_rf_gs.fit(X_train, y_train);

In [None]:
# Generating training predictions

tvec_rf_train_preds = tvec_rf_gs.predict(X_train)

# Generating testing predictions

tvec_rf_preds       = tvec_rf_gs.predict(X_test) 

In [None]:
# Training metrics

generate_model_eval(y_train, tvec_rf_train_preds)

In [None]:
# Test metrics

generate_model_eval(y_test, tvec_rf_preds)

### XGBoost Classifier

XGBoost is a tree-based boosting model that iteratively fits tree models on the errors of the previous model and uses gradient descent to help minimize the loss function.  Furthermore, XGBoost is much more computationally efficient and, unlike other boosting models, can be parallelized.

#### Count Vectorizer

In [None]:
# Creating the pipeline
# The model's best parameters are shown
# early_stopping_rounds is not set: early stopping needs a validation
# set, and none is passed to fit() through the pipeline and grid search
# random_state = 42 matches the seed used for the other models

cvec_xgbc_pipe = Pipeline([("cvec", CountVectorizer()), 
                           ("xgbc", XGBClassifier(n_jobs       = 6,
                                                  random_state = 42))])

# Setting the pipeline hyperparameters
# The cvec__ keys configure the count vectorizer and the xgbc__ keys
# configure the booster; each list holds a single value, so the
# grid search evaluates only this one combination

cvec_pipe_params = {"cvec__max_features"   : [200], 
                    "cvec__ngram_range"    : [(1,3)], 
                    "cvec__stop_words"     : [None],
                    "xgbc__max_depth"      : [3],
                    "xgbc__learning_rate"  : [0.04],
                    "xgbc__n_estimators"   : [175],
                    "xgbc__gamma"          : [3.0]}

# Instantiating the grid search
# cv = 5 runs five-fold cross-validation on the training data

cvec_xgbc_gs = GridSearchCV(cvec_xgbc_pipe, 
                            param_grid = cvec_pipe_params, 
                            cv         = 5,
                            n_jobs     = 6)

# Fitting the model to the training data

cvec_xgbc_gs.fit(X_train, y_train);

In [None]:
# Generating training predictions

cvec_xgbc_train_preds = cvec_xgbc_gs.predict(X_train)

# Generating testing predictions

cvec_xgbc_preds       = cvec_xgbc_gs.predict(X_test) 

In [None]:
# Training metrics

generate_model_eval(y_train, cvec_xgbc_train_preds)

In [None]:
# Test metrics

generate_model_eval(y_test, cvec_xgbc_preds)

#### TFIDF Vectorizer

In [None]:
# Creating the pipeline
# The model's best parameters are shown
# early_stopping_rounds is not set: early stopping needs a validation
# set, and none is passed to fit() through the pipeline and grid search
# random_state replaces seed so the seed is named the same way as in
# the other models

tvec_xgbc_pipe = Pipeline([("tvec", TfidfVectorizer()), 
                           ("xgbc", XGBClassifier(n_jobs       = 6,
                                                  random_state = 42))])

# Setting the pipeline hyperparameters
# The tvec__ keys configure the TFIDF vectorizer and the xgbc__ keys
# configure the booster; each list holds a single value, so the
# grid search evaluates only this one combination
# stopwords is the customized stop word list built earlier

tvec_pipe_params = {"tvec__max_features"   : [525], 
                    "tvec__ngram_range"    : [(1,3)], 
                    "tvec__stop_words"     : [stopwords],
                    "xgbc__max_depth"      : [3],
                    "xgbc__learning_rate"  : [0.25],
                    "xgbc__n_estimators"   : [139],
                    "xgbc__gamma"          : [1.0]}

# Instantiating the grid search
# cv = 5 runs five-fold cross-validation on the training data

tvec_xgbc_gs = GridSearchCV(tvec_xgbc_pipe, 
                            param_grid = tvec_pipe_params, 
                            cv         = 5,
                            n_jobs     = 6)

# Fitting the model to the training data

tvec_xgbc_gs.fit(X_train, y_train);

In [None]:
# Generating training predictions

tvec_xgbc_train_preds = tvec_xgbc_gs.predict(X_train)

# Generating testing predictions

tvec_xgbc_preds       = tvec_xgbc_gs.predict(X_test) 

In [None]:
# Training metrics

generate_model_eval(y_train, tvec_xgbc_train_preds)

In [None]:
# Test metrics

generate_model_eval(y_test, tvec_xgbc_preds)

## Model Evaluation

### Dataframes

#### Prediction Dataframes

In [None]:
# Count vectorizer predictions
# Each list holds one score per model, in the order
# Log. Reg., SVC, Random Forest, XGBoost
# NOTE(review): these are hard-coded values, presumably copied from the
# test metric outputs above - copy them again whenever a model or a
# metric function changes

cvec_accuracy          = [0.62995, 0.65718, 0.65718, 0.64975]
cvec_balanced_accuracy = [0.59815, 0.61139, 0.60094, 0.60287]
cvec_specificity       = [0.46154, 0.41516, 0.32852, 0.38869]
cvec_sensitivity       = [0.41194, 0.34328, 0.27164, 0.32836]
cvec_f1_score          = [0.48, 0.45365, 0.39651, 0.43738]
cvec_rocauc_score      = [0.59815, 0.61139, 0.60094, 0.60287]

# TFIDF vectorizer predictions
# Same layout and model order as the count vectorizer lists

tvec_accuracy          = [0.67203, 0.60767, 0.64356, 0.64109]
tvec_balanced_accuracy = [0.6367, 0.55082, 0.58931, 0.61376]
tvec_specificity       = [0.5434, 0.23028, 0.31597, 0.52414]
tvec_sensitivity       = [0.42985, 0.21791, 0.27164, 0.45373]
tvec_f1_score          = [0.5208, 0.31533, 0.38723, 0.51178]
tvec_rocauc_score      = [0.6367, 0.55082, 0.58931, 0.61376]

In [None]:
# One row per model and one column per metric
# Each score list holds one value per model, so it maps directly
# onto a column and no transpose is needed

model_labels = ["Log. Reg.", "SVC", "Random Forest", "XGBoost"]

cvec_scores = pd.DataFrame({"Accuracy"          : cvec_accuracy,
                            "Balanced Accuracy" : cvec_balanced_accuracy,
                            "Specificity"       : cvec_specificity,
                            "Sensitivity"       : cvec_sensitivity,
                            "F1 Score"          : cvec_f1_score,
                            "ROC-AUC Score"     : cvec_rocauc_score},
                           index = model_labels)

tvec_scores = pd.DataFrame({"Accuracy"          : tvec_accuracy,
                            "Balanced Accuracy" : tvec_balanced_accuracy,
                            "Specificity"       : tvec_specificity,
                            "Sensitivity"       : tvec_sensitivity,
                            "F1 Score"          : tvec_f1_score,
                            "ROC-AUC Score"     : tvec_rocauc_score},
                           index = model_labels)

In [None]:
cvec_scores

In [None]:
tvec_scores