In [25]:
# Package Imports

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import pandas as pd
import ast
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from imblearn.over_sampling import SMOTE

In [26]:
# Read in Data
df_reddit = pd.read_csv("/Users/lukestephens/Downloads/reddit_data.csv", engine='python', on_bad_lines='skip')
df_kaggle = pd.read_csv("/Users/lukestephens/Downloads/kaggle_data.csv", engine='python', on_bad_lines='skip')

In [3]:
# Create results dataframe for analysis
df_results = pd.DataFrame(columns=[
    'model', 
    'vectorizer', 
    'dataset', 
    'accuracy', 
    'precision', 
    'recall', 
    'f1_score'
])

First, we will examine model performance using both TF-IDF and CountVec vectorizers, in conjunction with Logistic Regression and Naive Bayes models. These tests will be performed both within datasets and across datasets for training and testing.

# LR Models

### TF-IDF

In [4]:
# Train & test on Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test, y_pred_lr))

df_results.loc[0] = ['LogisticRegression', 'TF-IDF', 'Reddit', accuracy, precision, recall, f1]



Reddit Accuracy: 0.31921143480104175
Reddit Precision: 0.3161585494559101
Reddit Recall: 0.31921143480104175
Reddit F1-Score: 0.27030098958675136
              precision    recall  f1-score   support

           0       0.44      0.02      0.03      4146
           1       0.28      0.04      0.07     19707
           2       0.34      0.01      0.02      8606
           3       0.30      0.13      0.18     38709
           4       0.00      0.00      0.00       515
           5       0.14      0.00      0.00      1530
           6       0.29      0.01      0.01       866
           7       0.45      0.01      0.02      2567
           8       0.29      0.20      0.24     38870
           9       0.28      0.15      0.20     35492
          10       0.31      0.34      0.33     71610
          11       0.33      0.68      0.44     90657
          12       0.29      0.00      0.00      1369
          13       0.51      0.02      0.05      2252
          14       0.53      0.01      0.01

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [5]:
# Train & test on Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['type']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Kaggle Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Kaggle Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Kaggle Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Kaggle F1-Score: {f1}")

print(classification_report(y_test, y_pred_lr))

df_results.loc[1] = ['LogisticRegression', 'TF-IDF', 'Kaggle', accuracy, precision, recall, f1]



Kaggle Accuracy: 0.23812710657921954
Kaggle Precision: 0.24464319590837802
Kaggle Recall: 0.23812710657921954
Kaggle F1-Score: 0.19192041644931035
              precision    recall  f1-score   support

           0       0.18      0.00      0.00      1831
           1       0.21      0.04      0.07      6380
           2       0.40      0.01      0.01      2192
           3       0.19      0.05      0.08      6585
           4       0.00      0.00      0.00       398
           5       0.50      0.00      0.00       442
           6       0.57      0.01      0.02       379
           7       1.00      0.00      0.00       874
           8       0.23      0.29      0.25     13902
           9       0.26      0.57      0.35     17383
          10       0.21      0.16      0.18     10380
          11       0.23      0.27      0.25     12339
          12       0.11      0.00      0.00      1546
          13       0.34      0.01      0.02      2472
          14       0.40      0.01      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [6]:
# Train on Reddit, test on Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_reddit['body'])

y_train = df_reddit['class']

X_test = vectorizer.transform(df_kaggle['posts'])

y_test = df_kaggle['type']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test_encoded, y_pred_lr))

df_results.loc[2] = ['LogisticRegression', 'TF-IDF', 'Train Reddit, Test Kaggle', accuracy, precision, recall, f1]



Reddit Accuracy: 0.18167504228368397
Reddit Precision: 0.19207675574327213
Reddit Recall: 0.18167504228368397
Reddit F1-Score: 0.13783694631461985
              precision    recall  f1-score   support

           0       0.05      0.00      0.00      9104
           1       0.18      0.02      0.04     32083
           2       0.09      0.00      0.00     10988
           3       0.13      0.07      0.09     33024
           4       0.00      0.00      0.00      1986
           5       0.00      0.00      0.00      2141
           6       0.00      0.00      0.00      1880
           7       0.03      0.00      0.00      4238
           8       0.25      0.11      0.15     69990
           9       0.32      0.12      0.17     86959
          10       0.15      0.26      0.19     51129
          11       0.17      0.65      0.27     61438
          12       0.00      0.00      0.00      7886
          13       0.02      0.00      0.00     12460
          14       0.07      0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [7]:
# Train on Kaggle, test on Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_kaggle['posts'])

y_train = df_kaggle['type']

X_test = vectorizer.transform(df_reddit['body'])

y_test = df_reddit['class']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test_encoded, y_pred_lr))

df_results.loc[3] = ['LogisticRegression', 'TF-IDF', 'Train Kaggle, Test Reddit', accuracy, precision, recall, f1]



Reddit Accuracy: 0.2092399006722791
Reddit Precision: 0.23223226863217858
Reddit Recall: 0.2092399006722791


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Reddit F1-Score: 0.1879682397877797


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

           0       0.04      0.00      0.00     20936
           1       0.13      0.03      0.05     97835
           2       0.10      0.00      0.00     43642
           3       0.21      0.05      0.08    194338
           4       0.00      0.00      0.00      2651
           5       0.00      0.00      0.00      7483
           6       0.00      0.00      0.00      4477
           7       0.00      0.00      0.00     12793
           8       0.17      0.30      0.22    194680
           9       0.14      0.54      0.22    176991
          10       0.29      0.14      0.18    358042
          11       0.34      0.29      0.31    452235
          12       0.06      0.00      0.01      7002
          13       0.03      0.00      0.00     11345
          14       0.02      0.00      0.00     16590
          15       0.11      0.01      0.02     50060

    accuracy                           0.21   1651100
   macro avg       0.10   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### CountVec

In [8]:
# Train & test on Reddit

vectorizer = CountVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range=(1, 1))

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test, y_pred_lr))

df_results.loc[4] = ['LogisticRegression', 'CountVec', 'Reddit', accuracy, precision, recall, f1]



Reddit Accuracy: 0.31497183695718006
Reddit Precision: 0.3212123432636192
Reddit Recall: 0.31497183695718006
Reddit F1-Score: 0.24238728284882308
              precision    recall  f1-score   support

           0       0.46      0.01      0.03      4146
           1       0.30      0.03      0.05     19707
           2       0.24      0.01      0.01      8606
           3       0.33      0.09      0.15     38709
           4       0.00      0.00      0.00       515
           5       0.08      0.00      0.00      1530
           6       0.25      0.01      0.02       866
           7       0.42      0.01      0.02      2567
           8       0.34      0.13      0.19     38870
           9       0.31      0.09      0.14     35492
          10       0.33      0.23      0.27     71610
          11       0.31      0.82      0.45     90657
          12       0.08      0.00      0.00      1369
          13       0.32      0.01      0.01      2252
          14       0.45      0.01      0.02

In [9]:
# Train & test on Kaggle

vectorizer = CountVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range=(1, 1))

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['type']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Kaggle Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Kaggle Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Kaggle Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Kaggle F1-Score: {f1}")

print(classification_report(y_test, y_pred_lr))

df_results.loc[5] = ['LogisticRegression', 'CountVec', 'Kaggle', accuracy, precision, recall, f1]



Kaggle Accuracy: 0.22932966671939453
Kaggle Precision: 0.2029003045259884
Kaggle Recall: 0.22932966671939453
Kaggle F1-Score: 0.1949479073086308
              precision    recall  f1-score   support

           0       0.11      0.01      0.02      1831
           1       0.18      0.08      0.11      6380
           2       0.11      0.02      0.03      2192
           3       0.17      0.07      0.10      6585
           4       0.07      0.01      0.01       398
           5       0.20      0.01      0.02       442
           6       0.22      0.02      0.04       379
           7       0.06      0.00      0.01       874
           8       0.23      0.26      0.25     13902
           9       0.25      0.53      0.34     17383
          10       0.21      0.16      0.18     10380
          11       0.22      0.25      0.23     12339
          12       0.13      0.02      0.03      1546
          13       0.13      0.02      0.03      2472
          14       0.17      0.03      0.05 

In [10]:
# Train on Reddit, test on Kaggle

vectorizer = CountVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range=(1, 1))

X_train = vectorizer.fit_transform(df_reddit['body'])

y_train = df_reddit['class']

X_test = vectorizer.transform(df_kaggle['posts'])

y_test = df_kaggle['type']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test_encoded, y_pred_lr))

df_results.loc[6] = ['LogisticRegression', 'CountVec', 'Train Reddit, Test Kaggle', accuracy, precision, recall, f1]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Reddit Accuracy: 0.16018641324848204
Reddit Precision: 0.21786613439555413
Reddit Recall: 0.16018641324848204
Reddit F1-Score: 0.07957339969964851
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      9104
           1       0.25      0.00      0.00     32083
           2       0.00      0.00      0.00     10988
           3       0.11      0.02      0.04     33024
           4       0.00      0.00      0.00      1986
           5       0.00      0.00      0.00      2141
           6       0.00      0.00      0.00      1880
           7       0.00      0.00      0.00      4238
           8       0.26      0.03      0.06     69990
           9       0.34      0.03      0.05     86959
          10       0.15      0.12      0.13     51129
          11       0.16      0.88      0.27     61438
          12       1.00      0.00      0.00      7886
          13       0.43      0.00      0.00     12460
          14       0.00      0.00      0.0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [11]:
# Train on Kaggle, test on Reddit

vectorizer = CountVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range=(1, 1))

X_train = vectorizer.fit_transform(df_kaggle['posts'])

y_train = df_kaggle['type']

X_test = vectorizer.transform(df_reddit['body'])

y_test = df_reddit['class']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test_encoded, y_pred_lr))

df_results.loc[7] = ['LogisticRegression', 'CountVec', 'Train Kaggle, Test Reddit', accuracy, precision, recall, f1]



Reddit Accuracy: 0.19724668402882928
Reddit Precision: 0.22562518996060346
Reddit Recall: 0.19724668402882928
Reddit F1-Score: 0.18629241920231834
              precision    recall  f1-score   support

           0       0.03      0.01      0.02     20936
           1       0.12      0.06      0.07     97835
           2       0.06      0.02      0.03     43642
           3       0.19      0.06      0.09    194338
           4       0.01      0.01      0.01      2651
           5       0.00      0.00      0.00      7483
           6       0.00      0.00      0.00      4477
           7       0.02      0.00      0.01     12793
           8       0.17      0.27      0.21    194680
           9       0.13      0.50      0.21    176991
          10       0.29      0.14      0.19    358042
          11       0.34      0.25      0.29    452235
          12       0.02      0.01      0.02      7002
          13       0.02      0.01      0.02     11345
          14       0.02      0.01      0.0

# NB Models

### TF-IDF

In [17]:
# Train & test on Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test, y_pred))

df_results.loc[8] = ['NB', 'TF-IDF', 'Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.305784022772697


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Reddit Precision: 0.32478778734350283
Reddit Recall: 0.305784022772697
Reddit F1-Score: 0.21558668068831335


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

        ENFJ       0.53      0.00      0.00      4146
        ENFP       0.39      0.00      0.01     19707
        ENTJ       0.36      0.00      0.00      8606
        ENTP       0.33      0.04      0.08     38709
        ESFJ       0.00      0.00      0.00       515
        ESFP       0.10      0.00      0.00      1530
        ESTJ       0.00      0.00      0.00       866
        ESTP       0.00      0.00      0.00      2567
        INFJ       0.34      0.09      0.14     38870
        INFP       0.34      0.06      0.11     35492
        INTJ       0.32      0.21      0.25     71610
        INTP       0.30      0.86      0.45     90657
        ISFJ       0.00      0.00      0.00      1369
        ISFP       0.00      0.00      0.00      2252
        ISTJ       0.25      0.00      0.00      3300
        ISTP       0.49      0.00      0.00     10024

    accuracy                           0.31    330220
   macro avg       0.23   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [13]:
# Train & test on Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

model = MultinomialNB(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Kaggle Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"Kaggle Precision: {precision}")

recall = recall_score(y_test, y_pred, average='weighted')
print(f"Kaggle Recall: {recall}")

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Kaggle F1-Score: {f1}")

print(classification_report(y_test, y_pred))

df_results.loc[9] = ['NB', 'TF-IDF', 'Kaggle', accuracy, precision, recall, f1]

Kaggle Accuracy: 0.23700765365099838
Kaggle Precision: 0.23506736800666486


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Kaggle Recall: 0.23700765365099838
Kaggle F1-Score: 0.16263106017249235


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00      1821
        ENFP       0.29      0.01      0.02      6416
        ENTJ       0.17      0.00      0.00      2198
        ENTP       0.23      0.01      0.02      6605
        ESFJ       0.00      0.00      0.00       397
        ESFP       0.00      0.00      0.00       428
        ESTJ       0.00      0.00      0.00       376
        ESTP       0.00      0.00      0.00       848
        INFJ       0.23      0.20      0.21     13998
        INFP       0.24      0.77      0.36     17392
        INTJ       0.23      0.07      0.10     10226
        INTP       0.25      0.20      0.22     12288
        ISFJ       0.11      0.00      0.00      1577
        ISFP       0.22      0.00      0.00      2492
        ISTJ       0.50      0.00      0.00      1925
        ISTP       0.35      0.00      0.01      3196

    accuracy                           0.24     82183
   macro avg       0.18   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [14]:
# Train on Reddit, test on Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_reddit['body'])

y_train = df_reddit['class']

X_test = vectorizer.transform(df_kaggle['posts'])

y_test = df_kaggle['type']

model = MultinomialNB(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test, y_pred))

df_results.loc[10] = ['NB', 'TF-IDF', 'Train Reddit, Test Kaggle', accuracy, precision, recall, f1]

Reddit Accuracy: 0.1647859046274777


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Reddit Precision: 0.20222540832838592
Reddit Recall: 0.1647859046274777
Reddit F1-Score: 0.09098147775787258


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00      9104
        ENFP       0.21      0.00      0.00     32083
        ENTJ       0.12      0.00      0.00     10988
        ENTP       0.14      0.02      0.04     33024
        ESFJ       0.00      0.00      0.00      1986
        ESFP       0.00      0.00      0.00      2141
        ESTJ       0.00      0.00      0.00      1880
        ESTP       0.00      0.00      0.00      4238
        INFJ       0.27      0.04      0.07     69990
        INFP       0.34      0.05      0.08     86959
        INTJ       0.15      0.14      0.14     51129
        INTP       0.16      0.86      0.27     61438
        ISFJ       0.00      0.00      0.00      7886
        ISFP       0.00      0.00      0.00     12460
        ISTJ       0.00      0.00      0.00      9628
        ISTP       0.29      0.00      0.00     15981

    accuracy                           0.16    410915
   macro avg       0.10   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [15]:
# Train on Kaggle, test on Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_kaggle['posts'])

y_train = df_kaggle['type']

X_test = vectorizer.transform(df_reddit['body'])

y_test = df_reddit['class']

model = MultinomialNB(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test, y_pred))

df_results.loc[11] = ['NB', 'TF-IDF', 'Train Kaggle, Test Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.18282296650717703
Reddit Precision: 0.23696265084964713
Reddit Recall: 0.18282296650717703
Reddit F1-Score: 0.14810504664239446
              precision    recall  f1-score   support

        ENFJ       0.00      0.00      0.00     20936
        ENFP       0.11      0.00      0.00     97835
        ENTJ       0.09      0.00      0.00     43642
        ENTP       0.24      0.01      0.02    194338
        ESFJ       0.00      0.00      0.00      2651
        ESFP       0.00      0.00      0.00      7483
        ESTJ       0.00      0.00      0.00      4477
        ESTP       0.00      0.00      0.00     12793
        INFJ       0.17      0.20      0.18    194680
        INFP       0.13      0.73      0.21    176991
        INTJ       0.31      0.06      0.10    358042
        INTP       0.35      0.24      0.29    452235
        ISFJ       0.00      0.00      0.00      7002
        ISFP       0.03      0.00      0.00     11345
        ISTJ       0.01      0.00      0.0

### CountVec

In [16]:
# Train & test on Reddit

vectorizer = CountVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range=(1, 1))

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test, y_pred))

df_results.loc[12] = ['NB', 'CountVec', 'Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.2863151838168494
Reddit Precision: 0.271354271619472
Reddit Recall: 0.2863151838168494
Reddit F1-Score: 0.26771256595900367
              precision    recall  f1-score   support

        ENFJ       0.12      0.07      0.09      4146
        ENFP       0.17      0.13      0.15     19707
        ENTJ       0.13      0.06      0.08      8606
        ENTP       0.26      0.16      0.20     38709
        ESFJ       0.02      0.02      0.02       515
        ESFP       0.05      0.03      0.04      1530
        ESTJ       0.04      0.11      0.06       866
        ESTP       0.06      0.09      0.07      2567
        INFJ       0.23      0.26      0.25     38870
        INFP       0.24      0.21      0.22     35492
        INTJ       0.33      0.23      0.27     71610
        INTP       0.35      0.55      0.42     90657
        ISFJ       0.04      0.02      0.03      1369
        ISFP       0.07      0.07      0.07      2252
        ISTJ       0.08      0.05      0.06   

In [17]:
# Train & test on Kaggle

vectorizer = CountVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range=(1, 1))

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Kaggle Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"Kaggle Precision: {precision}")

recall = recall_score(y_test, y_pred, average='weighted')
print(f"Kaggle Recall: {recall}")

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Kaggle F1-Score: {f1}")

print(classification_report(y_test, y_pred))

df_results.loc[13] = ['NB', 'CountVec', 'Kaggle', accuracy, precision, recall, f1]

Kaggle Accuracy: 0.22918365112006134
Kaggle Precision: 0.2008082517892558
Kaggle Recall: 0.22918365112006134
Kaggle F1-Score: 0.20163816502668266
              precision    recall  f1-score   support

        ENFJ       0.07      0.01      0.02      1831
        ENFP       0.18      0.10      0.13      6380
        ENTJ       0.10      0.03      0.05      2192
        ENTP       0.17      0.08      0.11      6585
        ESFJ       0.02      0.01      0.01       398
        ESFP       0.02      0.01      0.01       442
        ESTJ       0.04      0.02      0.02       379
        ESTP       0.03      0.01      0.01       874
        INFJ       0.23      0.26      0.25     13902
        INFP       0.26      0.50      0.34     17383
        INTJ       0.21      0.18      0.19     10380
        INTP       0.24      0.24      0.24     12339
        ISFJ       0.08      0.01      0.03      1546
        ISFP       0.09      0.02      0.04      2472
        ISTJ       0.12      0.04      0.06

In [18]:
# Train on Reddit, test on Kaggle

vectorizer = CountVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range=(1, 1))

X_train = vectorizer.fit_transform(df_reddit['body'])

y_train = df_reddit['class']

X_test = vectorizer.transform(df_kaggle['posts'])

y_test = df_kaggle['type']

model = MultinomialNB(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test, y_pred))

df_results.loc[14] = ['NB', 'CountVec', 'Train Reddit, Test Kaggle', accuracy, precision, recall, f1]

Reddit Accuracy: 0.19602107491817042
Reddit Precision: 0.18478044726203807
Reddit Recall: 0.19602107491817042
Reddit F1-Score: 0.17349041767447462
              precision    recall  f1-score   support

        ENFJ       0.03      0.01      0.01      9104
        ENFP       0.14      0.08      0.10     32083
        ENTJ       0.09      0.01      0.02     10988
        ENTP       0.13      0.10      0.11     33024
        ESFJ       0.00      0.00      0.00      1986
        ESFP       0.01      0.01      0.01      2141
        ESTJ       0.00      0.00      0.00      1880
        ESTP       0.02      0.01      0.01      4238
        INFJ       0.23      0.20      0.22     69990
        INFP       0.30      0.21      0.25     86959
        INTJ       0.17      0.21      0.18     51129
        INTP       0.19      0.50      0.27     61438
        ISFJ       0.07      0.01      0.01      7886
        ISFP       0.06      0.01      0.02     12460
        ISTJ       0.05      0.01      0.0

In [19]:
# Train on Kaggle, test on Reddit

vectorizer = CountVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range=(1, 1))

X_train = vectorizer.fit_transform(df_kaggle['posts'])

y_train = df_kaggle['type']

X_test = vectorizer.transform(df_reddit['body'])

y_test = df_reddit['class']

model = MultinomialNB(alpha=1.0)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test, y_pred))

df_results.loc[15] = ['NB', 'CountVec', 'Train Kaggle, Test Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.2048361698261765
Reddit Precision: 0.2306031875696996
Reddit Recall: 0.2048361698261765
Reddit F1-Score: 0.19788045322519768
              precision    recall  f1-score   support

        ENFJ       0.03      0.01      0.01     20936
        ENFP       0.12      0.06      0.08     97835
        ENTJ       0.06      0.04      0.04     43642
        ENTP       0.20      0.08      0.12    194338
        ESFJ       0.00      0.00      0.00      2651
        ESFP       0.00      0.00      0.00      7483
        ESTJ       0.00      0.00      0.00      4477
        ESTP       0.01      0.00      0.01     12793
        INFJ       0.19      0.25      0.21    194680
        INFP       0.14      0.48      0.22    176991
        INTJ       0.29      0.16      0.20    358042
        INTP       0.35      0.27      0.30    452235
        ISFJ       0.01      0.01      0.01      7002
        ISFP       0.02      0.02      0.02     11345
        ISTJ       0.02      0.01      0.02  

In [20]:
df_results.head(16)

Unnamed: 0,model,vectorizer,dataset,accuracy,precision,recall,f1_score
0,LogisticRegression,TF-IDF,Reddit,0.319211,0.316159,0.319211,0.270301
1,LogisticRegression,TF-IDF,Kaggle,0.238127,0.244643,0.238127,0.19192
2,LogisticRegression,TF-IDF,"Train Reddit, Test Kaggle",0.181675,0.192077,0.181675,0.137837
3,LogisticRegression,TF-IDF,"Train Kaggle, Test Reddit",0.20924,0.232232,0.20924,0.187968
4,LogisticRegression,CountVec,Reddit,0.314972,0.321212,0.314972,0.242387
5,LogisticRegression,CountVec,Kaggle,0.22933,0.2029,0.22933,0.194948
6,LogisticRegression,CountVec,"Train Reddit, Test Kaggle",0.160186,0.217866,0.160186,0.079573
7,LogisticRegression,CountVec,"Train Kaggle, Test Reddit",0.197247,0.225625,0.197247,0.186292
8,NB,TF-IDF,Reddit,0.305784,0.324788,0.305784,0.215587
9,NB,TF-IDF,Kaggle,0.237008,0.235067,0.237008,0.162631


Above we see the results of these tests. Analysis (which will be discussed in the writeup) leads to the conclusion that a Logistic Regression model using the TF-IDF vectorizer results in the best performing model.

### SVC Testing

In [27]:
vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

svc_basic = LinearSVC(
    C=1.0,         
    max_iter=1000,
    random_state=42,
    dual=False       
)

svc_basic.fit(X_train, y_train)

y_pred_basic = svc_basic.predict(X_test)
accuracy_basic = accuracy_score(y_test, y_pred_basic)

print(f"   Test accuracy: {accuracy_basic:.4f}")

   Test accuracy: 0.3156


In [28]:
vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

svc_balanced = LinearSVC(
    C=1.0,
    class_weight='balanced',
    max_iter=2000,
    loss='squared_hinge', 
    dual=False,
    random_state=42
)

svc_basic.fit(X_train, y_train)

y_pred_basic = svc_basic.predict(X_test)
accuracy_basic = accuracy_score(y_test, y_pred_basic)

print(f"   Test accuracy: {accuracy_basic:.4f}")

   Test accuracy: 0.3156


In [29]:
svc_calibrated = CalibratedClassifierCV(
    LinearSVC(C=1.0, max_iter=2000, dual=False, random_state=42),
    cv=3,
    method='sigmoid'
)

svc_calibrated.fit(X_train, y_train)

y_pred_calib = svc_calibrated.predict(X_test)
y_proba_calib = svc_calibrated.predict_proba(X_test)
accuracy_calib = accuracy_score(y_test, y_pred_calib)

print(f"   Test accuracy: {accuracy_calib:.4f}")

precision = precision_score(y_test, y_pred_calib, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_calib, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test,y_pred_calib, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test, y_pred_calib))

   Test accuracy: 0.3153
Reddit Precision: 0.32061663859426603
Reddit Recall: 0.31525043910120526
Reddit F1-Score: 0.2492733189916425
              precision    recall  f1-score   support

           0       0.37      0.02      0.03      4146
           1       0.29      0.02      0.03     19707
           2       0.41      0.01      0.01      8606
           3       0.33      0.09      0.14     38709
           4       0.00      0.00      0.00       515
           5       0.22      0.00      0.00      1530
           6       0.00      0.00      0.00       866
           7       0.26      0.00      0.01      2567
           8       0.32      0.14      0.20     38870
           9       0.32      0.10      0.16     35492
          10       0.31      0.31      0.31     71610
          11       0.32      0.76      0.45     90657
          12       0.00      0.00      0.00      1369
          13       0.55      0.02      0.05      2252
          14       0.40      0.00      0.00      3300
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### SMOTE Implementation

In [30]:
vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1),
)

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [31]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [32]:
svc_basic = LinearSVC(
    C=1.0,             
    max_iter=1000,
    random_state=42,
    dual=False       
)

svc_basic.fit(X_train, y_train)

y_pred_basic = svc_basic.predict(X_test)
accuracy_basic = accuracy_score(y_test, y_pred_basic)

print(f"   Test accuracy: {accuracy_basic:.4f}")

   Test accuracy: 0.1346


In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Run model with Reddit data

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['class']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

In [6]:
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [7]:
# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, multi_class='multinomial', solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

print(classification_report(y_test, y_pred_lr))



Reddit Accuracy: 0.16080188964932468
Reddit Precision: 0.2917230114265556
Reddit Recall: 0.16080188964932468
Reddit F1-Score: 0.19937225837230244
              precision    recall  f1-score   support

           0       0.05      0.19      0.07      4187
           1       0.14      0.12      0.13     19567
           2       0.08      0.13      0.10      8728
           3       0.26      0.13      0.18     38868
           4       0.01      0.18      0.01       530
           5       0.01      0.19      0.02      1497
           6       0.01      0.21      0.02       895
           7       0.03      0.17      0.05      2559
           8       0.28      0.17      0.21     38936
           9       0.26      0.16      0.20     35398
          10       0.36      0.18      0.24     71609
          11       0.40      0.18      0.24     90447
          12       0.01      0.14      0.02      1400
          13       0.02      0.16      0.04      2269
          14       0.03      0.14      0.05

### Hyperparameter Tuning

Next, hyperparameters for the Logistic Regression model will be tested to optimize model performance.

In [22]:
param_grid_lr = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'solver': ['lbfgs', 'newton-cg', 'saga'] 
}

grid_search_lr = GridSearchCV(
    LogisticRegression(penalty = 'l2', multi_class='multinomial', max_iter=1000),
    param_grid_lr,
    cv=3,                     
    scoring='accuracy',                 
    n_jobs=-1,
    verbose=2
)

grid_search_lr.fit(X_train, y_train)

print("\nBest Logistic Regression Parameters:")
print(grid_search_lr.best_params_)
print(f"Best Cross-Validation Score: {grid_search_lr.best_score_:.4f}")

Fitting 3 folds for each of 12 candidates, totalling 36 fits




[CV] END ...........................C=0.01, solver=newton-cg; total time=  54.5s
[CV] END ...........................C=0.01, solver=newton-cg; total time=  54.8s




[CV] END ...........................C=0.01, solver=newton-cg; total time=  55.5s




[CV] END ...............................C=0.01, solver=lbfgs; total time= 1.1min
[CV] END ...............................C=0.01, solver=lbfgs; total time= 1.1min




[CV] END ................................C=0.01, solver=saga; total time= 1.2min




[CV] END ................................C=0.01, solver=saga; total time= 1.2min




[CV] END ...............................C=0.01, solver=lbfgs; total time= 1.3min




[CV] END ................................C=0.01, solver=saga; total time= 1.3min




[CV] END .................................C=0.1, solver=saga; total time= 1.4min




[CV] END ............................C=0.1, solver=newton-cg; total time= 1.7min
[CV] END ............................C=0.1, solver=newton-cg; total time= 1.6min




[CV] END ............................C=0.1, solver=newton-cg; total time= 1.7min




[CV] END .................................C=0.1, solver=saga; total time= 1.3min




[CV] END .................................C=0.1, solver=saga; total time= 1.3min




[CV] END ................................C=0.1, solver=lbfgs; total time= 3.8min




[CV] END ................................C=0.1, solver=lbfgs; total time= 4.1min




[CV] END ................................C=0.1, solver=lbfgs; total time= 4.0min




[CV] END .................................C=1.0, solver=saga; total time= 1.5min




[CV] END .................................C=1.0, solver=saga; total time= 1.4min




[CV] END .................................C=1.0, solver=saga; total time= 1.3min




[CV] END ............................C=1.0, solver=newton-cg; total time= 7.0min




[CV] END ............................C=1.0, solver=newton-cg; total time= 7.5min




[CV] END ............................C=1.0, solver=newton-cg; total time= 7.7min




[CV] END ................................C=1.0, solver=lbfgs; total time= 9.8min




[CV] END ................................C=1.0, solver=lbfgs; total time=10.3min




[CV] END ................................C=1.0, solver=lbfgs; total time=10.8min




[CV] END ................................C=10.0, solver=saga; total time= 3.3min




[CV] END ................................C=10.0, solver=saga; total time= 3.2min
[CV] END ................................C=10.0, solver=saga; total time= 2.7min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...............................C=10.0, solver=lbfgs; total time=15.1min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...............................C=10.0, solver=lbfgs; total time=14.7min


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...............................C=10.0, solver=lbfgs; total time=13.1min
[CV] END ...........................C=10.0, solver=newton-cg; total time=13.1min
[CV] END ...........................C=10.0, solver=newton-cg; total time=11.7min
[CV] END ...........................C=10.0, solver=newton-cg; total time=13.1min





Best Logistic Regression Parameters:
{'C': 1.0, 'solver': 'saga'}
Best Cross-Validation Score: 0.3150


It is determined that a C of 1.0 and the saga solver (along with l2 regularization) are optimal for this model.

### Use the model on individual MBTI components

Next, the model will be trained and tested on the invidual MBTI components.

In [5]:
# Results df for analysis
df_results_breakdown = pd.DataFrame(columns=[
    'mbti component', 
    'dataset', 
    'accuracy', 
    'precision', 
    'recall', 
    'f1_score'
])

In [None]:
# I/E - Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['Introvert']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[0] = ['I/E', 'Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.599027920780086
Reddit Precision: 0.6841687550329978
Reddit Recall: 0.599027920780086
Reddit F1-Score: 0.6273626599232306


In [9]:
# S/N - Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['Sensing']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[1] = ['S/N', 'Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.677009266549573
Reddit Precision: 0.8865199103644574
Reddit Recall: 0.677009266549573
Reddit F1-Score: 0.7564363853236461


In [10]:
# T/F - Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['Thinking']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[2] = ['T/F', 'Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.6138120041184665
Reddit Precision: 0.6597719965851747
Reddit Recall: 0.6138120041184665
Reddit F1-Score: 0.6267227609866552


In [11]:
# J/P - Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_reddit['body'])

y = df_reddit['Judging']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[3] = ['J/P', 'Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.5593452849615408
Reddit Precision: 0.5774112516275652
Reddit Recall: 0.5593452849615408
Reddit F1-Score: 0.564316667715644


In [12]:
# I/E - Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['Introvert']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[4] = ['I/E', 'Kaggle', accuracy, precision, recall, f1]

Reddit Accuracy: 0.6368470364917319
Reddit Precision: 0.6604766674789874
Reddit Recall: 0.6368470364917319
Reddit F1-Score: 0.6475668067445254


In [13]:
# S/N - Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['Sensing']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[5] = ['S/N', 'Kaggle', accuracy, precision, recall, f1]

Reddit Accuracy: 0.7075672584354429
Reddit Precision: 0.7771824237799397
Reddit Recall: 0.7075672584354429
Reddit F1-Score: 0.7375172281660003


In [14]:
# T/F - Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['Thinking']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[6] = ['T/F', 'Kaggle', accuracy, precision, recall, f1]

Reddit Accuracy: 0.601328741953932
Reddit Precision: 0.6026284657936227
Reddit Recall: 0.601328741953932
Reddit F1-Score: 0.6017929442278941


In [15]:
# J/P - Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X = vectorizer.fit_transform(df_kaggle['posts'])

y = df_kaggle['Judging']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[7] = ['J/P', 'Kaggle', accuracy, precision, recall, f1]

Reddit Accuracy: 0.5494810362240367
Reddit Precision: 0.5514008383323892
Reddit Recall: 0.5494810362240367
Reddit F1-Score: 0.5503903692805392


In [16]:
# I/E - Train Reddit, Test Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_reddit['body'])

y_train = df_reddit['Introvert']

X_test = vectorizer.transform(df_kaggle['posts'])

y_test = df_kaggle['Introvert']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train_encoded = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[8] = ['I/E', 'Train Reddit, Test Kaggle', accuracy, precision, recall, f1]

Reddit Accuracy: 0.5556745312290863
Reddit Precision: 0.6672968489917573
Reddit Recall: 0.5556745312290863
Reddit F1-Score: 0.5896641872698957


In [17]:
# S/N - Train Reddit, Test Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_reddit['body'])

y_train = df_reddit['Sensing']

X_test = vectorizer.transform(df_kaggle['posts'])

y_test = df_kaggle['Sensing']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train_encoded = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[9] = ['S/N', 'Train Reddit, Test Kaggle', accuracy, precision, recall, f1]

Reddit Accuracy: 0.6323302872856917
Reddit Precision: 0.7779562836165688
Reddit Recall: 0.6323302872856917
Reddit F1-Score: 0.6862086782493854


In [18]:
# T/F - Train Reddit, Test Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_reddit['body'])

y_train = df_reddit['Thinking']

X_test = vectorizer.transform(df_kaggle['posts'])

y_test = df_kaggle['Thinking']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train_encoded = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[10] = ['T/F', 'Train Reddit, Test Kaggle', accuracy, precision, recall, f1]

Reddit Accuracy: 0.5900198337855761
Reddit Precision: 0.5885752932939503
Reddit Recall: 0.5900198337855761
Reddit F1-Score: 0.5889665418318212


In [19]:
# J/P - Train Reddit, Test Kaggle

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_reddit['body'])

y_train = df_reddit['Judging']

X_test = vectorizer.transform(df_kaggle['posts'])

y_test = df_kaggle['Judging']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train_encoded = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[11] = ['J/P', 'Train Reddit, Test Kaggle', accuracy, precision, recall, f1]

Reddit Accuracy: 0.5260236301911587
Reddit Precision: 0.5546418366287544
Reddit Recall: 0.5260236301911587
Reddit F1-Score: 0.5307723844692843


In [20]:
# I/E - Train Kaggle, Test Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_kaggle['posts'])

y_train = df_kaggle['Introvert']

X_test = vectorizer.transform(df_reddit['body'])

y_test = df_reddit['Introvert']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train_encoded = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[12] = ['I/E', 'Train Kaggle, Test Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.6094906426019018
Reddit Precision: 0.66141908480255
Reddit Recall: 0.6094906426019018
Reddit F1-Score: 0.6304107022469019


In [21]:
# S/N - Train Kaggle, Test Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_kaggle['posts'])

y_train = df_kaggle['Sensing']

X_test = vectorizer.transform(df_reddit['body'])

y_test = df_reddit['Sensing']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train_encoded = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[13] = ['S/N', 'Train Kaggle, Test Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.6843795045727091
Reddit Precision: 0.8780164352481126
Reddit Recall: 0.6843795045727091
Reddit F1-Score: 0.7610736966203913


In [22]:
# T/F - Train Kaggle, Test Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_kaggle['posts'])

y_train = df_kaggle['Thinking']

X_test = vectorizer.transform(df_reddit['body'])

y_test = df_reddit['Thinking']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train_encoded = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[14] = ['T/F', 'Train Kaggle, Test Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.5916449639634184
Reddit Precision: 0.6548358328825197
Reddit Recall: 0.5916449639634184
Reddit F1-Score: 0.6062334478899197


In [23]:
# J/P - Train Kaggle, Test Reddit

vectorizer = TfidfVectorizer(
    max_features=20000,
    min_df=10,
    ngram_range = (1,1)
)

X_train = vectorizer.fit_transform(df_kaggle['posts'])

y_train = df_kaggle['Judging']

X_test = vectorizer.transform(df_reddit['body'])

y_test = df_reddit['Judging']

# Encode classes as numerical values for logistic regression
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train, y_train_encoded = smote.fit_resample(X_train, y_train)

# saga solver is stochastic gradient descent model for multiclass classification
logistic_regression_model = LogisticRegression(penalty='l2', C=1.0, solver='saga', max_iter=1000)
logistic_regression_model.fit(X_train, y_train_encoded)

y_pred_lr = logistic_regression_model.predict(X_test)

accuracy = accuracy_score(y_test_encoded, y_pred_lr)
print(f"Reddit Accuracy: {accuracy}")

precision = precision_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Precision: {precision}")

recall = recall_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit Recall: {recall}")

f1 = f1_score(y_test_encoded, y_pred_lr, average='weighted')
print(f"Reddit F1-Score: {f1}")

df_results_breakdown.loc[15] = ['J/P', 'Train Kaggle, Test Reddit', accuracy, precision, recall, f1]

Reddit Accuracy: 0.5253915571437223
Reddit Precision: 0.5202757480914181
Reddit Recall: 0.5253915571437223
Reddit F1-Score: 0.5225550292741797


In [24]:
df_results_breakdown.head(16)

Unnamed: 0,mbti component,dataset,accuracy,precision,recall,f1_score
0,I/E,Reddit,0.599028,0.684169,0.599028,0.627363
1,S/N,Reddit,0.677009,0.88652,0.677009,0.756436
2,T/F,Reddit,0.613812,0.659772,0.613812,0.626723
3,J/P,Reddit,0.559345,0.577411,0.559345,0.564317
4,I/E,Kaggle,0.636847,0.660477,0.636847,0.647567
5,S/N,Kaggle,0.707567,0.777182,0.707567,0.737517
6,T/F,Kaggle,0.601329,0.602628,0.601329,0.601793
7,J/P,Kaggle,0.549481,0.551401,0.549481,0.55039
8,I/E,"Train Reddit, Test Kaggle",0.555675,0.667297,0.555675,0.589664
9,S/N,"Train Reddit, Test Kaggle",0.63233,0.777956,0.63233,0.686209


This df will be used in analysis in the writeup