# Model selection

In this notebook, we will benchmark different configurations (vectorization and classification) to find the best one to our use case.

## Libraries

In [36]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

## Data import

In [3]:
df = pd.read_csv('../mbti_data_preproc.csv')
df.head()

Unnamed: 0,type,posts_preproc_filtered
0,INFJ,"['http', 'youtub', 'watch', 'v', 'qsxhcwe3krw'..."
1,ENTP,"['find', 'lack', 'post', 'alarm', 'sex', 'bore..."
2,INTP,"['http', 'youtub', 'watch', 'v', 'fhigbolffgw'..."
3,INTJ,"['dear', 'intp', 'enjoy', 'convers', 'day', 'e..."
4,ENTJ,"['fire', 'anoth', 'silli', 'misconcept', 'appr..."


## Vectorization

We benchmark 2 methods:
- CountVectorizer
- TfidfVectorizer

In [29]:
corpus = df['posts_preproc_filtered']
y = df['type']
labels = y.unique()

c_vectorizer = CountVectorizer()
ti_vectorizer = TfidfVectorizer()

In [30]:
c_X = c_vectorizer.fit_transform(corpus)
ti_X = ti_vectorizer.fit_transform(corpus)

In [31]:
c_X_train, c_X_test, c_y_train, c_y_test = train_test_split(c_X, y, test_size=0.33, random_state=14)
ti_X_train, ti_X_test, ti_y_train, ti_y_test = train_test_split(ti_X, y, test_size=0.33, random_state=14)

## Prediction

**Configuration 1:** CountVectorizer + LinearSVC

In [32]:
c_clf = LinearSVC(max_iter=1000, random_state=14)
c_clf.fit(c_X_train, c_y_train)
c_y_pred = c_clf.predict(c_X_test)
print(classification_report(c_y_test, c_y_pred, target_names=labels))

              precision    recall  f1-score   support

        INFJ       0.43      0.18      0.26        66
        ENTP       0.54      0.49      0.51       210
        INTP       0.61      0.39      0.48        76
        INTJ       0.57      0.58      0.57       223
        ENTJ       0.17      0.11      0.13         9
        ENFJ       0.25      0.07      0.11        14
        INFP       0.50      0.21      0.30        14
        ENFP       0.63      0.38      0.47        32
        ISFP       0.57      0.62      0.59       456
        ISTP       0.66      0.73      0.69       627
        ISFJ       0.55      0.58      0.56       365
        ISTJ       0.62      0.67      0.64       462
        ESTP       0.53      0.50      0.51        52
        ESFP       0.45      0.34      0.39        79
        ESTJ       0.48      0.39      0.43        64
        ESFJ       0.58      0.56      0.57       114

    accuracy                           0.59      2863
   macro avg       0.51   

**Configuration 2:** TfidfVectorizer + LinearSVC

In [33]:
ti_clf = LinearSVC(random_state=14)
ti_clf.fit(ti_X_train, ti_y_train)
ti_y_pred = ti_clf.predict(ti_X_test)
print(classification_report(ti_y_test, ti_y_pred, target_names=labels))

              precision    recall  f1-score   support

        INFJ       0.67      0.21      0.32        66
        ENTP       0.65      0.56      0.60       210
        INTP       0.81      0.45      0.58        76
        INTJ       0.68      0.63      0.66       223
        ENTJ       0.20      0.11      0.14         9
        ENFJ       1.00      0.07      0.13        14
        INFP       0.50      0.14      0.22        14
        ENFP       0.73      0.34      0.47        32
        ISFP       0.64      0.68      0.66       456
        ISTP       0.68      0.85      0.76       627
        ISFJ       0.62      0.62      0.62       365
        ISTJ       0.68      0.76      0.72       462
        ESTP       0.69      0.56      0.62        52
        ESFP       0.62      0.38      0.47        79
        ESTJ       0.74      0.41      0.53        64
        ESFJ       0.70      0.67      0.68       114

    accuracy                           0.67      2863
   macro avg       0.66   

**Configuration 3:** CountVectorizer + MultinomialNB

In [34]:
c_clf = MultinomialNB()
c_clf.fit(c_X_train, c_y_train)
c_y_pred = c_clf.predict(c_X_test)
print(classification_report(c_y_test, c_y_pred, target_names=labels))

              precision    recall  f1-score   support

        INFJ       0.00      0.00      0.00        66
        ENTP       1.00      0.01      0.02       210
        INTP       0.00      0.00      0.00        76
        INTJ       0.40      0.01      0.02       223
        ENTJ       0.00      0.00      0.00         9
        ENFJ       0.00      0.00      0.00        14
        INFP       0.00      0.00      0.00        14
        ENFP       0.00      0.00      0.00        32
        ISFP       0.33      0.60      0.43       456
        ISTP       0.35      0.88      0.51       627
        ISFJ       0.64      0.15      0.25       365
        ISTJ       0.50      0.41      0.45       462
        ESTP       0.00      0.00      0.00        52
        ESFP       0.00      0.00      0.00        79
        ESTJ       0.00      0.00      0.00        64
        ESFJ       0.00      0.00      0.00       114

    accuracy                           0.38      2863
   macro avg       0.20   

  _warn_prf(average, modifier, msg_start, len(result))


**Configuration 4:** TfidfVectorizer + MultinomialNB

In [35]:
ti_clf = MultinomialNB()
ti_clf.fit(ti_X_train, ti_y_train)
ti_y_pred = ti_clf.predict(ti_X_test)
print(classification_report(ti_y_test, ti_y_pred, target_names=labels))

              precision    recall  f1-score   support

        INFJ       0.00      0.00      0.00        66
        ENTP       0.00      0.00      0.00       210
        INTP       0.00      0.00      0.00        76
        INTJ       0.00      0.00      0.00       223
        ENTJ       0.00      0.00      0.00         9
        ENFJ       0.00      0.00      0.00        14
        INFP       0.00      0.00      0.00        14
        ENFP       0.00      0.00      0.00        32
        ISFP       0.36      0.01      0.02       456
        ISTP       0.22      1.00      0.36       627
        ISFJ       0.00      0.00      0.00       365
        ISTJ       0.00      0.00      0.00       462
        ESTP       0.00      0.00      0.00        52
        ESFP       0.00      0.00      0.00        79
        ESTJ       0.00      0.00      0.00        64
        ESFJ       0.00      0.00      0.00       114

    accuracy                           0.22      2863
   macro avg       0.04   

  _warn_prf(average, modifier, msg_start, len(result))


**Configuration 5:** CountVectorizer + LogisticRegression

In [38]:
c_clf = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=14)
c_clf.fit(c_X_train, c_y_train)
c_y_pred = c_clf.predict(c_X_test)
print(classification_report(c_y_test, c_y_pred, target_names=labels))

              precision    recall  f1-score   support

        INFJ       0.43      0.23      0.30        66
        ENTP       0.57      0.51      0.54       210
        INTP       0.63      0.45      0.52        76
        INTJ       0.59      0.62      0.60       223
        ENTJ       0.11      0.11      0.11         9
        ENFJ       0.43      0.21      0.29        14
        INFP       0.50      0.29      0.36        14
        ENFP       0.61      0.44      0.51        32
        ISFP       0.60      0.63      0.61       456
        ISTP       0.68      0.75      0.72       627
        ISFJ       0.60      0.58      0.59       365
        ISTJ       0.64      0.69      0.66       462
        ESTP       0.56      0.56      0.56        52
        ESFP       0.48      0.37      0.42        79
        ESTJ       0.57      0.52      0.54        64
        ESFJ       0.65      0.65      0.65       114

    accuracy                           0.62      2863
   macro avg       0.54   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Configuration 6:** TfidfVectorizer + LogisticRegression

In [39]:
ti_clf = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=14)
ti_clf.fit(ti_X_train, ti_y_train)
ti_y_pred = ti_clf.predict(ti_X_test)
print(classification_report(ti_y_test, ti_y_pred, target_names=labels))

              precision    recall  f1-score   support

        INFJ       0.59      0.48      0.53        66
        ENTP       0.65      0.60      0.62       210
        INTP       0.58      0.59      0.59        76
        INTJ       0.68      0.67      0.67       223
        ENTJ       0.10      0.11      0.11         9
        ENFJ       0.18      0.21      0.19        14
        INFP       0.67      0.57      0.62        14
        ENFP       0.52      0.53      0.52        32
        ISFP       0.79      0.63      0.70       456
        ISTP       0.75      0.77      0.76       627
        ISFJ       0.66      0.65      0.66       365
        ISTJ       0.69      0.78      0.73       462
        ESTP       0.57      0.69      0.63        52
        ESFP       0.53      0.61      0.57        79
        ESTJ       0.64      0.67      0.66        64
        ESFJ       0.62      0.77      0.68       114

    accuracy                           0.69      2863
   macro avg       0.58   

In [40]:
ti_clf = LogisticRegression(class_weight='balanced', multi_class='ovr', max_iter=1000, random_state=14)
ti_clf.fit(ti_X_train, ti_y_train)
ti_y_pred = ti_clf.predict(ti_X_test)
print(classification_report(ti_y_test, ti_y_pred, target_names=labels))

              precision    recall  f1-score   support

        INFJ       0.67      0.36      0.47        66
        ENTP       0.65      0.59      0.62       210
        INTP       0.68      0.50      0.58        76
        INTJ       0.67      0.66      0.67       223
        ENTJ       0.12      0.11      0.12         9
        ENFJ       0.20      0.07      0.11        14
        INFP       0.71      0.36      0.48        14
        ENFP       0.62      0.41      0.49        32
        ISFP       0.73      0.67      0.70       456
        ISTP       0.72      0.82      0.77       627
        ISFJ       0.65      0.66      0.65       365
        ISTJ       0.69      0.80      0.74       462
        ESTP       0.63      0.56      0.59        52
        ESFP       0.63      0.48      0.55        79
        ESTJ       0.73      0.58      0.64        64
        ESFJ       0.68      0.71      0.70       114

    accuracy                           0.69      2863
   macro avg       0.61   