In [1]:
import string
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer

from sklearn.pipeline import Pipeline

from sklearn.compose import ColumnTransformer 

from sklearn.preprocessing import FunctionTransformer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import ComplementNB

from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv("riasec_agrea.csv")
df.head()

Unnamed: 0,gender,text,agrea_q
0,1,Assemble electronic parts\nStudy the structure...,3
1,1,Lay brick or tile\nFix a broken faucet\nInstal...,3
2,1,Lay brick or tile\nAssemble electronic parts\n...,2
3,1,Test the quality of parts before shipment\nLay...,3
4,1,Work on an offshore oil-drilling rig\nOperate ...,3


In [3]:
def fn_stemmer(x):
    stemmer = PorterStemmer()
    def fn(y): return " ".join([ stemmer.stem(word) for word in word_tokenize(y) ])
    fn_v = np.vectorize(fn)
    return fn_v(x)

In [4]:
preprocess_pipe = Pipeline([
    ("col_trans", ColumnTransformer([
         ("text", Pipeline([
             ("stemmer", FunctionTransformer(fn_stemmer)),
             ("tfidf_vectorizer", TfidfVectorizer(
                encoding = "utf-8",
                lowercase = True,
                stop_words =  nltk.corpus.stopwords.words('english') + (string.punctuation).split(),
                ngram_range = (1,1),
                max_features = None,
                use_idf = True,
            ))   
         ]), "text"),
    ],
    remainder = "passthrough"))
])

In [5]:
X = df.drop("agrea_q", axis = 1)
y = df["agrea_q"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [6]:
X_train_prepared = preprocess_pipe.fit_transform(X_train)
X_test_prepared =  preprocess_pipe.transform(X_test)

In [30]:
#preprocess_pipe["col_trans"].transformers_[0][1]["tfidf_vectorizer"].get_feature_names()

In [31]:
clf = ComplementNB()
clf.fit(X_train_prepared, y_train)
clf.score(X_test_prepared, y_test)

0.5168353265869365

In [32]:
confusion_matrix(y_test, clf.predict(X_test_prepared))

array([[ 316,  428,  326],
       [ 474,  882, 1104],
       [ 904, 2016, 4420]])