In [1]:
import string
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import confusion_matrix

In [15]:
df = pd.read_csv("mies_ie.csv")
df.shape

(5460, 3)

In [3]:
def fn_stemmer(x):
    stemmer = PorterStemmer()
    def fn(y): return " ".join([ stemmer.stem(word) for word in word_tokenize(y) ])
    fn_v = np.vectorize(fn)
    return fn_v(x)

In [4]:
preprocess_pipe = Pipeline([
    ("col_trans", ColumnTransformer([
         ("text", Pipeline([
             ("stemmer", FunctionTransformer(fn_stemmer)),
             ("tfidf_vectorizer", TfidfVectorizer(
                encoding = "utf-8",
                lowercase = True,
                stop_words =  nltk.corpus.stopwords.words('english') + (string.punctuation).split(),
                ngram_range = (1,1),
                max_features = None,
                use_idf = True,
            ))   
         ]), "text"),
    ],
    remainder = "passthrough"))
])

In [5]:
X = df.drop("IE", axis = 1)
y = df["IE"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [11]:
X_train_prepared = preprocess_pipe.fit_transform(X_train)
X_test_prepared =  preprocess_pipe.transform(X_test)
X_train_prepared

array([[0.        , 0.        , 0.08161251, ..., 0.23973173, 0.        ,
        2.        ],
       [0.08943755, 0.        , 0.06703988, ..., 0.14769416, 0.        ,
        0.        ],
       [0.        , 0.        , 0.07432499, ..., 0.21832509, 0.        ,
        1.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.2065916 , 0.09211401,
        2.        ],
       [0.        , 0.        , 0.06795925, ..., 0.24953264, 0.08900827,
        1.        ],
       [0.        , 0.        , 0.05837264, ..., 0.34293211, 0.07645241,
        1.        ]])

In [10]:
#preprocess_pipe["col_trans"].transformers_[0][1]["tfidf_vectorizer"].get_feature_names()

In [13]:
clf = ComplementNB()
clf.fit(X_train_prepared, y_train)
clf.score(X_test_prepared, y_test)

0.3838827838827839

In [17]:
confusion_matrix(y_test, clf.predict(X_test_prepared))

array([[366, 168, 317],
       [ 63,  39,  64],
       [165,  64, 119]])