In [13]:
import nltk
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import json
from joblib import dump

In [2]:
# Load the labelled utterances. Each top-level JSON item is read for two keys:
# "class" (the label) and "examples" (the texts carrying that label).
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead
# of relying on the platform's default text encoding.
with open("../data/aff_neg_data.json", encoding="utf-8") as f:
    content = json.load(f)

# Flatten to one [text, label] row per example.
examples = [
    [value, item["class"]]
    for item in content
    for value in item["examples"]
]

In [3]:
# Tabulate the flattened pairs: raw utterance text alongside its class label.
column_names = ["example", "label"]
df = pd.DataFrame(data=examples, columns=column_names)
df

Unnamed: 0,example,label
0,"Yes, thank you!",positive
1,"Yes, I do!",positive
2,Yes please.,positive
3,Yes of course.,positive
4,Yes.,positive
5,"No, thank you!",negative
6,I don't want anything else.,negative
7,"No, nothing else.",negative
8,No.,negative


In [4]:
# Learn the label -> integer mapping from the label column.
# `fit` returns the encoder itself, so it can be bound in one statement.
le = LabelEncoder().fit(df["label"])
le

LabelEncoder()

In [5]:
# Replace the string labels with the integer codes learned by `le`.
# This cell overwrites the column it reads, so a plain re-run would hand the
# already-encoded integers back to `le.transform`, which raises ValueError for
# labels it was not fitted on. Encode only while the column is still
# non-numeric so the cell can be executed more than once.
if not pd.api.types.is_numeric_dtype(df["label"]):
    df["label"] = le.transform(df["label"])
df

Unnamed: 0,example,label
0,"Yes, thank you!",1
1,"Yes, I do!",1
2,Yes please.,1
3,Yes of course.,1
4,Yes.,1
5,"No, thank you!",0
6,I don't want anything else.,0
7,"No, nothing else.",0
8,No.,0


In [6]:
# Learn the TF-IDF vocabulary from the utterances and build the
# document-term matrix in a single pass.
texts = df["example"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

In [7]:
# Shape of the TF-IDF matrix returned by `fit_transform` on df["example"].
X.shape

(9, 13)

In [8]:
# MultinomialNB accepts scipy sparse input directly, so the TF-IDF matrix is
# passed as-is instead of being densified with `.toarray()`.
classifier = MultinomialNB()
classifier.fit(X, df["label"])

MultinomialNB()

In [9]:
# Score every utterance one at a time; each entry of `preds` is the
# one-element array that `predict` returns for that single row.
# Note: these are the same rows the classifier was fitted on.
preds = [
    classifier.predict(vectorizer.transform([text]).toarray())
    for text in df["example"]
]

preds

[array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([1]),
 array([0]),
 array([0]),
 array([0]),
 array([0])]

In [10]:
# Fraction of rows whose predicted code matches the encoded label.
true_labels = df["label"]
acc_score = accuracy_score(true_labels, preds)
acc_score

1.0

In [11]:
# Persist the fitted classifier. Only `classifier` is written here; the fitted
# `vectorizer` it depends on is not part of this file.
model_path = "../models/aff_neg_clf.joblib"
written_files = dump(classifier, model_path)
written_files

['../models/aff_neg_clf.joblib']

In [12]:
# Spot-check on a sentence that is not one of the rows shown in the frame.
sample_text = "No, but I would like something else."
vector = vectorizer.transform([sample_text])
result = classifier.predict(vector.toarray())
result

array([0])

In [14]:
# Chain vectorisation and classification into one estimator so raw text can
# be passed straight to `fit` / `predict`.
steps = [("tfidf", TfidfVectorizer()), ("mnb", MultinomialNB())]
pipe = Pipeline(steps=steps)

In [15]:
# Train the whole pipeline on the raw utterances and the label column as it
# currently stands in `df`.
pipe.fit(X=df["example"], y=df["label"])

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('mnb', MultinomialNB())])

In [17]:
# Persist the fitted pipeline (vectorizer + classifier) with light compression.
# NOTE(review): ".plk" looks like a transposition of ".pkl" — confirm what any
# loader expects before renaming; the path is left unchanged here.
filepath = "../models/aff_neg_pipe.plk"
written_files = dump(pipe, filepath, compress=1)
written_files

['../models/aff_neg_pipe.plk']