In [1]:
import nltk
import random
import string
import pandas as pd

from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from collections import defaultdict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import pickle

In [2]:
# Fetch the NLTK stop-word corpus; the recorded output below shows this is a
# no-op when the package is already present locally.
nltk.download('stopwords')
# Keep the English stop words in a set so get_tokens can do fast membership tests.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\weeck\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the labelled questions. Each line of the data file is expected to look
# like "<question text> (<class name>)"; only the classes listed below are kept.
target_classes = ('scikit-learn', 'Pandas Library', 'Docker', 'Tableau')

questions = []
classes = []
docs = []

# The context manager closes the file handle even if reading raises.
with open('classClassification.txt', 'r') as file:
    lines = file.readlines()

for line in lines:
    # Split on the LAST ' (' so a question that itself contains a parenthesis
    # keeps its full text and the trailing "(class)" is still found.
    elements = line.rsplit(' (', 1)
    if len(elements) != 2:
        # Blank or malformed line without a class suffix: skip it rather than
        # failing with an IndexError.
        continue

    question = elements[0]
    # Drop the newline / stray whitespace, then the closing parenthesis. This
    # also handles a final line that has no trailing newline.
    label = elements[1].strip()
    if label.endswith(')'):
        label = label[:-1]

    if label in target_classes:
        questions.append(question)
        classes.append(label)
        docs.append([question, label])

# Build the frame directly instead of binding the column mapping to the name
# `dict`, which would shadow the builtin for the rest of the notebook.
df = pd.DataFrame({'question': questions, 'class': classes})

df

Unnamed: 0,question,class
0,How can I import and manipulate large datasets...,Pandas Library
1,What is the best way to perform data preproces...,scikit-learn
2,How can I package my application and its depen...,Docker
3,Is there a tool that can help me create intera...,Tableau
4,Can I use Python to create visualizations and ...,Pandas Library
...,...,...
1700,How can I handle imbalanced classes in my data...,scikit-learn
1701,What is the most reliable way to deploy my app...,Docker
1702,How can I perform statistical analysis and hyp...,Pandas Library
1703,What is the best way to handle text data and p...,scikit-learn


In [4]:
def get_tokens(text, stop_word_set=None):
    """Split ``text`` on whitespace and drop stop words.

    Args:
        text: The string to tokenize. It is compared against the stop words
            as-is, so callers should lowercase/clean it first if needed.
        stop_word_set: Optional container of words to discard. When omitted,
            the notebook-level ``stop_words`` set is used.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    # split() with no argument collapses runs of whitespace, so removed
    # punctuation ("a - b" -> "a  b") does not produce empty-string tokens.
    return [token for token in text.split() if token not in stop_word_set]

# Cleans text: strips punctuation and lowercases it
def clean_text(text):
    """Return ``text`` with ASCII punctuation removed and letters lowercased.

    Characters listed in ``string.punctuation`` are dropped first; the
    remaining text is then lowercased.
    """
    without_punctuation = ''.join(
        char for char in text if char not in string.punctuation
    )
    return without_punctuation.lower()

def print_frequency_dist(docs):
    """Print the 20 most common tokens for each class label.

    Args:
        docs: Iterable of items where index 0 is the text and index 1 is its
            class label.

    Each text is cleaned with ``clean_text`` and tokenized with
    ``get_tokens``; tokens are pooled per label before counting.
    """
    tokens_by_label = defaultdict(list)

    for entry in docs:
        label = entry[1]
        entry_tokens = get_tokens(clean_text(entry[0]))
        tokens_by_label[label].extend(entry_tokens)

    # Labels are reported in the order they were first encountered.
    for label in tokens_by_label:
        print(label)
        distribution = FreqDist(tokens_by_label[label])
        print(distribution.most_common(20))

def get_splits(docs, train_fraction=0.80, seed=None):
    """Shuffle ``docs`` and split them into train and test portions.

    Args:
        docs: Sequence of items where index 0 is the text and index 1 is its
            label. The sequence itself is NOT modified; a copy is shuffled.
        train_fraction: Share of the documents placed in the training split
            (default 0.80).
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call is still honoured.

    Returns:
        A tuple ``(X_train, X_test, Y_train, Y_test)`` of lists.
    """
    # Shuffle a copy so the caller's list (e.g. the notebook-level `docs`)
    # keeps its order across repeated runs of the cell.
    shuffled = list(docs)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    pivot = int(train_fraction * len(shuffled))
    train_docs = shuffled[:pivot]
    test_docs = shuffled[pivot:]

    X_train = [doc[0] for doc in train_docs]
    Y_train = [doc[1] for doc in train_docs]
    X_test = [doc[0] for doc in test_docs]
    Y_test = [doc[1] for doc in test_docs]

    return X_train, X_test, Y_train, Y_test

def evaluate_classifier(title, classifier, vectorizer, X_test, Y_test):
    """Print micro-averaged precision, recall and F1 for a fitted classifier.

    Args:
        title: Label printed at the start of the result row.
        classifier: Fitted model exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        X_test: Raw texts to evaluate on.
        Y_test: True labels for ``X_test``.
    """
    features = vectorizer.transform(X_test)
    predicted = classifier.predict(features)

    scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    precision, recall, f1 = (
        scorer(Y_test, predicted, average='micro') for scorer in scorers
    )

    print("%s\t%f\t%f\t%f\n" % (title, precision, recall, f1))

def train_classifier(docs):
    """Train a Naive Bayes text classifier, report its scores, and save it.

    Args:
        docs: Sequence of ``[text, label]`` items, passed to ``get_splits``.

    Side effects:
        Prints one score row for the training split and one for the test
        split, then writes ``classifier.pkl`` and ``count_vectorizer.pkl``
        to the current working directory.
    """
    X_train, X_test, Y_train, Y_test = get_splits(docs)

    # Vectorizer: word n-grams of length 1-3, English stop words removed,
    # and terms seen in fewer than 3 training documents ignored.
    vectorizer = CountVectorizer(stop_words='english',
                                 ngram_range=(1,3),
                                 min_df=3,
                                 analyzer='word')
    # Fit the vocabulary on the training split only.
    dtm = vectorizer.fit_transform(X_train)

    # Classifier
    naive_bayes_classifier = MultinomialNB().fit(dtm, Y_train)

    evaluate_classifier("Naive Bayes\tTrain\t", naive_bayes_classifier, vectorizer, X_train, Y_train)
    # This row is for the held-out split, so it is labelled "Test" rather
    # than repeating "Train".
    evaluate_classifier("Naive Bayes\tTest\t", naive_bayes_classifier, vectorizer, X_test, Y_test)

    # Store classifier; the context manager flushes and closes the file.
    clf_filename = 'classifier.pkl'
    with open(clf_filename, 'wb') as clf_file:
        pickle.dump(naive_bayes_classifier, clf_file)

    # Store vectorizer
    vec_filename = 'count_vectorizer.pkl'
    with open(vec_filename, 'wb') as vec_file:
        pickle.dump(vectorizer, vec_file)

def print_prob(prediction, classes):
    """Print each class name with its probability on a single line.

    Args:
        prediction: Indexable of probabilities, aligned with ``classes``.
        classes: Sequence of class-name strings.

    Every entry is written as ``<name>: <percent with 3 decimals>`` followed
    by a tab; the line is terminated with a newline.
    """
    for position, label in enumerate(classes):
        print(label + ': ', end='')
        percentage = '{:.3%}'.format(prediction[position])
        print(percentage + '\t', end='')
    print()

def classify(text):
    """Print the predicted class probabilities for a single text.

    Loads the classifier and vectorizer from ``classifier.pkl`` and
    ``count_vectorizer.pkl`` in the current working directory, vectorizes
    ``text`` and prints one probability per class via ``print_prob``.

    Args:
        text: The raw question string to classify.
    """
    clf_filename = 'classifier.pkl'
    vec_filename = 'count_vectorizer.pkl'

    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # artifacts written by train_classifier, never files from untrusted sources.
    # The context managers close the file handles once loading finishes.
    with open(clf_filename, 'rb') as clf_file:
        nb_clf = pickle.load(clf_file)
    with open(vec_filename, 'rb') as vec_file:
        vectorizer = pickle.load(vec_file)

    # transform expects a collection of documents, hence the one-item list.
    pred = nb_clf.predict_proba(vectorizer.transform([text]))

    # One input text was passed, so only the first row is printed.
    print_prob(pred[0], nb_clf.classes_)


# Optional exploration: uncomment to print the 20 most common tokens per class.
#  print_frequency_dist(docs)
# Train the model, print its scores, and write the pickled classifier and vectorizer.
train_classifier(docs)

# Try the saved model on an arbitrary sentence; prints one probability per class.
classify('I want to get insight on my chair data he feh')


Naive Bayes	Train		0.958211	0.958211	0.958211

Naive Bayes	Train		0.947214	0.947214	0.947214

Docker: 9.625%	Pandas Library: 14.208%	Tableau: 66.131%	scikit-learn: 10.036%	
