Arun Kalaeswaran Fantasy Football Chatbot Model
---

In [1]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
from spacy.lang.en import English
import en_core_web_sm
#nlp = en_core_web_sm.load()

In [2]:
# Loading CSV file
df = pd.read_csv ("chatbotdata2.csv", sep=",")

In [3]:
df.head()

Unnamed: 0,intent,class
0,What is fantasy football?,about
1,what's fantasy football?,about
2,How do you play fantasy football?,about
3,What are the rules regarding fantasy football?,about
4,what are the rules for the league,about


In [4]:
df.shape

(80, 2)

In [5]:
df.info() #class can not be an object it needs to be converted to integer to be used for y-labels

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 2 columns):
intent    80 non-null object
class     80 non-null object
dtypes: object(2)
memory usage: 1.3+ KB


In [6]:
obj_df = df.select_dtypes(include=['object']).copy() #creating a variable to place all the object types in order to manipulate
obj_df["class"].value_counts() #seeing how many examples of each class we have

tiebreaker    14
points        13
about         13
playoffs      11
ppr           11
draft          9
positions      9
Name: class, dtype: int64

In [7]:
#categorical label encoding , converting our y labels from words to integers
cleanup_nums = {"class": {"about": 2, "points": 3, "ppr": 4,
                                  "positions": 5, "draft":6, "playoffs":7, "tiebreaker":8 }}

In [8]:
obj_df.replace(cleanup_nums, inplace=True) #inputing the integers into the dataset
obj_df.head()

Unnamed: 0,intent,class
0,What is fantasy football?,2
1,what's fantasy football?,2
2,How do you play fantasy football?,2
3,What are the rules regarding fantasy football?,2
4,what are the rules for the league,2


In [9]:
obj_df #looking to see entire class column is converted to integers

Unnamed: 0,intent,class
0,What is fantasy football?,2
1,what's fantasy football?,2
2,How do you play fantasy football?,2
3,What are the rules regarding fantasy football?,2
4,what are the rules for the league,2
5,what are the league rules,2
6,what's the league rules,2
7,what are the rules,2
8,Can you explain fantasy football?,2
9,Can you explain what fantasy football is?,2


In [10]:
obj_df.info() #class shows int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 2 columns):
intent    80 non-null object
class     80 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.3+ KB


In [11]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Create our list of punctuation marks
punctuations = string.punctuation

# Create our list of stopwords
#nlp = spacy.load('en')
nlp = en_core_web_sm.load()
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load English tokenizer, parser, word vectors
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [12]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [13]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [14]:
from sklearn.model_selection import train_test_split

X = obj_df['intent'] # the features we want to analyze
ylabels = obj_df['class'] # the labels, or answers, we want to test against

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2) #splitting into train and test sets

In [15]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(multi_class='auto')

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x000001E354485E10>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x000001E34D4CB8C8>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_

In [16]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted,average='micro'))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted,average='micro'))

Logistic Regression Accuracy: 0.8125
Logistic Regression Precision: 0.8125
Logistic Regression Recall: 0.8125


In [17]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print(y_test, predicted)

23    3
72    8
17    3
61    7
63    7
54    5
66    8
45    6
20    3
4     2
52    5
71    8
36    4
75    8
12    2
26    4
Name: class, dtype: int64 [3 8 3 3 7 5 3 6 3 2 5 8 4 3 2 4]


In [18]:
#saving model as pickle file
import pickle
with open('classifier_pipe.dump', 'wb') as f:
    pickle.dump(pipe, f)