In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import spacy
from joblib import dump, load

In [2]:
# Read the review data from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="movie_reviews.csv")

# Preview the first five rows
df.head(n=5)

Unnamed: 0,text,label
0,It's been about 14 years since Sharon Stone aw...,0
1,someone needed to make a car payment... this i...,0
2,The Guidelines state that a comment must conta...,0
3,This movie is a muddled mish-mash of clichés f...,0
4,Before Stan Laurel became the smaller half of ...,0


In [3]:
# Check class balance: number of rows per distinct value of the label column
df["label"].value_counts()

1    2514
0    2486
Name: label, dtype: int64

In [4]:
# Shell escape: download spaCy's English model through the `en` shortcut
# (the captured log reports it installs en_core_web_sm and links it as 'en').
!python -m spacy download en

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
✔ Linking successful
/Users/aajalloe/opt/anaconda3/lib/python3.8/site-packages/en_core_web_sm -->
/Users/aajalloe/opt/anaconda3/lib/python3.8/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [5]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Characters treated as punctuation when filtering tokens
punctuations = string.punctuation

# spaCy's English stop-word collection (same object as the STOP_WORDS import above)
stop_words = STOP_WORDS

# Pipeline loaded through the 'en' shortcut link.
# NOTE(review): `nlp` is not referenced by any other cell visible here.
nlp = spacy.load('en')

# Bare English language object (no pipeline components are added to it here);
# spacy_tokenizer calls it on raw text to obtain tokens.
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` and return its normalized, filtered tokens.

    Each token produced by the module-level ``parser`` is reduced to its
    lowercased, whitespace-stripped lemma. Tokens whose lemma is the
    ``"-PRON-"`` placeholder keep their lowercased surface form instead.
    Stop words, empty strings and tokens made up solely of punctuation
    characters are then discarded.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Run the module-level language object over the text to get token objects
    doc = parser(sentence)

    # Normalize: lowercased lemma, or the lowercased surface form for "-PRON-"
    normalized = [
        token.lower_ if token.lemma_ == "-PRON-" else token.lemma_.lower().strip()
        for token in doc
    ]

    # `punctuations` is a plain string, so testing `tok in punctuations` would be
    # a substring search: multi-character runs such as "..." or "!!" would slip
    # through. Checking character by character drops every all-punctuation token;
    # all() over an empty string is True, so empty tokens are dropped as well.
    return [
        tok
        for tok in normalized
        if tok not in stop_words
        and not all(ch in punctuations for ch in tok)
    ]

In [6]:
def clean_text(text):
    """Return ``text`` with leading/trailing whitespace removed, in lowercase."""
    trimmed = text.strip()
    return trimmed.lower()

In [7]:
# Normalize every review in place: trim surrounding whitespace and lowercase
df["text"] = df["text"].apply(clean_text)

In [8]:
# Tokenize each cleaned review and store the tokens as one space-separated string
df["tokenized"] = df["text"].map(lambda review: " ".join(spacy_tokenizer(review)))

In [9]:
# TF-IDF vectorizer with default settings; it becomes the 'vectorizer' step of the pipeline
tfidf_vector = TfidfVectorizer()

In [10]:
from sklearn.model_selection import train_test_split

# Model input is the space-joined token string; the target is the label column
X, ylabels = df['tokenized'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ylabels,
    test_size=0.2,
    random_state=72,
)

In [11]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier configured with the "lbfgs" solver
classifier = LogisticRegression(solver="lbfgs")

# Chain the TF-IDF vectorizer and the classifier into a single estimator
pipe = Pipeline(
    steps=[
        ('vectorizer', tfidf_vector),
        ('classifier', classifier),
    ]
)

# Fit both steps on the training split
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

In [12]:
from sklearn import metrics
# Predict labels for the held-out split
predicted = pipe.predict(X_test)

# Report each score of the predictions against the true test labels
for metric_label, metric_fn in (
    (" test Accuracy:", metrics.accuracy_score),
    (" Precision:", metrics.precision_score),
    (" Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, predicted))

 test Accuracy: 0.859
 Precision: 0.8644400785854617
 Recall: 0.859375


In [13]:
# Persist the fitted pipeline to disk with joblib
dump(value=pipe, filename='text_classifier.joblib')

['text_classifier.joblib']

In [14]:
# Reload the persisted pipeline from 'text_classifier.joblib'.
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
clf = load('text_classifier.joblib')

In [15]:
# Run the sample through the same cleaning and tokenizing steps applied to the training text
sample_tokens = spacy_tokenizer(clean_text("This was rubbish"))
s = " ".join(sample_tokens)

# Predict on a one-element Series holding the prepared sample
clf.predict(pd.Series([s]))

array([0])