In [1]:
import pandas as pd

df_imdb = pd.read_table('./labelled-files/imdb_labelled.txt')
df_amz = pd.read_table('./labelled-files/amazon_cells_labelled.txt')
frames = [df_imdb, df_amz]

In [2]:
for colname in frames:
    colname.columns = ["Message","Target"]
    print(colname.columns)

Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')


In [3]:
df = pd.concat(frames)

In [4]:
df.head()

Unnamed: 0,Message,Target
0,Not sure who was more lost - the flat characte...,0
1,Attempting artiness with black & white and cle...,0
2,Very little music or anything to speak of.,0
3,The best scene in the movie was when Gerardo i...,1
4,"The rest of the movie lacks art, charm, meanin...",0


In [5]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')


In [6]:
stopwords = list(STOP_WORDS)

In [7]:
df.head()

Unnamed: 0,Message,Target
0,Not sure who was more lost - the flat characte...,0
1,Attempting artiness with black & white and cle...,0
2,Very little music or anything to speak of.,0
3,The best scene in the movie was when Gerardo i...,1
4,"The rest of the movie lacks art, charm, meanin...",0


In [8]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer 
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.svm import LinearSVC 
from sklearn.pipeline import Pipeline 

CountVectorizer: This package is used to transform the texts in our dataset into numeric values that are in vectors. These numeric values can be accessed by the model more quickly than text.

TfidfVectorizer: is a statistical measure that evaluates how relevant a word is in a collection of documents.

If a word is common in a given document and common in other documents, it indicates that it has less power when making a prediction.

Conversely, if a word is unique in a document, it shows it has more power in classification and predictive analysis.

In [9]:
def remove_stopwords(list):
    for index, value in enumerate(list):
        doc = nlp(value)
        token = ""
        for word in doc:
            if word.is_stop == False and word.is_punct == False:
                 token = word.text + " " + token
        list[index] = token
    return pd.Series(list)
            
class CleanData(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = remove_stopwords(X.tolist())
        return X

In [10]:
# Vectorization
vectorizer = CountVectorizer() 
tfvectorizer = TfidfVectorizer()
classifier = LinearSVC()

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Features and Labels
X = df['Message']
y = df['Target']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [14]:
pipe = Pipeline([('cleaner', CleanData()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

In [15]:
pipe.fit(X_train,y_train)



In [16]:
# Accuracy
print("Accuracy: ",pipe.score(X_train,y_train))

Accuracy:  0.9900687547746372


In [17]:
pipe.predict(pd.Series(["I recommend this movie to watch, it's great"]))

array([1])

In [18]:
example = pd.Series(["I love this product so much",
 "What an inferior item! I will purchase a new one",
 "I feel happy when using your product!"])
       

In [19]:
pipe.predict(example)

array([1, 0, 1])