### Import all the necessary packages. Sklearn is the most essential library used that provides feature extraction, training SVM and evaluating the performance of the model.

In [372]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import pandas as pd
import re

### Load the dataset. Pandas is a data analysis library that lets you read the csv file from disk and load it to the working memory for further processing.

In [373]:
data = pd.read_csv('text_classification_dataset.csv')
data = shuffle(data)

### Perform a little preprocessing of the text data.

In [374]:
contractions_mapping = {"i'm": 'i am',
 "i've": 'i have',
 "you're": 'you are',
 'dont': 'do not',
 "don't": 'do not',
 "can't": 'can not',
 'cant': 'can not',
 "what's": 'what is',
 'whats': 'what is',
 "how's": 'how is',
 'hows': 'how is',
 "\\'nt": 'not',
 '^\\w\\s': '',
 "\\'s": '',
 '\\n': ' '}

def regex_clean(doc):
    doc = doc.lower()
    for k,v in contractions_mapping.items():
        doc = re.sub(k,v,doc)
    return doc

In [375]:
data['reviews'] = data['reviews'].apply(lambda x:regex_clean(x))

### We are going to use Tfidf for feature extraction. TfIdf score is the score that calculates the relative importance for each term in the vocabulary. The idea behind tfidf score is that the words stop words like "the", "is", "a" etc are more frequent in the vocabulary, therefore these words should have lesser importance as compared to "awesome", "amazing", "awful" etc.

In [376]:
tfidf = TfidfVectorizer(analyzer='word')
tfidf.fit(data['reviews'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [386]:
word_list = ['the', 'and', 'not', 'good', 'bad', 'amazing', 'awful', 'awesome']
for word in word_list:
    print("idf value for '{}' is {}".format(word, tfidf.idf_[tfidf.vocabulary_[word]]))

idf value for 'the' is 1.8100142746060173
idf value for 'and' is 2.158695570864004
idf value for 'not' is 3.0278150805392454
idf value for 'good' is 3.6130732990880055
idf value for 'bad' is 4.495841338923517
idf value for 'amazing' is 5.451352783950954
idf value for 'awful' is 5.915658392082051
idf value for 'awesome' is 5.962178407716944


In [378]:
train, test = train_test_split(data, test_size=0.2)

In [379]:
X_train, Y_train = list(train['reviews']), list(train['labels'])
X_test, Y_test = list(test['reviews']), list(test['labels'])

In [380]:
tfidf_X_train = tfidf.transform(X_train)
tfidf_X_test = tfidf.transform(X_test)

In [381]:
clf = LinearSVC()
clf.fit(tfidf_X_train, Y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [382]:
preds = clf.predict(tfidf_X_test)
accuracy_score(Y_test, preds)

0.8066666666666666

In [384]:
df = pd.DataFrame({'sample':X_test, "label":Y_test, "predicition": preds})
df.sample(n=10)

Unnamed: 0,sample,label,predicition
109,my salad had a bland vinegrette on the baby gr...,0,0
431,they have great dinners.,1,1
9,have read other reviews here but i haven't had...,1,1
27,the story which was told so eloquently by fran...,1,1
421,the keyboard is really worthwhile in usefulnes...,1,1
409,it a shame to see good actors like thomerson a...,0,1
144,you'd have to have the iq of particularly stup...,0,0
274,"for about 10 minutes, we we're waiting for her...",0,0
271,the incredible soundtrack truly captures the e...,1,1
244,the attractive set used throughout most of the...,1,1


In [385]:
clf.predict(tfidf.transform(["The movie is bad"]))

array([0])

In [207]:
clf.predict(tfidf.transform(["The movie is not bad"]))

array([0])