## TF-IDF - Term frequency - Inverse Document Frequency, i.e. tf-idf = tf * idf ##
TF-IDF enables us to gives us a way to associate each word in a document with a number that represents how relevant each word is in that document.
`
t is the term/ word
d is the document
D is the total number of documents
{ d ∈ D : t ∈ d } denotes the number of documents in which t occur

tf-idf = tf * idf

Term Frequency = count(t, d) i.e count of term t in document d
Normalized term frequency = count(t,d)/Total terms in that document.
Logarithmic Term Frequency = 1 + log10(count(t,d))
idf ( t, d ) = log ( D / { d ∈ D : t ∈ d })
`

In [8]:
# import modules
import pandas as pd #pandas to deal with tabular data
import numpy as np #numpy for number crunching
from sklearn import metrics #sklearn provides different ml models & methods to prepare training and test data
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import re
import gensim
import nltk
from nltk.corpus import stopwords
import joblib 

nltk.download('stopwords')
stops = set(stopwords.words("english"))


ImportError: cannot import name 'joblib' from 'sklearn.externals' (C:\Users\yadava\Miniconda3\lib\site-packages\sklearn\externals\__init__.py)

In [2]:
# load data from csv files
# Dataset used ImDb Movie Reviews Dataset https://www.kaggle.com/mantri7/imdb-movie-reviews-dataset?select=train_data+%281%29.csv
# Data shape 
# ['user-review', 'label(0-bad, 1-good)']
train_data_from_csv = pd.read_csv('train_data.csv')
test_data_from_csv = pd.read_csv('test_data.csv')


X_train = train_data_from_csv['0']
y_train = train_data_from_csv['1']
X_test = test_data_from_csv['0']
y_test = test_data_from_csv['1']

# preprocess the data
#convert to lower
X_train = X_train.apply(lambda review: review.lower())
X_test = X_test.apply(lambda review: review.lower()) 
# Removing non ASCII chars
X_train = X_train.apply(lambda review: re.sub(r'[^\x00-\x7f]',r' ',review))
X_test = X_test.apply(lambda review: re.sub(r'[^\x00-\x7f]',r' ',review))
# Strip multiple whitespaces
X_train = X_train.apply(lambda review: gensim.corpora.textcorpus.strip_multiple_whitespaces(review))
X_test = X_test.apply(lambda review: gensim.corpora.textcorpus.strip_multiple_whitespaces(review))
# Remove all the stopwords
X_train = X_train.apply(lambda review: " ".join([word for word in review.split() if word not in stops]))
X_test = X_test.apply(lambda review: " ".join([word for word in review.split() if word not in stops]))
# Removing all the tokens with lesser than 2 characters
X_train = X_train.apply(lambda review: " ".join(gensim.corpora.textcorpus.remove_short([word for word in review.split() if word not in stops], minsize=2)))
X_test = X_test.apply(lambda review: " ".join(gensim.corpora.textcorpus.remove_short([word for word in review.split() if word not in stops], minsize=2)))
# Remove the punctuation
X_train = X_train.apply(lambda review: gensim.parsing.preprocessing.strip_punctuation2(review))
X_test = X_test.apply(lambda review: gensim.parsing.preprocessing.strip_punctuation2(review))
# Strip all the numerics
X_train = X_train.apply(lambda review: gensim.parsing.preprocessing.strip_numeric(review))
X_test = X_test.apply(lambda review: gensim.parsing.preprocessing.strip_numeric(review))
# Stemming
X_train = X_train.apply(lambda review: gensim.parsing.preprocessing.stem_text(review))
X_test = X_test.apply(lambda review: gensim.parsing.preprocessing.stem_text(review))
print(X_train.head())


0    film absolut aw nevertheless hilari time altho...
1    well sinc see part s honestli sai never made p...
2    got see film preview dazzl it typic romant com...
3    adapt posit butcher classic belov subtleti tim...
4    zone aw movi simpl seem tri make movi show ree...
Name: 0, dtype: object


In [3]:
# extracting words as features from the training and testing sets and making corresponding feature matrices
tfidf_vectorizer = TfidfVectorizer(analyzer="word", stop_words="english", max_features=1200)
# extract all the unique words and transform is to make term frequency matrix
# we can fit and then trasform but using fit_transform we can do both the steps in single statement
# fit is to extract all the unique words i.e vocabulary
# transform is to make term frequency matrix of the data for all the unique terms extracted from fit part
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
# transform the test data into TF vectorized matrix note dont do fit on X_test again because we dont want do create a new vocabulary instead use
# the same vocabulary we extracted from training data
X_test_tf = tfidf_vectorizer.transform(X_test)

In [4]:
# build svm classification model
svmclf = svm.SVC()
# train model 
svmclf.fit(X_train_tf, y_train)

SVC()

In [6]:
# predict the output from testing data(unseen data)
y_pred = svmclf.predict(X_test_tf)
# find the accuracy of the model
score = metrics.accuracy_score(y_test, y_pred)
print('------------------------------')
print("accuracy:   %0.3f" % score)
print('------------------------------')
print(metrics.classification_report(y_test, y_pred, target_names=['Bad', 'Good']))
print('------------------------------')
print(svmclf.predict(tfidf_vectorizer.transform(["good"])))


------------------------------
accuracy:   0.863
------------------------------
              precision    recall  f1-score   support

         Bad       0.87      0.85      0.86     12500
        Good       0.86      0.87      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

------------------------------
[1]


In [7]:
# save model to file
joblib.dump(svmclf, 'svm_clf.joblib')
# load model from file
model_loded_from_file = joblib.load('svm_clf.joblib')
print(model_loded_from_file.predict(tfidf_vectorizer.transform(["good"])))


ImportError: cannot import name 'joblib' from 'sklearn.externals' (C:\Users\yadava\Miniconda3\lib\site-packages\sklearn\externals\__init__.py)