# Text Classification with SVM

In [1]:
from sklearn.datasets import load_files
# Root folder of the corpus; it holds one sub-folder per class.
movie_reviews_data_folder = 'movie_reviews/txt_sentoken'

# Load every review without shuffling, so documents stay in the loader's order.
dataset = load_files(
    container_path=movie_reviews_data_folder,
    shuffle=False,
)

Labels

In [2]:
# Show the class names found by the loader.
dataset.target_names

['neg', 'pos']

Files in the dataset (before the train/test split)

In [3]:
# Show the paths of the loaded review files.
dataset.filenames

array(['movie_reviews/txt_sentoken/neg/cv000_29416.txt',
       'movie_reviews/txt_sentoken/neg/cv001_19502.txt',
       'movie_reviews/txt_sentoken/neg/cv002_17424.txt', ...,
       'movie_reviews/txt_sentoken/pos/cv997_5046.txt',
       'movie_reviews/txt_sentoken/pos/cv998_14111.txt',
       'movie_reviews/txt_sentoken/pos/cv999_13106.txt'], 
      dtype='<U46')

In [4]:
# Number of documents that were loaded.
len(dataset.data)

2000

Split the data into a training set and a test set

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the reviews for testing. A fixed seed (rather than
# random_state=None) yields the same split on every run, so the downstream
# cross-validation scores and test metrics can be reproduced.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=42)

TASK: Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC, NuSVC

# your code here ...

In [9]:
# Two-step model: TF-IDF features feeding a linear SVM.
# min_df=3 discards terms that are too rare and max_df=0.95 discards terms
# that are too frequent, as the task above asks.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(max_df=0.95, min_df=3)),
    ('clf', LinearSVC(C=1000)),
])

TASK: Build a grid search to find out whether unigrams alone or unigrams combined with bigrams are more useful.

Fit the pipeline on the training set using grid search for the parameters

In [10]:
from sklearn.model_selection import GridSearchCV

# your code here ...

In [11]:
# Candidate settings: unigrams only, or unigrams together with bigrams.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)]}

# Cross-validated search over the candidates, run in parallel (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
)
grid_search.fit(docs_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

For every candidate explored by the grid search, print its parameter settings together with the mean and standard deviation of its cross-validated test score.

In [12]:
# One line per candidate: its parameters and the mean/std of its CV test score.
cv_results = grid_search.cv_results_
n_candidates = len(cv_results['params'])
candidate_rows = zip(cv_results['params'],
                     cv_results['mean_test_score'],
                     cv_results['std_test_score'])
for i, (cand_params, cand_mean, cand_std) in enumerate(candidate_rows):
    print(i, 'params - %s; mean - %0.2f; std - %0.2f'
             % (cand_params, cand_mean, cand_std))

0 params - {'vect__ngram_range': (1, 1)}; mean - 0.84; std - 0.01
1 params - {'vect__ngram_range': (1, 2)}; mean - 0.85; std - 0.02


Predict the outcome on the testing set and store it in a variable named y_predicted

In [13]:
# Predict labels for the held-out documents with the fitted search object
# (its repr above shows refit=True, i.e. the best candidate was refit).
y_predicted = grid_search.predict(docs_test)

#### Print the classification report

In [14]:
from sklearn import metrics
# Per-class precision, recall and F1 on the held-out documents.
print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_predicted,
    target_names=dataset.target_names,
))

             precision    recall  f1-score   support

        neg       0.87      0.88      0.87       243
        pos       0.89      0.87      0.88       257

avg / total       0.88      0.88      0.88       500



#### Print and plot the confusion matrix

In [15]:
# Rows correspond to the true classes and columns to the predicted classes.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_predicted)
print(cm)

[[214  29]
 [ 33 224]]
