## 3.3 - Binary Classification - predicting a label (out of two)

#### Get the data

`$ python -m nltk.downloader movie_reviews`

In [None]:
from nltk.corpus import movie_reviews as data

In [None]:
# Show the category labels the corpus reader exposes (used below as 'pos' / 'neg').
data.categories()

In [None]:
# List the identifiers of every file in the corpus, across all categories.
data.fileids()

In [None]:
# Peek at the unprocessed text of a single review to see what a document looks like.
data.raw('neg/cv029_19943.txt')

In [None]:
# Number of documents per class, as a (positive, negative) pair.
tuple(len(data.fileids(category)) for category in ('pos', 'neg'))

#### Preparing the data

In [None]:
# Build the documents and their labels together, one category at a time.
# The number of labels is taken from the same file list that produces the
# documents, so `corpus` and `target` stay aligned whatever the category sizes
# are (no hard-coded count). Order is unchanged: all 'pos' first, then 'neg'.
corpus = []
target = []
for label in ('pos', 'neg'):
    fileids = data.fileids(label)
    corpus += [data.raw(fileid) for fileid in fileids]
    target += [label] * len(fileids)  # e.g. ['pos', 'pos', ...], one per document

TF = Term Frequency

IDF = Inverse Document Frequency

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each raw review into a TF-IDF weighted bag-of-words vector.
vectorizer = TfidfVectorizer(
    min_df=5,    # integer: drop terms found in fewer than 5 documents
    max_df=0.8,  # float: drop terms found in more than 80% of the documents
)

# Learn the vocabulary / IDF weights and vectorize the corpus in one pass.
# NOTE(review): the vectorizer is fitted on the whole corpus before any
# train/test split, so its vocabulary and IDF weights also reflect documents
# that are later used for testing -- confirm this is acceptable here.
X = vectorizer.fit_transform(corpus)

In [None]:
X.shape

#### First Attempt at Classification

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing; the fixed seed makes the
# shuffled split repeatable between runs.
split = train_test_split(
    X,
    target,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, Y_train, Y_test = split

In [None]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier trained on the TF-IDF features.
# The seed is pinned (like the split and the cross-validation splitters in this
# notebook) so that the solver's internal shuffling does not make the scores
# drift from one run to the next.
classifier = LinearSVC(random_state=0)
classifier.fit(X_train, Y_train)

# Predicted labels for the held-out documents, compared against Y_test below.
Y_pred = classifier.predict(X_test)

#### Evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Report each metric macro-averaged, i.e. the unweighted mean of the
# per-class scores for the two labels.
macro_metrics = (
    ("Precision: {}", precision_score),
    ("Recall: {}", recall_score),
    ("F1-Score: {}", f1_score),
)
for template, metric in macro_metrics:
    print(template.format(metric(Y_test, Y_pred, average='macro')))

In [None]:
# Per-class precision / recall / F1 table, printed with four decimal places.
report = classification_report(Y_test, Y_pred, digits=4)
print(report)

#### Cross-validation

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Ten independent random 80/20 train/test splits (seeded for repeatability).
cv = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=0,
)

# Macro F1 for each of the ten splits.
f1_per_split = cross_val_score(classifier, X, target, scoring='f1_macro', cv=cv)
f1_per_split

In [None]:
# Average macro F1 over all splits produced by the current `cv` splitter.
f1_scores = cross_val_score(classifier, X, target, scoring='f1_macro', cv=cv)
f1_scores.mean()

In [None]:
# Average macro precision over all splits produced by the current `cv` splitter.
precision_scores = cross_val_score(classifier, X, target, scoring='precision_macro', cv=cv)
precision_scores.mean()

In [None]:
# Average macro recall over all splits produced by the current `cv` splitter.
recall_scores = cross_val_score(classifier, X, target, scoring='recall_macro', cv=cv)
recall_scores.mean()

In [None]:
from sklearn.model_selection import KFold

# Switch to 10-fold cross-validation: the shuffled data is cut into ten folds
# and each fold serves as the test set once. Rebinds `cv` for the cells below.
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

# Macro F1 for each of the ten folds.
f1_per_fold = cross_val_score(classifier, X, target, scoring='f1_macro', cv=cv)
f1_per_fold

In [None]:
# Average macro F1 over all folds produced by the current `cv` splitter.
fold_f1_scores = cross_val_score(classifier, X, target, scoring='f1_macro', cv=cv)
fold_f1_scores.mean()