# Text Classification

### 1. start with famous “20 Newsgoup” data set

In [1]:
# prepare training data

from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [2]:
# take a look at targets
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
twenty_train.data[0].split("\n")[:3]

["From: lerxst@wam.umd.edu (where's my thing)",
 'Subject: WHAT car is this!?',
 'Nntp-Posting-Host: rac3.wam.umd.edu']

In [4]:
# extract features from text files
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(11314, 130107)

In [5]:
# tfidf
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

In [6]:
# train Naive Bayes (NB) classifier
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [7]:
# optimization: use pipeline
from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [8]:
# test performance of NB Classifier
import numpy as np

twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.7738980350504514

In [9]:
# use SGD classifier
from sklearn.linear_model import SGDClassifier

text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', max_iter=10, alpha=1e-3, random_state=42))])

text_clf_svm = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
np.mean(predicted_svm == twenty_test.target)

0.8242166755177908

### 2. exploring how to extract features from tvtropes.clusters.txt

In [None]:
tmp = [
    'Professor Philip Brainard Flubber Robin Williams',
    'Professor Keenbean Richie Rich Michael McShane'
]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(tmp)

In [18]:
vectorizer.get_feature_names()

['brainard',
 'flubber',
 'keenbean',
 'mcshane',
 'michael',
 'philip',
 'professor',
 'rich',
 'richie',
 'robin',
 'williams']

In [19]:
X.toarray()

array([[1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1],
       [0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0]], dtype=int64)

### 3. pre-processing tvtropes.clusters.txt

From inspiration in above exploration, I get the idea of how to pre-process the txt file.

In [104]:
fname = "text_data.txt"
with open(fname) as f:
    lines = f.readlines()
lines = [x.strip() for x in lines]

In [105]:
lines[0]

'absent_minded_professor :  Professor Philip Brainard movie Flubber    Robin Williams'

In [106]:
lines[0].split(":")

['absent_minded_professor ',
 '  Professor Philip Brainard movie Flubber    Robin Williams']

In [107]:
features = []
target = []

for line in lines:
    words = line.split(":")
    target.append(words[0])
    features.append(words[1])
    
target = np.asarray(target)
features = np.asarray(features)

In [108]:
target[:10]

array(['absent_minded_professor ', 'absent_minded_professor ',
       'absent_minded_professor ', 'absent_minded_professor ',
       'absent_minded_professor ', 'adventurer_archaeologist ',
       'adventurer_archaeologist ', 'adventurer_archaeologist ',
       'adventurer_archaeologist ', 'arrogant_kungfu_guy '], dtype='<U28')

In [109]:
features[50]

'  Annie Newton movie The Invisible    Margarita Levieva'

In [116]:
len(target)

501

In [117]:
len(features)

501

In [118]:
x_train = []
x_test = []
y_train = []
y_test = []

for i in range(501):
    if i % 10 == 0:
        x_test.append(features[i])
        y_test.append(target[i])
    else:
        x_train.append(features[i])
        y_train.append(features[i])

In [119]:
# use SGD classifier
from sklearn.linear_model import SGDClassifier

text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', max_iter=10, alpha=1e-3, random_state=42))])

text_clf_svm = text_clf_svm.fit(x_train, y_train)

In [120]:
y_pred = text_clf_svm.predict(x_test)

In [123]:
y_pred[:5]

array(['  Professor Keenbean movie Richie Rich    Michael McShane',
       "  T. E. Lawrence movie Lawrence of Arabia   Peter O'Toole",
       '  Walter Burns movie His Girl Fray    Cary Grant',
       '  Nick Randall movie Wanted',
       '  Mia movie Stan Helsing    Desi Lydic'], dtype='<U89')

In [124]:
# due to small training dataset, the accuarcy is bad, if we have sufficient training dataset, the performance will improve.
y_test[:5]

['absent_minded_professor ',
 'arrogant_kungfu_guy ',
 'big_man_on_campus ',
 'bounty_hunter ',
 'brainless_beauty ']