In [26]:
import pandas 
import numpy as np
import re
import nltk.corpus 
import nltk.stem 
from sklearn import svm
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import KFold 
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import cross_val_score

In [36]:
categories=['rec.sport.baseball', 'talk.religion.misc']
data=fetch_20newsgroups(subset="all",
                        categories=categories,
                        remove=('headers','footers','quotes'),
                        shuffle=True,random_state=1)

data_test =fetch_20newsgroups(subset="test",
                               categories=categories,
                        remove=('headers','footers','quotes'),
                        shuffle=True,random_state=1)
data_train =fetch_20newsgroups(subset="train",
                                categories=categories,
                        remove=('headers','footers','quotes'),
                        shuffle=True,random_state=1)
print("number of doc: %d" % len(data.data))
print("number of categories: %d" %len(data.target_names))
categories=[data.target_names]
i = 0
for cat in range(len(data.target_names)):
    cat_name = data.target_names[cat]
    num_docs = len([d for d in data.target if d == i])
    print("Category %d (%s): %d docs" % (i, cat_name, num_docs))
    i += 1


number of doc: 1622
number of categories: 2
Category 0 (rec.sport.baseball): 994 docs
Category 1 (talk.religion.misc): 628 docs


In [37]:
data_pre = []

# A list of custom stopwords to remove
stopwords = set(nltk.corpus.stopwords.words('english'))

# Regex pattern for special characters

special_char_pattern = re.compile(r"[^\w']|_")

# Regex pattern for numbers
number_pattern = re.compile(r'\b\d+\b')

# Regex pattern to handle strings :

writes_pattern = re.compile(r'^.*writes:$')

# Loop through data do the preprocessing
for j in range(0, len(data.data)):
    lines = data.data[j].lower().split("\n")
    for i in range(0, len(lines)):

        lines[i] = number_pattern.sub(' ', lines[i])
        lines[i] = writes_pattern.sub(' ', lines[i])
        lines[i] = special_char_pattern.sub(' ', lines[i])

        # Remove short words
        lines[i] = ' '.join([w for w in lines[i].split() if len(w) > 2])

        # Remove stopwords
        lines[i] = ' '.join([w for w in lines[i].split() if w not in stopwords])

        # Stem the words
        lines[i] = ' '.join([nltk.stem.snowball.SnowballStemmer("english").stem(w) for w in lines[i].split()])

        # Remove extra spaces, just for beauty
        re.sub('\s\s+', " ", lines[i])

    pre = " ".join(lines)
    data_pre.append(pre)


In [38]:
data_pre

[u'unfortun  ',
 u' point realli thought written pro sdcn anti mediot poster bless certain talent sarcasm bite remark somebodi like instanc lurid overstat obvious intend humili origin poster   scale lift eye look like robert realli serious well  compar perform philli edit outstand run produc everi posit except yet finish frustrat sub level folk ever amount anyth neither squad imho  parallel previous year team year edit style brave oriol  greg mockingbird franklin interraci mix encompass lot lot f67709907 ccit arizona edu mingl race robohen',
 u"  legitim point basebal entertain quarrel peopl find certain style play entertain other regardless win valu person i'm huge fan slug bunt doubt high percentag play get big kick will live bad consequ exchang fun  cours claim galarraga inabl defer gratif hurt team paid walk rbi guy whatev   i'm sure use think true i'm becom convinc way around among player physic abil hit ball real hard patient one one get chanc lot  let break four basic categori h

In [39]:
tokens=[]
for pre in data_pre:
    [tokens.append(w) for w in pre.split()]

print(len(tokens))

num_unique_terms=len(set(tokens))
print(num_unique_terms)

text = nltk.Text(tokens)
fdist1 = nltk.probability.FreqDist(text)
print("Term frequencies:")
for w in fdist1.most_common(10):
    print("{:>3},{}".format(w[1], w[0]))

    

128611
11106
Term frequencies:
907,one
873,would
731,year
716,game
709,god
631,think
628,say
566,like
528,time
520,peopl


In [40]:
vectorizer=TfidfVectorizer(max_df=0.6,max_features=100000, 
             min_df=1,stop_words='english', use_idf=True)

vector=vectorizer.fit_transform(data_pre)
vector.shape

(1622, 10904)

In [7]:
from sklearn.linear_model import SGDClassifier
clf=SGDClassifier(loss='hinge', penalty='l2',
                                 alpha=1e-3, n_iter=5, random_state=50)
clf_svm=clf.fit(vector,data.target)



In [None]:
predict=clf_svm.predict(data_test.data)
np.mean(predict==data_test.target)

In [22]:
kf=KFold(n_splits=10)
for train,test in kf.split(vector):
    print([train,test])


[array([ 1132,  1133,  1134, ..., 11311, 11312, 11313]), array([   0,    1,    2, ..., 1129, 1130, 1131])]
[array([    0,     1,     2, ..., 11311, 11312, 11313]), array([1132, 1133, 1134, ..., 2261, 2262, 2263])]
[array([    0,     1,     2, ..., 11311, 11312, 11313]), array([2264, 2265, 2266, ..., 3393, 3394, 3395])]
[array([    0,     1,     2, ..., 11311, 11312, 11313]), array([3396, 3397, 3398, ..., 4525, 4526, 4527])]
[array([    0,     1,     2, ..., 11311, 11312, 11313]), array([4528, 4529, 4530, ..., 5656, 5657, 5658])]
[array([    0,     1,     2, ..., 11311, 11312, 11313]), array([5659, 5660, 5661, ..., 6787, 6788, 6789])]
[array([    0,     1,     2, ..., 11311, 11312, 11313]), array([6790, 6791, 6792, ..., 7918, 7919, 7920])]
[array([    0,     1,     2, ..., 11311, 11312, 11313]), array([7921, 7922, 7923, ..., 9049, 9050, 9051])]
[array([    0,     1,     2, ..., 11311, 11312, 11313]), array([ 9052,  9053,  9054, ..., 10180, 10181, 10182])]
[array([    0,     1,     2, ..

In [42]:
from sklearn.svm import SVC
clf_svc=SVC()
clf_svc.fit(vector,data.target)
pred=clf_svc.predict(data_test.data) 
np.mean(pred==data_test.target)


ValueError: could not convert string to float: Sounds like Darryl being Darryl, Tommy spending too much time on Slim Fast and needs a pasta fix, and the media being their usual "charming" selves.  Sounds like a New York-like story to me!!  :-)  I 

In [25]:
from sklearn.svm import SVC
# 10-fold cross-validation with Linear SVM model
k=10
clf_svc = SVC(kernel='rbf', gamma=1, C = 10)
print "SVM 10-Cross Validation Score:",
score=cross_val_score(clf_svc, vect, data.target, cv=k, scoring='accuracy')
score

SVM 10-Cross Validation Score:

NameError: name 'vect' is not defined

In [10]:
score.mean()

0.7553404417108619