In [2]:
!pip install scipy
!pip install sklearn
!pip install nltk



In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.stem.porter import PorterStemmer
from sklearn.cluster import KMeans
import numpy as np

pd.options.display.max_columns = 30
%matplotlib inline

In [4]:
!tar -zxf 20_newsgroups.tar.gz

In [2]:
# glob finds files matching a certain filename pattern
import glob

# Give me all the text files
paths = glob.glob('20_newsgroups/*/*')
paths[:20]

['20_newsgroups/alt.atheism/0000000',
 '20_newsgroups/alt.atheism/0000001',
 '20_newsgroups/alt.atheism/0000002',
 '20_newsgroups/alt.atheism/0000003',
 '20_newsgroups/alt.atheism/0000004',
 '20_newsgroups/alt.atheism/0000005',
 '20_newsgroups/alt.atheism/0000006',
 '20_newsgroups/alt.atheism/0000007',
 '20_newsgroups/alt.atheism/0000008',
 '20_newsgroups/alt.atheism/0000009',
 '20_newsgroups/alt.atheism/0000010',
 '20_newsgroups/alt.atheism/0000011',
 '20_newsgroups/alt.atheism/0000012',
 '20_newsgroups/alt.atheism/0000013',
 '20_newsgroups/alt.atheism/0000014',
 '20_newsgroups/alt.atheism/0000015',
 '20_newsgroups/alt.atheism/0000016',
 '20_newsgroups/alt.atheism/0000017',
 '20_newsgroups/alt.atheism/0000018',
 '20_newsgroups/alt.atheism/0000019']

In [3]:
newsgroups = []
for folder in paths:
    with open(folder, encoding = "latin-1") as ng_file:
        ng = {
            'pathname': folder,
            'filename': folder.split('/')[-1],
            'group': folder.split('/')[-2],
            'content': ng_file.read()
        }
    newsgroups.append(ng)
newsgroups_df = pd.DataFrame(newsgroups)
newsgroups_df.head()

Unnamed: 0,content,filename,group,pathname
0,Archive-name: atheism/resources\nAlt-atheism-a...,0,alt.atheism,20_newsgroups/alt.atheism/0000000
1,Archive-name: atheism/introduction\nAlt-atheis...,1,alt.atheism,20_newsgroups/alt.atheism/0000001
2,In article <65974@mimsy.umd.edu>\nmangoe@cs.um...,2,alt.atheism,20_newsgroups/alt.atheism/0000002
3,dmn@kepler.unh.edu (...until kings become phil...,3,alt.atheism,20_newsgroups/alt.atheism/0000003
4,In article <N4HY.93Apr5120934@harder.ccr-p.ida...,4,alt.atheism,20_newsgroups/alt.atheism/0000004


## 2. Use the LabelEncoder to convert the group names to numeric labels 

In [4]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [5]:
le.fit(newsgroups_df['group'])

LabelEncoder()

In [6]:
le.transform(newsgroups_df['group'])

array([ 0,  0,  0, ..., 19, 19, 19])

In [7]:
newsgroups_df['newsgroups_label'] = le.transform(newsgroups_df['group'])
newsgroups_df.head(3)
newsgroups_df.tail(3)

Unnamed: 0,content,filename,group,pathname,newsgroups_label
20414,In article <1rc1f3INN7rl@emx.cc.utexas.edu> \n...,19994,talk.religion.misc,20_newsgroups/talk.religion.misc/0019994,19
20415,In article <1993Apr26.231845.13843@digi.lonest...,19995,talk.religion.misc,20_newsgroups/talk.religion.misc/0019995,19
20416,In article <C64H4w.BFH@darkside.osrhe.uoknor.e...,19996,talk.religion.misc,20_newsgroups/talk.religion.misc/0019996,19


## 3. Pick out 10 words or phrases to use as manually created features. Doing an 80/20 train/test split, how well does a Naive Bayes classifier do?

atheism
graphics
motorcycles
baseball
hockey
crypt
med
space
guns
mideast

In [8]:
newsgroups_df['has_atheism'] = newsgroups_df['content'].str.contains('atheism')
newsgroups_df['has_graphics'] = newsgroups_df['content'].str.contains('graphics')
newsgroups_df['has_motorcycles'] = newsgroups_df['content'].str.contains('motocycles')
newsgroups_df['has_baseball'] = newsgroups_df['content'].str.contains('baseball')
newsgroups_df['has_hockey'] = newsgroups_df['content'].str.contains('hockey')
newsgroups_df['has_crypt'] = newsgroups_df['content'].str.contains('crypt')
newsgroups_df['has_med'] = newsgroups_df['content'].str.contains('med')
newsgroups_df['has_space'] = newsgroups_df['content'].str.contains('space')
newsgroups_df['has_guns'] = newsgroups_df['content'].str.contains('guns')
newsgroups_df['has_mideast'] = newsgroups_df['content'].str.contains('mideast')

In [9]:
newsgroups_df.columns

Index(['content', 'filename', 'group', 'pathname', 'newsgroups_label',
       'has_atheism', 'has_graphics', 'has_motorcycles', 'has_baseball',
       'has_hockey', 'has_crypt', 'has_med', 'has_space', 'has_guns',
       'has_mideast'],
      dtype='object')

In [10]:
from sklearn.cross_validation import train_test_split

nbx_train, nbx_test, nby_train, nby_test = train_test_split(
    newsgroups_df[['has_atheism', 'has_graphics', 'has_motorcycles', 'has_baseball',
       'has_hockey', 'has_crypt', 'has_med', 'has_space', 'has_guns',
       'has_mideast']], # the first is our FEATURES
    newsgroups_df['newsgroups_label'], # the second parameter is the LABEL (0-16, southern us, brazilian, anything really)
    test_size=0.2) # 80% training, 20% testing

In [11]:
from sklearn import naive_bayes

clf = naive_bayes.BernoulliNB()
clf.fit(nbx_train, nby_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [12]:
clf.score(nbx_train, nby_train)

0.1730851650033674

In [13]:
clf.score(nbx_test, nby_test)

0.16307541625857003

## 4. Use a CountVectorizer to automatically create your list of features. Doing an 80/20 train/test split, how well can a Naive Bayes classifier do?

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
vectorizer = CountVectorizer()
#kernel dies without max features

In [16]:
vectorizer.fit(newsgroups_df['content'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [30]:
all_word_features = vectorizer.transform(newsgroups_df['content'])
all_word_features

<20417x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 1542471 stored elements in Compressed Sparse Row format>

In [31]:
x_train, x_test, y_train, y_test = train_test_split(
    all_word_features,
    newsgroups_df['content'], 
    test_size=0.2)

In [32]:
clf = naive_bayes.BernoulliNB()
clf.fit(x_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [33]:
clf.score(x_train, y_train)

0.1223290271230025

In [35]:
clf.score(x_test, y_test)

0.021302644466209598

In [36]:
# So Naive Bayes did better with the manually selected features.

## 5. PUSH THAT SCORE UP! You can adjust ngrams, max_features and any other options of the vectorizer, or try a decision tree or any other type of classifier.

bc I couldn't run the Count Vectorizer without max_features and ngrams don't make sense here, bc it is more about single words(?), I would like to try Random Forests

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
tree_clf = RandomForestClassifier()

In [45]:
tree_clf.fit(nbx_train, nby_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [46]:
print("Training score:", tree_clf.score(nbx_train, nby_train))

print("Testing score:", tree_clf.score(nbx_test, nby_test))

Training score: 0.174003551093
Testing score: 0.165034280118


In [None]:
#Same results for Naive Bayes and Random forest?

## 6. Write 15 sentences that, when run against the predictor, are put in 15 separate newsgroups (list the names of the newsgroups).

In [47]:
sentences = [
    "I believe atheism is also a religion",
    "I love graphics",
    "We rode a motorcycle on the weekend",
    "I never played or watched baseball",
    "Who likes hockey, really?!"
    "You would want to encrypt your emails",
    "I really love the pharma industry for all the medicine provided",
    "How does medical care in space work out?",
    "Do you have a collection of guns?",
    "What's your favourite country in the mideast?",
    "The US should pass a anti-gun law to control weapon sales",
    "You should teach how to crypt in schools",
    "Have you ever fired guns?",
    "These are really cool graphics. How did you make them?",
    "Mideast politics are really complicated."
]

In [69]:
sentences_words_features = vectorizer.transform(sentences)

In [70]:
sentences_words_features

<14x74 sparse matrix of type '<class 'numpy.int64'>'
	with 98 stored elements in Compressed Sparse Row format>

In [72]:
clf = naive_bayes.BernoulliNB()

In [77]:
clf.fit(vectorizer.transform(newsgroups_df['content']), newsgroups_df['newsgroups_label'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [78]:
predictions = clf.predict(sentences_words_features)
predictions

array([ 6,  6,  8,  6, 10,  6,  1,  6,  6, 16,  6,  6,  1,  6])

In [79]:
le.inverse_transform(predictions)

array(['misc.forsale', 'misc.forsale', 'rec.motorcycles', 'misc.forsale',
       'rec.sport.hockey', 'misc.forsale', 'comp.graphics', 'misc.forsale',
       'misc.forsale', 'talk.politics.guns', 'misc.forsale',
       'misc.forsale', 'comp.graphics', 'misc.forsale'], dtype=object)

In [None]:
# This is predicting badly, but it is also with the training data, so it is consistent in a way. 
# I'd assume that this is a case for improving training data, to aslo improve the resule 
# (ie go for word combinations instead of single words.)