# Using a classic dataset: 20 Newsgroups

1. We use this standard dataset to build a classification model on 20 high-level topics.
2. We use it to predict the topic of each question (maybe the top 2 or 3 topics, certainly with a probability threshold).
3. We add this feature to the input data:
    * if both newsgroup tags are the same, the two questions talk about the same high-level subject.
    * if they are not, the questions don't talk about the same subject, so they are probably different.



In [1]:
# Make the project's helper package importable before the star-import below.
# NOTE(review): this is a hard-coded absolute path; the notebook only runs on
# a machine where the checkout lives at exactly this location.
import sys
sys.path.insert(0, r'/SAPDevelop/QuoraPairs/BruteForce/Tools')

# Import all our small tools (paths, cache, print, zip, excel, pandas, progress, ...).
# NOTE(review): every helper used unqualified in this notebook (print_alert,
# prepare_environnement, load_dataframe, print_section, save_global_dataframe,
# the CLEAN_* constants and the `pandas` name) presumably comes from this
# star-import -- verify against Tools.all.
from Tools.all import *

# Set up the name of our experiment;
# it will be used to store every result in a unique place.
EXPERIMENT='newsgroups'
# Do a bit of checks before actually running code.
# NOTE(review): UNITARY_TEST is not read anywhere in the visible notebook;
# presumably the Tools helpers consume it -- confirm.
UNITARY_TEST = True
print_alert('You will use environment %s' % EXPERIMENT)

# Prepare the experiment environment first, then load the two input frames
# and report how many lines each one contains.
prepare_environnement(EXPERIMENT)
train_dataframe=load_dataframe(CLEAN_TRAINING_DATA)
challenge_dataframe=load_dataframe(CLEAN_CHALLENGE_DATA)
print_section('Untouched input data has been loaded. Training: %d lines Challenge: %d lines' % (len(train_dataframe),len(challenge_dataframe)))


In [2]:
from sklearn.datasets import fetch_20newsgroups

# Both splits are fetched with exactly the same options (shuffling with a
# fixed seed, and the header/footer/quote parts listed in `remove`), so the
# options are written once and shared.
NEWSGROUP_FETCH_OPTIONS = dict(
    shuffle=True,
    remove=('headers','footers','quotes'),
    random_state=42,
)

twenty_train = fetch_20newsgroups(subset='train', **NEWSGROUP_FETCH_OPTIONS)
twenty_test = fetch_20newsgroups(subset='test', **NEWSGROUP_FETCH_OPTIONS)

Here are the labels

In [3]:
# Display the names of the 20 original newsgroups.
# NOTE(review): the integer targets used later appear to be indexes into this list -- confirm.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

A bit of cleaning

In [4]:
import re

# One or more consecutive line breaks; each run is collapsed to a single space.
_LINE_BREAKS = re.compile('[\n]+')


def newsgroup_frame(bunch):
    """Turn one fetched 20newsgroups split into a flat DataFrame.

    The same preparation was previously copy-pasted for the train and the
    test split; it now lives in one place so both splits cannot drift apart.

    Parameters
    ----------
    bunch : object exposing ``data`` (the post texts) and ``target``
        (one label per post), such as ``twenty_train`` / ``twenty_test``.

    Returns
    -------
    pandas.DataFrame
        Column 'text' holds each post with runs of line breaks replaced by
        one space; column 'target' holds ``bunch.target`` unchanged.
    """
    frame = pandas.DataFrame(bunch.data, columns=['text'])
    frame['text'] = frame['text'].apply(lambda post: _LINE_BREAKS.sub(' ', post))
    frame['target'] = bunch.target
    return frame


train_news = newsgroup_frame(twenty_train)
test_news = newsgroup_frame(twenty_test)


We merge some newsgroups to make them a little bit more generic

In [5]:
# Merged, more generic topics. A topic's position in this list is the class
# id used for it after merging.
NEW_LABELS = [
    'religion',   # 0
    'computers',  # 1
    'forsale',    # 2
    'vehicles',   # 3
    'sport',      # 4
    'science',    # 5
    'politics',   # 6
]

# For each merged topic, the original 20newsgroups target ids folded into it.
_ORIGINAL_IDS_BY_LABEL = {
    # alt.atheism, soc.religion.christian, talk.religion.misc
    'religion': (0, 15, 19),
    # comp.graphics, comp.os.ms-windows.misc, comp.sys.ibm.pc.hardware,
    # comp.sys.mac.hardware, comp.windows.x
    'computers': (1, 2, 3, 4, 5),
    # misc.forsale
    'forsale': (6,),
    # rec.autos, rec.motorcycles
    'vehicles': (7, 8),
    # rec.sport.baseball, rec.sport.hockey
    'sport': (9, 10),
    # sci.crypt, sci.electronics, sci.med, sci.space
    'science': (11, 12, 13, 14),
    # talk.politics.guns, talk.politics.mideast, talk.politics.misc
    'politics': (16, 17, 18),
}

# original target id -> merged class id, keyed in ascending original-id order.
MAPPING = dict(sorted(
    (original_id, NEW_LABELS.index(label))
    for label, original_ids in _ORIGINAL_IDS_BY_LABEL.items()
    for original_id in original_ids
))


In [6]:
# Add the merged topic id next to the original 20-way target on both splits.
# An original id that is absent from MAPPING raises KeyError.
for news_split in (train_news, test_news):
    news_split['new_target'] = news_split['target'].apply(
        lambda original_id: MAPPING[original_id]
    )


Define a simple pipeline:
* Count all words and adjacent word pairs (unigrams and bigrams)
* Generate TF-IDF weights
* Build a Multinomial Naive Bayes model

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# MultinomialNB is used below but was not imported anywhere in the visible
# notebook (it could only have been leaked by a star-import); import it
# explicitly so this cell does not depend on hidden state.
from sklearn.naive_bayes import MultinomialNB

# Text classification pipeline: token counts -> TF-IDF weights -> naive Bayes.
text_clf = Pipeline([
    # ngram_range=(1,2): count single words and adjacent word pairs.
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB(alpha=0.01)),
])

# Train on the cleaned posts against the merged topic ids.
text_clf = text_clf.fit(train_news['text'], train_news['new_target'])

In [8]:
import numpy as np

# Accuracy on the test split: the fraction of posts whose predicted merged
# topic id equals the expected one.
predicted = text_clf.predict(test_news['text'])
is_correct = predicted == test_news['new_target']
np.mean(is_correct)

0.8028412108337759

Now, apply this model to our datasets

In [17]:
# One block of topic-probability columns per question, glued side by side.
# Columns are named proba_<label>_<question>, with labels taken in NEW_LABELS
# order. NOTE(review): this assumes NEW_LABELS is in the same order as the
# probability columns returned by the classifier -- confirm.
train_newsgroup_proba = pandas.concat(
    [
        pandas.DataFrame(
            data=text_clf.predict_proba(train_dataframe[question]),
            columns=['proba_' + label + '_' + question for label in NEW_LABELS],
        )
        for question in ('question1', 'question2')
    ],
    axis=1,
)
# Save it in the global repo.
save_global_dataframe(train_newsgroup_proba, 'train_newsgroup_proba')


In [18]:
# One block of topic-probability columns per question, glued side by side.
# Columns are named proba_<label>_<question>, with labels taken in NEW_LABELS
# order. NOTE(review): this assumes NEW_LABELS is in the same order as the
# probability columns returned by the classifier -- confirm.
challenge_newsgroup_proba = pandas.concat(
    [
        pandas.DataFrame(
            data=text_clf.predict_proba(challenge_dataframe[question]),
            columns=['proba_' + label + '_' + question for label in NEW_LABELS],
        )
        for question in ('question1', 'question2')
    ],
    axis=1,
)
# Save it in the global repo.
save_global_dataframe(challenge_newsgroup_proba, 'challenge_newsgroup_proba')


In [None]:
# Count the pairs whose two questions carry the same newsgroup tag.
# NOTE(review): the 'newsgroup1' / 'newsgroup2' columns are not created
# anywhere in the visible notebook; unless the loaded frames already contain
# them, this cell raises KeyError -- confirm where they come from.
train_same_topic = train_dataframe['newsgroup1'] == train_dataframe['newsgroup2']
challenge_same_topic = challenge_dataframe['newsgroup1'] == challenge_dataframe['newsgroup2']

# Only the last expression of a cell is displayed, so the training count
# (previously a bare expression in the middle of the cell) is printed explicitly.
# .loc with a (mask, column) pair replaces the chained train_dataframe[col][mask].
print(train_dataframe.loc[train_same_topic, 'is_duplicate'].count())
challenge_dataframe[challenge_same_topic].count()


In [None]:
# Scratch check ("essai" is French for "trial"): the class probabilities
# predicted for question1, as a frame with default column labels.
essai = pandas.DataFrame(
    data=text_clf.predict_proba(train_dataframe['question1']),
)


In [None]:
# Preview the first rows only instead of rendering the whole frame.
essai.head()