In [76]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

In [30]:
entrenamiento = fetch_20newsgroups(subset='train')


In [32]:
categorias_entrenamiento = entrenamiento.target_names


In [33]:
entrenamiento.data[0:5]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [34]:
entrenamiento.target[0:5]

array([ 7,  4,  4,  1, 14])

In [35]:
categorias_entrenamiento[entrenamiento.target[4]]

'sci.space'

In [36]:
len(entrenamiento.data)

11314

In [118]:
STOP_WORDS = text.ENGLISH_STOP_WORDS.union({'00', '000', '10', '100', '11', '12',
                                            '13', '14', '15', '16', '17', '18', '19', 
                                            '1993', '1993apr15', '20', '21', '22', '23', 
                                            '24', '25', '26', '27', '28', '29', '30', '31',
                                            '32', '33', '34', '35', '40', '41', '50', '80', '93',
                                            'ac','01', '02', '03', '04', '05', '07', '08', 
                                            '1000', '150', '1988', '1990', '1991', '1992', 
                                            '1993apr14', '1993apr16', '1993apr19', '1993apr20', 
                                            '1993apr5', '1993apr6', '1st', '200', '2000', '241', 
                                            '241 9760', '250', '256', '2nd', '300', '36', '37', 
                                            '38', '386', '39', '3d', '3rd', '400', '408', '408 241',
                                            '42', '43', '44', '45', '46', '47', '48', '486', '49', '500',
                                            '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', 
                                            '600', '61', '64', '65', '66', '70', '71', '72', '75', '800',
                                            '81', '85', '86', '88', '89', '90', '91', '92', '95', '9760', '99', '__', '___'})

In [122]:
vectorizer = TfidfVectorizer(stop_words=STOP_WORDS, ngram_range=(1,2), min_df=0.02,max_df = 0.65)
#max_df ignore los terminos que aparecen en más del 65% de los documentos
#min_df  ignore los terminos que aparecen en menos del 1% de los documentos
vectors = vectorizer.fit_transform(entrenamiento.data)

In [123]:
vectors.shape

(11314, 863)

In [124]:
desc_feature_names = vectorizer.get_feature_names()
print(desc_feature_names)

['able', 'accept', 'access', 'according', 'acs', 'act', 'action', 'actually', 'add', 'address', 'advance', 'advice', 'ago', 'agree', 'air', 'al', 'allow', 'allowed', 'america', 'american', 'andrew', 'answer', 'anti', 'anybody', 'apparently', 'appears', 'apple', 'application', 'apply', 'appreciate', 'appreciated', 'apr', 'apr gmt', 'april', 'area', 'aren', 'argument', 'article', 'ask', 'asked', 'asking', 'assume', 'att', 'att com', 'au', 'available', 'average', 'avoid', 'away', 'bad', 'base', 'baseball', 'based', 'basic', 'basically', 'basis', 'bbs', 'believe', 'best', 'better', 'bible', 'big', 'bike', 'bit', 'bitnet', 'black', 'blue', 'board', 'bob', 'body', 'book', 'books', 'bought', 'box', 'break', 'brian', 'bring', 'brought', 'btw', 'build', 'building', 'built', 'bus', 'business', 'buy', 'ca', 'ca lines', 'california', 'called', 'came', 'canada', 'car', 'card', 'cards', 'care', 'carry', 'cars', 'case', 'cases', 'cause', 'cc', 'center', 'certain', 'certainly', 'chance', 'change', 'ch

In [125]:
X_train, X_test, y_train, y_test = train_test_split(vectors,entrenamiento.target,test_size=0.4)

In [126]:
X_train.shape

(6788, 863)

In [127]:
clf = MLPClassifier(hidden_layer_sizes=(863,400,200))

In [128]:
clf.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(863, 400, 200), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [129]:
clf.score(X_test,y_test)

0.7105612019443217

In [130]:
y_pred = clf.predict(X_test)

In [131]:
print(classification_report(y_test,y_pred,target_names=categorias_entrenamiento))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.71      0.71       205
           comp.graphics       0.69      0.50      0.58       237
 comp.os.ms-windows.misc       0.64      0.79      0.71       239
comp.sys.ibm.pc.hardware       0.59      0.49      0.53       236
   comp.sys.mac.hardware       0.64      0.61      0.63       224
          comp.windows.x       0.70      0.71      0.71       238
            misc.forsale       0.80      0.69      0.74       229
               rec.autos       0.65      0.70      0.67       233
         rec.motorcycles       0.79      0.76      0.78       235
      rec.sport.baseball       0.74      0.79      0.77       199
        rec.sport.hockey       0.85      0.80      0.82       270
               sci.crypt       0.88      0.83      0.85       235
         sci.electronics       0.48      0.65      0.55       245
                 sci.med       0.62      0.69      0.65       222
         