In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Text Classification

This is the data from News Articales and their topics,
our task is to make a text classifier to detect the topic of an news articale.

In [2]:
df = pd.read_csv('News_Articales.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,Text,Target
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,rec.sport.hockey
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,comp.sys.ibm.pc.hardware
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\r\nSubje...,talk.politics.mideast
3,From: guyd@austin.ibm.com (Guy Dawson)\r\nSubj...,comp.sys.ibm.pc.hardware
4,From: Alexander Samuel McDiarmid <am2o+@andrew...,comp.sys.mac.hardware


In [4]:
print(df.sample()['Text'].iloc[0])

From: bethd@netcom.com (Beth Dixon)
Subject: Re: Ducati 400 opinions wanted
Organization: Netcom Online Communications Services (408-241-9760 login: guest)
Lines: 42

In article <C5I2HM.JwC@srgenprp.sr.hp.com> frankb@sad.hp.com (Frank Ball) writes:
>Godfrey DiGiorgi (ramarren@apple.com) wrote:
>& 
>& The Ducati 400 model is essentially a reduced displacement 750, which
>& means it weighs the same and is the same size as the 750 with far less
>& power. It is produced specifically to meet a vehicle tax restriction
>
>The Ducati 750 model is essentially a reduced displacement 900, which
>means it weighs the same and is the same size as the 900 with far less
>power.  And less brakes.
Bzzzt.  The 750SS is 40 pounds lighter than the 900SS.  I personally,
and recently, witnessed my 750SS do a stoppie with a larger-than-
average rider aboard.  He said it took two fingers on that measely
single front disk to accomplish the task.  How much more brake do
you need?

>As such, 

## TF-IDF

Make a pipeline using `TF-IDF` and `LogisticRegression`.

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [6]:
df['Target'].value_counts()

rec.sport.hockey            999
soc.religion.christian      997
rec.motorcycles             996
rec.sport.baseball          994
sci.crypt                   991
sci.med                     990
rec.autos                   990
comp.windows.x              988
sci.space                   987
comp.os.ms-windows.misc     985
sci.electronics             984
comp.sys.ibm.pc.hardware    982
misc.forsale                975
comp.graphics               973
comp.sys.mac.hardware       963
talk.politics.mideast       940
talk.politics.guns          910
alt.atheism                 799
talk.politics.misc          775
talk.religion.misc          628
Name: Target, dtype: int64

In [7]:
X = df['Text']
y = df['Target'].map({'rec.sport.hockey':0, 
                      'rec.sport.baseball':1,
                     'rec.motorcycles':2,
                     'rec.autos':3,
                     'sci.crypt':4,
                     'sci.med':5,
                     'sci.space':6,
                     'sci.electronics':7,
                     'comp.windows.x':8,
                     'comp.os.ms-windows.misc':9,
                     'comp.sys.ibm.pc.hardware':10,
                     'comp.graphics':11,
                     'comp.sys.mac.hardware':12,
                     'misc.forsale':13,
                     'talk.politics.mideast':14,
                     'talk.politics.guns':15,
                     'talk.politics.misc':16,
                     'talk.religion.misc':17,
                     'soc.religion.christian':18,
                     'alt.atheism':19})

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train.shape

(15076,)

In [11]:
steps = [('tfidf', TfidfVectorizer(lowercase=True)),
        ('clf', LogisticRegression())]

tf_lr = Pipeline(steps)
tf_lr.fit(X_train, y_train)
train_predictions = tf_lr.predict(X_train)
test_predictions = tf_lr.predict(X_test)

print("TRAIN:")
print(classification_report(y_train, train_predictions))

print("TEST:")
print(classification_report(y_test, test_predictions))

TRAIN:
             precision    recall  f1-score   support

          0       0.99      0.99      0.99       801
          1       1.00      0.99      0.99       783
          2       1.00      0.99      0.99       828
          3       0.98      0.97      0.98       794
          4       0.99      0.98      0.98       790
          5       0.99      0.99      0.99       796
          6       0.98      0.98      0.98       798
          7       0.96      0.97      0.96       782
          8       0.97      0.96      0.97       773
          9       0.93      0.95      0.94       790
         10       0.93      0.94      0.94       799
         11       0.93      0.95      0.94       771
         12       0.98      0.97      0.97       758
         13       0.91      0.96      0.93       782
         14       0.98      0.99      0.99       758
         15       0.95      0.98      0.97       722
         16       0.98      0.94      0.96       616
         17       0.98      0.78      

## GridSearch

Now grid search through hyper paramters of these 2 models and save the best estimator.

In [12]:
params = {"tfidf__stop_words":[None, 'english'],
          "tfidf__max_df":[1.0, .8],
          "tfidf__max_features":[None, 100, 500],
          "tfidf__sublinear_tf":[True, False],
          "clf__penalty": ['l2', 'l1'],
          "clf__C": [0.1, 1.0]}

In [14]:
gs_pipe = GridSearchCV(tf_lr, params, scoring='f1_macro',cv=2, verbose=1)

gs_pipe.fit(X_train, y_train)

Fitting 2 folds for each of 96 candidates, totalling 192 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done 192 out of 192 | elapsed: 23.2min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tfidf__stop_words': [None, 'english'], 'tfidf__max_df': [1.0, 0.8], 'tfidf__max_features': [None, 100, 500], 'tfidf__sublinear_tf': [True, False], 'clf__penalty': ['l2', 'l1'], 'clf__C': [0.1, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=1)

In [15]:
train_predictions = gs_pipe.predict(X_train)
test_predictions = gs_pipe.predict(X_test)

print("TRAIN:")
print(classification_report(y_train, train_predictions))

print("TEST:")
print(classification_report(y_test, test_predictions))

TRAIN:
             precision    recall  f1-score   support

          0       0.99      1.00      0.99       801
          1       0.99      0.99      0.99       783
          2       1.00      0.99      0.99       828
          3       0.98      0.98      0.98       794
          4       0.99      0.98      0.99       790
          5       0.99      0.99      0.99       796
          6       0.99      0.99      0.99       798
          7       0.97      0.97      0.97       782
          8       0.99      0.97      0.98       773
          9       0.94      0.96      0.95       790
         10       0.93      0.95      0.94       799
         11       0.95      0.96      0.95       771
         12       0.98      0.98      0.98       758
         13       0.93      0.96      0.95       782
         14       0.99      1.00      0.99       758
         15       0.96      0.99      0.98       722
         16       0.99      0.95      0.97       616
         17       0.99      0.85      

In [16]:
gs_pipe.best_params_

{'clf__C': 1.0,
 'clf__penalty': 'l2',
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__stop_words': 'english',
 'tfidf__sublinear_tf': True}