In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the AG News train and test CSVs (relative paths, so they are resolved
# against the current working directory).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('ag_news_train.csv', 'ag_news_test.csv')
)

In [3]:
# Join headline and body, separated by a single space, into one 'text' column
# and preview the first rows of the frame.
df_train['text'] = df_train['Title'].str.cat(df_train['Description'], sep=' ')
df_train.head()

Unnamed: 0,Class Index,Title,Description,text
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli...",Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco...","Oil prices soar to all-time record, posing new..."


## 1. Text Feature Representation


In [None]:
# Bag-of-words vectorizer with default settings; it is fitted on a toy corpus below.
vectorizer = CountVectorizer()

In [None]:
# Four-sentence toy corpus used to illustrate the bag-of-words representation.
texts = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Learn the vocabulary first, then encode the same sentences as a sparse
# document-term count matrix.
vectorizer.fit(texts)
X = vectorizer.transform(texts)
X

In [None]:
# build_analyzer() returns the callable the vectorizer applies to each raw string
# (preprocessing + tokenization); calling it directly shows the tokens produced.
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.")

In [None]:
# Vocabulary terms, in the same order as the columns of X.
print(vectorizer.get_feature_names_out())

# Dense view of the sparse matrix; acceptable here because the corpus has only four rows.
X.toarray()

In [None]:
# Column index assigned to the term 'first'; .get() yields None when the term is not in the vocabulary.
vectorizer.vocabulary_.get('first')

In [None]:
# Encode a new sentence with the already-fitted vocabulary; terms that were not
# seen during fitting are ignored by transform().
vectorizer.transform(['Something completely new.']).toarray()

## Tf-idf   
- In a large text corpus, some words are very frequent (e.g. “the”, “a”, “is” in English) and therefore carry very little meaningful information about the actual contents of a document.
- In order to re-weight the count features into floating point values suitable for usage by a classifier it is very common to use the tf–idf transform
- Tf means term-frequency, while tf–idf means term-frequency times inverse document-frequency:<br>
    $\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$

In [None]:
# Re-weight the raw term counts with tf-idf.
# The counts are recomputed from `texts` instead of being read back from `X`, so
# re-running this cell never applies tf-idf on top of an already weighted matrix.
normalizer = TfidfTransformer()
X = normalizer.fit_transform(vectorizer.transform(texts))

In [None]:
# Dense view of X as left by the TfidfTransformer cell.
X.toarray()

In [None]:
# TfidfVectorizer works directly on the raw strings (counting and tf-idf weighting in one estimator).
# Note that `normalizer` and `X` are rebound here and replace the objects from the previous cells.
normalizer = TfidfVectorizer()
X = normalizer.fit_transform(texts)

In [None]:
# Dense view of the tf-idf matrix produced by TfidfVectorizer.
X.toarray()

In [None]:
# Terms corresponding to the columns of the tf-idf matrix.
normalizer.get_feature_names_out()

In [None]:
# Hold out 70,000 rows for evaluation; the remaining rows are used for training.
# random_state pins the shuffle so the split, and every score computed from it,
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['text'], df_train['Class Index'], test_size=70000, random_state=42
)

In [None]:
# Fit tf-idf on the training texts only, then apply the fitted vectorizer to the test texts.
# NOTE(review): X_train / X_test are overwritten with the vectorized matrices, so this
# cell cannot be re-run (and no later cell can vectorize again) without first
# re-creating the raw-text split.
normalizer = TfidfVectorizer()
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)

## 2. Model Training
### 2.1 Logistic Regression

In [None]:
# Baseline: logistic regression with default hyper-parameters on the tf-idf features.
# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression().fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted classifier on the held-out split.
clf.score(X_test, y_test)

### 2.2 KNN

In [None]:
# Candidate neighbourhood sizes for KNN: 1, 5, 9, 13, 17, 21 (odd values, step 4).
parameters = {'n_neighbors': list(range(1, 22, 4))}

In [None]:
# Cross-validated search over the KNN grid; verbose=5 prints per-fit progress.
clf = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=parameters,
    verbose=5,
)

In [None]:
# Run the grid search on the training split (kept in its own cell so the search
# definition can be inspected before starting the fit).
clf = clf.fit(X_train, y_train)

In [None]:
# Score of the fitted grid search on the held-out split.
clf.score(X_test, y_test)

### 2.3 Decision Tree

In [None]:
# Decision-tree search space: minimum samples needed to split a node, and the
# number of features considered at each split.
parameters = {
    'min_samples_split': [2, 10, 20, 50],
    'max_features': [100, 1000, 10000, 50000],
}

# Build the cross-validated search and fit it in one chained expression
# (fit() returns the search object itself).
clf = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=parameters,
    verbose=5,
).fit(X_train, y_train)

In [None]:
# Report the winning estimator, its parameters and its best cross-validation score,
# one per line.
print(clf.best_estimator_, clf.best_params_, clf.best_score_, sep='\n')

In [None]:
# Reference point: a decision tree with default hyper-parameters, trained on the same split.
clf = DecisionTreeClassifier().fit(X_train, y_train)

In [None]:
# Training accuracy first, then held-out accuracy, one per line.
print(
    clf.score(X_train, y_train),
    clf.score(X_test, y_test),
    sep='\n',
)

### 2.4 SVM

In [None]:
# Rebuild the split from the raw text before vectorizing again: by this point
# X_train / X_test already hold tf-idf matrices from the earlier vectorization cell,
# and TfidfVectorizer can only be fitted on raw strings (fitting it on a sparse
# matrix raises an error).
X_train, X_test, y_train, y_test = train_test_split(
    df_train['text'], df_train['Class Index'], test_size=70000, random_state=42
)

# Restrict the vocabulary to the 2000 most frequent terms (presumably to keep the
# SVM grid search affordable — confirm).
normalizer = TfidfVectorizer(max_features=2000)
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)

In [None]:
# SVC search space: regularisation strength C and kernel coefficient gamma.
parameters = {
    'C': [10, 100, 500, 1000],
    'gamma': [0.001, 0.01, 0.1, 1, 10],
}

# Cross-validated search over every (C, gamma) pair; verbose=5 prints per-fit progress.
clf = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    verbose=5,
).fit(X_train, y_train)

### 2.5 Random Forest

In [None]:
# Fresh split from the raw text with 30% held out for evaluation.
# random_state pins the shuffle so the results are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['text'], df_train['Class Index'], test_size=.3, random_state=42
)

# Fit tf-idf (vocabulary capped at 20,000 terms) on the training texts only,
# then apply the fitted vectorizer to the test texts.
normalizer = TfidfVectorizer(max_features=20000)
X_train = normalizer.fit_transform(X_train)
X_test = normalizer.transform(X_test)

In [None]:
# Random-forest search space: number of trees, minimum samples needed to split a
# node, and number of features considered at each split.
parameters = {
    'n_estimators': [10, 50, 100, 200],
    'min_samples_split': [2, 10, 50],
    'max_features': ["sqrt", 10, 100, 200],
}

# Sequential cross-validated search; verbose=5 prints per-fit progress.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parameters,
    verbose=5,
).fit(X_train, y_train)

### 2.6 Parallel Processing

In [None]:
# Same random-forest search space as in the previous section.
parameters = {
    'n_estimators': [10, 50, 100, 200],
    'min_samples_split': [2, 10, 50],
    'max_features': ["sqrt", 10, 100, 200],
}

# n_jobs=-1 asks GridSearchCV to run the candidate fits in parallel on all
# available processors.
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parameters,
    verbose=5,
    n_jobs=-1,
).fit(X_train, y_train)

## 3. Using Pipeline  
- Pipeline allows you to sequentially apply a list of transformers to preprocess the data and, if desired, conclude the sequence with a final predictor for predictive modeling
- Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods
- The final estimator only needs to implement fit
- The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Split the raw text; test_size=.80 leaves only 20% of the rows for training.
# random_state pins the shuffle so the reported scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['text'], df_train['Class Index'], test_size=.80, random_state=42
)

# Vectorizer and classifier are bundled into one estimator, so raw text can be
# passed straight to fit / score / predict.
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=1000)),
    ('classifier', RandomForestClassifier()),
])

In [None]:
# Fit the whole pipeline on the raw training texts: the vectorizer step is fitted
# first and its output is used to train the classifier step.
clf = clf.fit(X_train, y_train)

In [None]:
# Training accuracy first, then held-out accuracy, one per line.
print(
    clf.score(X_train, y_train),
    clf.score(X_test, y_test),
    sep='\n',
)

### 3.1 Grid Search on pipeline

In [None]:
# Split the raw text; test_size=.80 leaves only 20% of the rows for training.
# random_state pins the shuffle so the grid-search results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['text'], df_train['Class Index'], test_size=.80, random_state=42
)
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=1000)),
    ('classifier', RandomForestClassifier()),
])

# Pipeline parameters are addressed as '<step name>__<parameter name>'.
# Settings shared by both sub-grids are written once to avoid copy-paste drift.
shared_grid = {
    'vectorizer__max_features': [500, 1000],
    'classifier__n_estimators': [20, 50, 100],
    'classifier__min_samples_split': [2, 10, 20],
}

# First sub-grid: vectorizer with its default n-gram range.
# Second sub-grid: the same settings with unigrams + bigrams.
param_grid = [
    dict(shared_grid),
    {**shared_grid, 'vectorizer__ngram_range': [(1, 2)]},
]

In [None]:
# Cross-validate every pipeline configuration in param_grid on the training texts;
# vectorizer and classifier are re-fitted together for each candidate and fold.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    verbose=5,
).fit(X_train, y_train)

## 4. Using model in production

In [4]:
# Train the pipeline that will be persisted below.
# test_size=.80 leaves only 20% of the rows for training; random_state pins the
# shuffle so the same training rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['text'], df_train['Class Index'], test_size=.80, random_state=42
)

# Bundling the vectorizer with the classifier means the saved artifact accepts raw text.
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=1000)),
    ('classifier', RandomForestClassifier()),
])
clf = clf.fit(X_train, y_train)

In [5]:
import joblib

In [6]:
# Persist the fitted `clf` to the file 'rf_model' in the working directory;
# joblib.dump returns the list of file names it wrote.
joblib.dump(clf, 'rf_model')

['rf_model']

In [7]:
# Reload the persisted model from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
model = joblib.load('rf_model')
# Build the 'text' column for the test frame the same way as for the training frame
# (title, a single space, description).
df_test['text'] = df_test['Title'] + ' ' + df_test['Description']
# Predict a class index for every test row directly from the raw text.
model.predict(df_test['text'])

array([1, 4, 4, ..., 2, 4, 4], dtype=int64)