# Articles Recommendation Categorization

Categorize web articles so that they can be recommended to learners in different study programs.

### 1) Import libraries


In [47]:
# Standard library
import pickle
import re
import string
import time
from collections import Counter
from collections import defaultdict

# Numerics and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

# Feature extraction, models and evaluation
import xgboost
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

### 2) Data Loading

In [48]:
# Load the cleaned articles from the JSON file into a DataFrame,
# then preview the first five rows.
df = pd.read_json(path_or_buf=r'Data/cleaned_articles.json')
df.head(n=5)

Unnamed: 0,body,title,category
0,protecting netflix viewing privacy scale open ...,Protecting Netflix Viewing Privacy at Scale,Engineering
1,introducing winston event driven diagnostic re...,Introducing Winston - Event driven Diagnostic ...,Engineering
2,performance usage instagram instagram treat pe...,Performance & Usage at Instagram,Engineering
3,simple example calculating formatting bill vid...,Refactoring a javascript video store,Engineering
4,billing applications transactions need acid co...,Netflix Billing Migration to AWS - Part III,Engineering


### 3) Feature Extraction

In [49]:
# Determine data and target.
# Both columns are selected by name rather than by position, so the target
# stays correct even if the column order of the input file changes.
X = df['body']                # article text, used as model input
y = df['category'].values     # category labels as a NumPy array
y

array(['Engineering', 'Engineering', 'Engineering', ..., 'Engineering',
       'Product & Design', 'Startups & Business'], dtype=object)

In [50]:
# Split the data, then encode the dependent variable.
#
# Two successive 30% hold-outs: 30% of all rows are set aside, and 30% of
# that hold-out becomes the test set (the remainder is the validation set).
# With the 2461 articles in the dataset this yields 1722 / 517 / 222 rows for
# train / validation / test. random_state is fixed so that a fresh
# "Restart & Run All" reproduces the same split.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=1
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.3, random_state=1
)

# Encoding the Dependent Variable.
# The encoder is fitted on the full label array: LabelEncoder.transform raises
# on labels it has not seen, which would happen if a rare category ended up
# only in the validation or test split.
encoder = LabelEncoder()
encoder.fit(y)
Ytr = encoder.transform(y_train)
Yde = encoder.transform(y_valid)
Yte = encoder.transform(y_test)

In [51]:
# Extract text features with TfidfVectorizer.
#
# NOTE(review): use_idf=False switches off the inverse-document-frequency
# weighting, so the resulting features are L2-normalised, sublinearly scaled
# term frequencies rather than full TF-IDF weights - confirm this is intended.
tf_vec = TfidfVectorizer(tokenizer=None, stop_words=None, max_df=0.75, max_features=2000, lowercase=False,
                         ngram_range=(1,2), use_idf=False, sublinear_tf=True, min_df=5, norm='l2',
                         encoding='latin-1')

# Learn the vocabulary on the training split only and vectorize it in the same
# step; the validation and test splits are projected onto that vocabulary.
train_features = tf_vec.fit_transform(X_train)
val_features = tf_vec.transform(X_valid)
test_features = tf_vec.transform(X_test)


print('Shape of X_train:',X_train.shape)
print('Shape of X_test:',X_test.shape)
print('Shape of X_val:',X_valid.shape)
print('Shape of train_vectors:',train_features.shape)
print('Shape of test_vectors:',test_features.shape)
print('Shape of val_vectors:',val_features.shape)

Shape of X_train: (1722,)
Shape of X_test: (222,)
Shape of X_val: (517,)
Shape of train_vectors: (1722, 2000)
Shape of test_vectors: (222, 2000)
Shape of val_vectors: (517, 2000)


In [52]:
## Vectorization of data
## Vectorize the data using Bag of words (BOW)
#
# NOTE: this cell rebinds tf_vec, train_features, val_features and
# test_features, so every later cell that reads those names works with
# bag-of-words counts from this CountVectorizer.

tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")        # split on runs of word characters
stop_words = nltk.corpus.stopwords.words("english")
tf_vec = CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words)

# Fit the vocabulary on the training split only and vectorize it in one step;
# the validation and test splits reuse the fitted vocabulary.
train_features = tf_vec.fit_transform(X_train)
val_features = tf_vec.transform(X_valid)
test_features = tf_vec.transform(X_test)




### 4) Modeling

In [None]:
# Train and evaluate model
def fit_eval_model(model, train_features, y_train, test_features, y_test):

    """
    Function: train a machine learning classifier and evaluate it on held-out data.
    Args:
      model: machine learning classifier exposing fit(X, y) and predict(X)
      train_features: train data extracted features
      y_train: train data labels
      test_features: test data extracted features
      y_test: test data labels
    Return:
      results(dictionary): 'train_time' is the number of seconds spent in
        model.fit, 'classification_report' is the value returned by
        classification_report(y_test, predictions on test_features)
    """
    results = {}

    # Time only the call to fit(). perf_counter() is a monotonic clock, so the
    # measured duration cannot be distorted by system clock adjustments.
    start = time.perf_counter()
    model.fit(train_features, y_train)
    results['train_time'] = time.perf_counter() - start

    # Predict on the held-out features only; predictions on the training data
    # are not part of the returned results, so they are not computed.
    test_predicted = model.predict(test_features)

    # Classification report
    results['classification_report'] = classification_report(y_test, test_predicted)

    return results
    
# Initialize the models.
# Every estimator that accepts a seed gets random_state=1 so that repeated
# runs of the notebook produce the same reports.
sv = svm.SVC()
ab = AdaBoostClassifier(random_state = 1)
gb = GradientBoostingClassifier(random_state = 1)
# NOTE(review): recent xgboost releases appear to require integer-encoded
# class labels - confirm that fitting on the raw y_train labels works with
# the installed version.
xgb = xgboost.XGBClassifier(random_state = 1)
tree = DecisionTreeClassifier(random_state = 1)
nb = MultinomialNB()


# Fit and evaluate models; results maps each classifier's class name to the
# dictionary returned by fit_eval_model.
results = {}
for clf in [sv, ab, gb, xgb, tree, nb]:
    results[clf.__class__.__name__] = fit_eval_model(clf, train_features, y_train, test_features, y_test)

# Print classifiers results
for model_name, model_results in results.items():
    print(model_name)
    print()
    for metric, value in model_results.items():
        print(metric, ':')
        print(value)
        print()
    print('-----')
    print()

### 5) Use the Model

In [None]:
# Now, I will use the trained MultinomialNB model to classify new articles. The article files are inside the
# Articles folder; their sources are listed below.
#
# art1: https://edition.cnn.com/2019/09/30/sport/irish-national-stud-winning-post-spt-intl/index.html
# art2: https://edition.cnn.com/2020/04/15/tech/amazon-france-suspension/index.html
# art3: https://edition.cnn.com/2020/04/15/politics/barack-obama-2020-test/index.html
# art4: https://edition.cnn.com/2020/04/15/entertainment/disney-the-mandalorian-documentary/index.html
        

In [None]:
# Classify an article
def classify_article(path):

    """
    Function: classify an article.
    Args:
      path: the path of the article text file
    Return:
      category: the first element of the prediction made by the module-level
        MultinomialNB model (nb) for the article
    """
    # Read file; the context manager closes it even if a later step raises
    with open(path, 'r') as file:
        artcl = file.read()

    # Text preprocessing: the result of preprocess() is joined with spaces, so
    # it is expected to be an iterable of string tokens.
    # NOTE(review): preprocess() is not defined in the cells shown here - make
    # sure it is defined or imported before this cell runs.
    artcl = ' '.join(preprocess(artcl))

    # Vectorize with the fitted module-level vectorizer (tf_vec); it must be
    # the same vectorizer whose features the model was trained on.
    test = tf_vec.transform([artcl])

    # Use MultinomialNB model to classify the article
    predict = nb.predict(test)
    return predict[0]

In [None]:
# Classify the first sample article and print the predicted category.
predicted_category = classify_article('Articles/art1.txt')
print(predicted_category)
