# Articles Recommendation Categorization

Recommending web articles to learners in different study programs by classifying each article into a category.

### 1) Import libraries


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re 
import nltk
import string
import pickle
from collections import defaultdict
from nltk.tokenize import RegexpTokenizer
from collections import Counter 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

### 2) Data Loading

In [4]:
# Load the articles dataset from its JSON file and preview the first rows
articles_path = r'../Data/cleaned_articles.json'
df = pd.read_json(articles_path)
df.head()

Unnamed: 0,body,title,category
0,protecting netflix viewing privacy scale open ...,Protecting Netflix Viewing Privacy at Scale,Engineering
1,introducing winston event driven diagnostic re...,Introducing Winston - Event driven Diagnostic ...,Engineering
2,performance usage instagram instagram treat pe...,Performance & Usage at Instagram,Engineering
3,simple example calculating formatting bill vid...,Refactoring a javascript video store,Engineering
4,billing applications transactions need acid co...,Netflix Billing Migration to AWS - Part III,Engineering


### 3) Feature Extraction

In [5]:
# Determine data and target.
# Both columns are selected by name so the cell does not depend on column order:
# 'body' holds the article text and 'category' holds the class label.
X = df['body']
y = df['category'].to_numpy()
y

array(['Engineering', 'Engineering', 'Engineering', ..., 'Engineering',
       'Product & Design', 'Startups & Business'], dtype=object)

In [8]:
# Splitting the dataset into the Training, Validation and Test sets (80% / 10% / 10%).
# A fixed seed makes the split, and every artifact saved from it, reproducible
# under Restart Kernel -> Run All.
SEED = 42

# First hold out 20% of the data ...
X_train, X_res, y_train, y_res = train_test_split(X, y, test_size = 0.2, random_state = SEED)

# ... then cut the held-out part in half: one half for validation, one half for test.
X_val, X_test, y_val, y_test = train_test_split(X_res, y_res, test_size = 0.5, random_state = SEED)

In [10]:
# Encode the string category labels as integers.
# The encoder learns its classes from the training labels only and is then
# applied to all three splits.
# NOTE(review): LabelEncoder.transform raises ValueError for labels it has not
# seen during fit - confirm every category is present in y_train.
encoder = LabelEncoder().fit(y_train)
Ytr, Yde, Yte = (encoder.transform(labels) for labels in (y_train, y_val, y_test))

In [11]:
# Extract text features with TfidfVectorizer over unigrams and bigrams.
#
# NOTE(review): use_idf=False switches off the inverse-document-frequency
# weighting, so these features are sublinear, L2-normalised term frequencies
# rather than full TF-IDF - confirm this is intended.
# NOTE(review): `encoding` only matters when bytes or files are passed in;
# X_train presumably holds already-decoded strings - confirm.

tf_vec = TfidfVectorizer(tokenizer=None, stop_words=None, max_df=0.75, max_features=2000, lowercase=False,
                         ngram_range=(1,2), use_idf=False, sublinear_tf=True, min_df=5, norm='l2',
                         encoding='latin-1')


# Learn the vocabulary on the training split only and vectorize it in one step;
# the validation and test splits are projected onto that same vocabulary.
train_features = tf_vec.fit_transform(X_train)
val_features = tf_vec.transform(X_val)
test_features = tf_vec.transform(X_test)


print('Shape of X_train:',X_train.shape)
print('Shape of X_test:',X_test.shape)
print('Shape of X_val:',X_val.shape)
print('Shape of train_vectors:',train_features.shape)
print('Shape of test_vectors:',test_features.shape)
print('Shape of val_vectors:',val_features.shape)

Shape of X_train: (1968,)
Shape of X_test: (247,)
Shape of X_val: (246,)
Shape of train_vectors: (1968, 2000)
Shape of test_vectors: (247, 2000)
Shape of val_vectors: (246, 2000)


In [None]:
## Vectorization of data
## Vectorize the data using Bag of words (BOW)
##
## NOTE(review): this cell rebinds tf_vec, train_features, val_features and
## test_features, so running it replaces the features produced by the
## TfidfVectorizer cell with raw token counts. Run it only when BOW features
## are the ones that should be saved.

tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
stop_words = nltk.corpus.stopwords.words("english")
tf_vec = CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words)

# Fit on the training split only, then reuse the learned vocabulary for the
# validation and test splits (the validation split is named X_val).
train_features = tf_vec.fit_transform(X_train)
val_features = tf_vec.transform(X_val)
test_features = tf_vec.transform(X_test)

In [12]:
# Persist the vectorized train / validation / test feature matrices.
# Only the matrices are written here, not the fitted vectorizer object.
feature_dumps = [
    ('../vectors/train_vector.pkl', train_features),
    ('../vectors/val_vector.pkl', val_features),
    ('../vectors/test_vector.pkl', test_features),
]

for dump_path, matrix in feature_dumps:
    with open(dump_path, 'wb') as handle:
        pickle.dump(matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [13]:
# Persist the integer-encoded target labels for each split.
label_dumps = [
    ('../vectors/train_label.pkl', Ytr),
    ('../vectors/val_label.pkl', Yde),
    ('../vectors/test_label.pkl', Yte),
]

for dump_path, labels in label_dumps:
    with open(dump_path, 'wb') as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)