# Articles Recommendation Categorization

Recommending web articles to learners in different study programs.

### 1) Import libraries


In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import time
import re 
import nltk
import string
import pickle
from collections import defaultdict
from nltk.tokenize import RegexpTokenizer
from collections import Counter 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# split the data
from sklearn.model_selection import train_test_split

# BOW, TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer

# Encoding the target variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

## Word Embedding

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


### 2) Data Loading

In [3]:
# Load the articles dataset from its JSON file and preview the first rows
articles_path = r'../Data/Articles.json'
df = pd.read_json(articles_path)
df.head()

Unnamed: 0,is_sarcastic,Article
0,0,former versace store clerk sues secret black c...
1,0,roseanne revival catches thorny political mood...
2,1,mom starting fear sons web series closest thin...
3,1,boehner wants wife listen come alternative deb...
4,0,jk rowling wishes snape happy birthday magical...


In [9]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(25966, 2)

### 3) Feature Extraction

In [8]:
# Determine data and target:
#   X - the text held in the 'Article' column
#   y - the values of the first column *by position*; per the preview above this
#       looks like `is_sarcastic` - confirm if the column order ever changes
X = df['Article']
y = df.iloc[:, 0].values
# Display the target vector as a quick sanity check
y

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [11]:
# Split the dataset into training (75%) and test (25%) sets.
# No separate validation set is created here.
#
# The split is seeded: without `random_state` every re-run shuffles the rows
# differently, so feature matrices and label vectors pickled by later cells
# could end up coming from different splits and no longer be row-aligned.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)


# Data summary
print('Length of the Training dataset: ',len(X_train))
print('Length of the Test dataset: ',len(X_test))

Length of the Training dataset:  19474
Length of the Test dataset:  6492


###### Bag of words (Count Vectorizer)


In [21]:
## Vectorization of data
## Vectorize the data using Bag of Words (BOW) term counts

start = time.time()

# Tokenize on runs of word characters and drop NLTK's English stop words
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
stop_words = nltk.corpus.stopwords.words("english")
tf_vec = CountVectorizer(tokenizer=tokenizer.tokenize, stop_words=stop_words)

# Learn the vocabulary from the training split only and encode it in one pass.
# (A separate fit() followed by transform() tokenizes the training text twice,
# and fit() returns the vectorizer itself, not a feature matrix.)
train_features = tf_vec.fit_transform(X_train)
# The test split is encoded with the vocabulary learned from the training split
test_features = tf_vec.transform(X_test)

end = time.time()

print('Shape of X_train:',X_train.shape)
print('Shape of X_test:',X_test.shape)
print('Shape of train_vectors:',train_features.shape)
print('Shape of test_vectors:',test_features.shape)

print('Computational time:',end - start)


Shape of X_train: (19474,)
Shape of X_test: (6492,)
Shape of train_vectors: (19474, 132117)
Shape of test_vectors: (6492, 132117)
Computational time: 12.309767723083496


In [36]:
# Persist the BOW feature matrices to disk.
# NOTE: `train_features` / `test_features` are reassigned by the TF-IDF cell
# as well, so run this cell right after the BOW vectorization cell.
for pkl_path, matrix in (('../vectors/BOW_train.pkl', train_features),
                         ('../vectors/BOW_test.pkl', test_features)):
    with open(pkl_path, 'wb') as handle:
        pickle.dump(matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

###### TF-IDF

In [27]:
# Extract text features with TF-IDF.
#
# Settings: built-in English stop words, unigrams + bigrams, terms must occur in
# at least 5 documents and in at most 75% of them, vocabulary capped at the
# 2000 most frequent terms, sublinear (1 + log tf) term frequency, L2-normalized
# rows, and no lowercasing.
#
# `use_idf` must be True: with use_idf=False the IDF weighting is switched off
# and the result is only normalized term frequency, not TF-IDF.

start = time.time()

tf_vec = TfidfVectorizer(tokenizer=None, stop_words='english', max_df=0.75, max_features=2000, lowercase=False,
                         ngram_range=(1,2), use_idf=True, sublinear_tf=True, min_df=5, norm='l2',
                         encoding='latin-1')


# Learn vocabulary and IDF weights from the training split only, encoding it in
# a single pass (fit() returns the vectorizer itself, not a feature matrix).
train_features = tf_vec.fit_transform(X_train)
# The test split reuses the vocabulary and weights learned from the training split
test_features = tf_vec.transform(X_test)

end = time.time()


print('Shape of X_train:',X_train.shape)
print('Shape of X_test:',X_test.shape)
print('Shape of train_vectors:',train_features.shape)
print('Shape of test_vectors:',test_features.shape)

print('Computational time:',end - start)

Shape of X_train: (19474,)
Shape of X_test: (6492,)
Shape of train_vectors: (19474, 2000)
Shape of test_vectors: (6492, 2000)
Computational time: 33.56597566604614


In [24]:
# Persist the TF-IDF feature matrices (the vectors, not the vectorizer) to disk.
# NOTE: `train_features` / `test_features` are assigned by the BOW cell as well,
# so run this cell right after the TF-IDF vectorization cell.
for pkl_path, matrix in (('../vectors/TFIDF_train.pkl', train_features),
                         ('../vectors/TFIDF_test.pkl', test_features)):
    with open(pkl_path, 'wb') as handle:
        pickle.dump(matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [35]:
# Persist the target (label) vectors of the train and test splits to disk
for pkl_path, labels in (('../vectors/train_label.pkl', y_train),
                         ('../vectors/test_label.pkl', y_test)):
    with open(pkl_path, 'wb') as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)