# Model 1 - Binary Classification

In [None]:
from google.colab import files
# Colab upload call; the return value is kept in `uploaded`.
# NOTE(review): presumably this is how Model1.xlsx reaches the runtime before
# the read_excel cell below — confirm.
uploaded = files.upload()

In [None]:
# read data
import pandas as pd
import numpy as np
# Load the spreadsheet and shuffle every row with a fixed seed, so the row
# order (and anything derived from it) is the same on each run.
df = pd.read_excel("Model1.xlsx").sample(frac=1, random_state=0)
df

Unnamed: 0,Title,Text,Tag
0,10 High Calcium Foods For Pregnancy,"If there’s one thing we can’t stress enough, i...",1
1,10 Green Smoothies For Pregnancy,Yummmm…who doesn’t love a delicious and refres...,1
2,5 Calcium Benefits For Pregnancy,"The pregnant body is amazing, powerful, and dy...",1
3,DHA and Pregnancy,"If there’s anything we know for sure, it’s tha...",1
4,Is A Cup Of Coffee Safe During Pregnancy?,"Understandably, giving up coffee when pregnant...",1
...,...,...,...
818,7 Foods You Should Eat If You're Over 50,"""In youth, it's all about growth and maintaini...",4
819,The 5 Best Foods for Women as They Age,Remember the healthy things your mom did for y...,4
820,"6 Fruits All Women In 20's, 30's, 40's And Bey...",Women are blessed with a dynamic personality. ...,4
821,How nutritional needs for women change with age?,"As children, nutritional needs for boys and gi...",4


## Statistics of Sentences and Words

In [None]:
import nltk

In [None]:
# The NLTK tokenizers imported below rely on the Punkt models. Recent NLTK
# releases (3.9+) look them up as 'punkt_tab', older releases as 'punkt';
# fetch both so the notebook runs on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Hand-maintained English stop-word list: one whitespace-separated string that
# is split into tokens and stored as a set (so the repeated "very" collapses
# to a single entry).
# NOTE(review): a later cell rebinds `stop_words` to NLTK's English list, so
# this set only feeds the count printed below.
stop_words = set(
    """
a about above across after afterwards again against all almost alone along
already also although always am among amongst amount an and another any anyhow
anyone anything anyway anywhere are around as at
back be became because become becomes becoming been before beforehand behind
being below beside besides between beyond both bottom but by
call can cannot ca could
did do does doing done down due during
each eight either eleven else elsewhere empty enough even ever every
everyone everything everywhere except
few fifteen fifty first five for former formerly forty four from front full
further
get give go
had has have he hence her here hereafter hereby herein hereupon hers herself
him himself his how however hundred
i if in indeed into is it its itself
keep
last latter latterly least less
just
made make many may me meanwhile might mine more moreover most mostly move much
must my myself
name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere
of off often on once one only onto or other others otherwise our ours ourselves
out over own
part per perhaps please put
quite
rather re really regarding
same say see seem seemed seeming seems serious several she should show side
since six sixty so some somehow someone something sometime sometimes somewhere
still such
take ten than that the their them themselves then thence there thereafter
thereby therefore therein thereupon these they third this those though three
through throughout thru thus to together too top toward towards twelve twenty
two
under until up unless upon us used using
various very very via was we well were what whatever when whence whenever where
whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves
""".split()
)

# Number of distinct entries in the set.
print(len(stop_words))

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch NLTK's stop-word corpus and rebind `stop_words` to its English list;
# this replaces the hand-written set assigned in the earlier cell.
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) 

# Callable that can be passed to the Vectorizer
class LemmaTokenizer:
    """Split a document with ``word_tokenize`` and lemmatize the kept tokens.

    Instances are callable: ``tokenizer(doc)`` returns a list of lemmas, with
    any token listed in ``ignore_tokens`` left out.
    """

    # Punctuation-style tokens that are dropped before lemmatization.
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`','--']

    def __init__(self):
        """Create the WordNet lemmatizer used by every call."""
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` in their original order."""
        lemmas = []
        for token in word_tokenize(doc):
            if token in self.ignore_tokens:
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

In [None]:
# Fetch the WordNet resources first, then build the tokenizer.
nltk.download('wordnet')
nltk.download('omw-1.4')
tokenizer = LemmaTokenizer()

# Run the stop words through the same tokenizer/lemmatizer as the documents and
# keep the result as a list, so it can be handed to the Vectorizer.
token_stop = tokenizer(' '.join(stop_words))

In [None]:
# 5-fold assignment: number the rows 1..5 round-robin in their current
# (already shuffled) order.
# The list is sized from the frame itself; the previous literal
# ([1,2,3,4,5] * 164 + [1,2,3]) only fit a frame of exactly 823 rows and made
# the column assignment fail for any other row count.
N_FOLDS = 5
fold_values = [(i % N_FOLDS) + 1 for i in range(len(df))]
df["fold"] = fold_values
df.head()

In [None]:
# Count of rows per Tag (rows) and fold (columns), with an "All" margin
# added for the totals.
fold_crosstab = pd.crosstab(df["Tag"], df["fold"], margins=True)
fold_crosstab

In [None]:
# The model input is the `title_and_text` column. No visible cell creates it
# and the loaded frame only shows Title / Text / Tag, so build it here when it
# is missing; the guard keeps the cell safe to re-run.
# NOTE(review): joining title and body with a single space is an assumption —
# confirm against how the column was originally produced.
if "title_and_text" not in df.columns:
    df["title_and_text"] = (
        df["Title"].fillna("").astype(str) + " " + df["Text"].fillna("").astype(str)
    )

# Fold 1 is held out for testing; every other fold is used for training.
is_test = df["fold"] == 1
x_train = df.loc[~is_test, "title_and_text"].tolist()
x_test = df.loc[is_test, "title_and_text"].tolist()
y_train = df.loc[~is_test, "Tag"].tolist()
y_test = df.loc[is_test, "Tag"].tolist()

# Sanity check: features and labels must have matching lengths per split.
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))

## TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer settings (tune as needed after receiving feedback):
#   - min_df = 1 keeps every term that appears in at least one training
#     document, i.e. no low-frequency filtering is applied right now
#   - unigrams and bigrams are computed
#   - dimensionality is capped at 1500 features
#   - token_pattern = None because the custom tokenizer replaces the regex
#     tokenization; leaving the default makes scikit-learn warn that the
#     pattern is unused

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words=token_stop, 
                                   tokenizer = tokenizer, 
                                   token_pattern = None,
                                   ngram_range = (1,2), 
                                   min_df = 1,
                                   max_features = 1500)

# Learn the vocabulary / IDF weights on the training texts only and vectorize them.
X = tfidf_vectorizer.fit_transform(x_train)
print('Training documents have', len(tfidf_vectorizer.vocabulary_), 'words')
print('X:', X.shape)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Random forest with 200 gini-criterion trees, a fixed seed, and n_jobs=-1
# for parallel training; fitted on the TF-IDF training matrix.
RFC = RandomForestClassifier(
    n_estimators=200,
    criterion="gini",
    random_state=0,
    n_jobs=-1,
)
RFC.fit(X, y_train)

In [None]:
# Vectorize the held-out texts with the already-fitted vectorizer.
# The result is stored back under the same name, so without the guard a second
# run of this cell would pass the already-vectorized matrix to transform()
# and fail. Only the raw list of strings is transformed.
if isinstance(x_test, list):
    x_test = tfidf_vectorizer.transform(x_test)

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

# Predict the held-out fold and report the scores.
y_pred = RFC.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
# No `average` is passed, so precision / recall / F1 come back as one value per class.
pre, recall, f1, support = precision_recall_fscore_support(y_test, y_pred)
# Accuracy was computed but never shown before; report it with the other metrics.
print('Accuracy:  ' , accuracy)
print('Precision: ' , pre)
print('Recall:    ' , recall)
print('F1:        ' , f1)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix restricted to tags 1 and 4, in that order.
cf_matrix = confusion_matrix(y_test, y_pred, labels=[1, 4])

# Shown as a DataFrame for readability:
#   rows    -> ground-truth labels
#   columns -> predicted labels
class_names = ["Related", "Unrelated"]
pd.DataFrame(cf_matrix, index=class_names, columns=class_names)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 table for tags 1 and 4, printed to 4 decimals.
report = classification_report(
    y_test,
    y_pred,
    labels=[1, 4],
    target_names=['related', 'unrelated'],
    digits=4,
)
print(report)