# Importing Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Importing The Dataset

In [None]:
# Read the page-content table and the labeled table, in that order
dataset, labeled_datadset = (
    pd.read_csv(csv_path)
    for csv_path in ('Page_Content.csv', 'Page Labeld Data.csv')
)

# Data Cleaning

In [None]:
# Build the labeled training set: a right join keeps every row of the
# labeled table and attaches the matching columns from `dataset`.
df = dataset.merge(labeled_datadset, how="right", on="Page URL")
df.shape

In [None]:
# Keep only the rows whose 'Page Content' cell is populated
has_content = df["Page Content"].notna()
df = df[has_content]

In [None]:
# row/column count of df at this point, for comparison with the shape shown after the merge
df.shape

In [None]:
# Distribution of labels represented in the labeled training set.
# The column is passed as `x=` because recent seaborn releases accept only
# `data` positionally; a bare positional Series is no longer treated as the
# variable to count.
sns.countplot(x=df['Page Purpose'])

# Text Cleaning

In [None]:
# importing text cleaning libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string

In [None]:
# Text cleaning function
def message_cleaning(message):
    """Split raw text into alphabetic tokens with English stopwords removed.

    Steps, in order:
      1. delete every character listed in ``string.punctuation``;
      2. replace each remaining run of non-ASCII-letter characters with a
         single space;
      3. split on whitespace and drop tokens whose lowercase form is an
         NLTK English stopword.

    Tokens keep their original casing; lowercasing is used only for the
    stopword comparison.

    Args:
        message: The text to clean (a ``str``).

    Returns:
        A list of the surviving tokens, in their original order.
    """
    # Build the stopword set once per call: evaluating
    # stopwords.words('english') inside the comprehension re-fetched the whole
    # list for every token and then searched it linearly.
    english_stopwords = set(stopwords.words('english'))

    # Punctuation is deleted (not turned into a space) before the regex pass,
    # so "don't" becomes "dont" rather than "don t".
    without_punctuation = message.translate(str.maketrans('', '', string.punctuation))
    letters_only = re.sub('[^a-zA-Z]+', ' ', without_punctuation)

    return [
        word
        for word in letters_only.split()
        if word.lower() not in english_stopwords
    ]
    

In [None]:
# Vectorizing the text
from sklearn.feature_extraction.text import CountVectorizer

# message_cleaning is supplied as the analyzer, so it alone decides how each
# page is turned into tokens.
vectorizer = CountVectorizer(analyzer = message_cleaning)
df_countvectorizer = vectorizer.fit_transform(df['Page Content'])

# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); use the new method when present and fall back to
# the old one on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    print(vectorizer.get_feature_names_out())
else:
    print(vectorizer.get_feature_names())

In [None]:
# print the vectorizer output as a dense array (toarray() materialises the full matrix)
print(df_countvectorizer.toarray())

In [None]:
# dimensions of the vectorized 'Page Content' matrix
df_countvectorizer.shape

# Splitting Train & Test Sets

In [None]:
# labeling the X and y data
X = df_countvectorizer
# Take the labels straight from df. The name `label` is only assigned in a
# later cell, so referring to it here raises NameError when the notebook is
# run from top to bottom.
y = df["Page Purpose"].values

In [None]:
# Splitting the data: 80% for training, 20% held out for testing
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

In [None]:
# Training the Naive Bayes classifier on the training split
from sklearn.naive_bayes import MultinomialNB

NB_Classifier = MultinomialNB()
NB_Classifier.fit(X=X_train, y=y_train)

# Evaluating the Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# predictions on the same rows the classifier was fitted on
y_predict_train = NB_Classifier.predict(X_train)

In [None]:
# Confusion matrix of the model's predictions on the training split
cm = confusion_matrix(y_train, y_predict_train)
# fmt='d' prints the integer counts as-is; the default annotation format
# ('.2g') shows counts of 100 or more in scientific notation.
sns.heatmap(cm, annot = True, fmt = 'd')

In [None]:
# classification report for the training-split predictions
print(classification_report(y_train, y_predict_train))

In [None]:
# Confusion matrix of the model's predictions on the held-out test split
y_predict_test = NB_Classifier.predict(X_test)
cm2 = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the integer counts as-is; the default annotation format
# ('.2g') shows counts of 100 or more in scientific notation.
sns.heatmap(cm2, annot = True, fmt = 'd')

In [None]:
# classification report for the test-split predictions
print(classification_report(y_test, y_predict_test))

# Training On The Whole Labeled Dataset

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Labels for every row of the labeled set, plus a fresh, unfitted classifier
label = df["Page Purpose"].values
NB_Classifier = MultinomialNB()

In [None]:
# Fit the classifier on the whole labeled set (no train/test split here)
NB_Classifier.fit(X=df_countvectorizer, y=label)

In [None]:
# Sanity check: classify a single hand-written sentence
testing_sample = ["This school is the best! "]

# Reuse the already-fitted vectorizer so the sample is encoded with the same vocabulary
testing_sample_countvectorizer = vectorizer.transform(raw_documents=testing_sample)
NB_Classifier.predict(X=testing_sample_countvectorizer)

In [None]:
# Predict a purpose for every row of `dataset` and store it in 'Page_Purpose'.
#
# The whole column is vectorized and classified in one call, so each cell
# holds the predicted label itself. The earlier row-by-row loop stored
# `[array([label])]` per row, which was written to the CSV as a stringified
# array and made the final count plot group on those strings.
#
# NOTE(review): missing 'Page Content' values are replaced with '' so the
# vectorizer receives strings only; such rows produce an all-zero feature
# vector. Confirm whether those rows should be predicted at all or dropped.
page_text = dataset['Page Content'].fillna('')
Page_Purpose = NB_Classifier.predict(vectorizer.transform(page_text))

dataset['Page_Purpose'] = Page_Purpose

In [None]:
# display the dataset, now including the 'Page_Purpose' column
dataset

In [None]:
# Saving predictions to a csv (index=False leaves the row index out of the file)
dataset.to_csv('Page Purpose Remaining Labaled Dataset.csv', index = False)

In [None]:
# read the saved predictions file back into a new DataFrame
final_df = pd.read_csv('Page Purpose Remaining Labaled Dataset.csv')

In [None]:
# Distribution of the final predicted labels.
# The column is passed as `x=` because recent seaborn releases accept only
# `data` positionally; a bare positional Series is no longer treated as the
# variable to count.
sns.countplot(x=final_df['Page_Purpose'])