In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import random
import os
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
# Execute the companion notebook in this kernel before continuing.
# NOTE(review): presumably this is what produces today's file under
# 'Combined CSV/' that is read further down -- confirm.
%run ./Combine_csv.ipynb

In [3]:
import datetime

# Take a single timestamp so the time and date strings below always agree.
now = datetime.datetime.now()

# Wall-clock time as HH:MM and calendar date as YYYY-MM-DD.
Today_time, Today_date = (now.strftime(fmt) for fmt in ("%H:%M", "%Y-%m-%d"))

In [4]:
# Directory prefix and today's file name (Today_date is formatted YYYY-MM-DD).
prefix_dir = 'Combined CSV/'
suffix_dir = 'combined_education-' + Today_date + '.csv'
# Join the parts as separate arguments so the path stays valid even if
# prefix_dir is ever written without its trailing separator.
today_csv = os.path.join(prefix_dir, suffix_dir)

In [5]:
# Load today's combined education CSV into a DataFrame.
education = pd.read_csv(today_csv)

In [6]:
# Preview the first rows to check that the file parsed as expected.
education.head()

Unnamed: 0,Source,Heading,Category,Date,Time,URL
0,Hindiustan Times,ICAR AIEEA 2019 admit card to be released by N...,news,2019-06-17,14:16:00,https://www.hindustantimes.com/education/icar-...
1,Hindiustan Times,UGC move to thwart ‘pay and publish trash’ cul...,news,2019-06-17,13:03:00,https://www.hindustantimes.com/education/ugc-m...
2,Hindiustan Times,NEST Result 2019 to be declared today at neste...,news,2019-06-17,12:42:00,https://www.hindustantimes.com/education/nest-...
3,Hindiustan Times,Government gives 3-month grant to 28 DU colleges,news,2019-06-17,12:08:00,https://www.hindustantimes.com/education/gover...
4,Hindiustan Times,Lack of BSc courses in evening colleges disapp...,news,2019-06-17,12:06:00,https://www.hindustantimes.com/education/lack-...


In [7]:
# (rows, columns) of the loaded frame.
education.shape

(577, 6)

In [8]:
# `df` is a second name for the same DataFrame object as `education`; no copy
# is made, so changes made through `df` are visible through `education` too.
df=education

# Keep a handle on the 'Heading' column as it is at this point.
# NOTE(review): a later cell reassigns df['Heading'] with cleaned text, while
# the tokenizing loop iterates over this earlier Series -- confirm that the
# loop is meant to see the headings captured here.
Heading = df['Heading']

In [9]:
# Number of rows per category label.
df['Category'].value_counts()

news                      104
jobs-and-careers           89
exams result news          43
entrance exams             43
board exams                43
admission news             43
study-abroad               24
grammar-and-vocabulary     24
notification               24
sample-papers              24
tips-and-tricks            24
worldly-science            24
government-jobs            24
colleges                   22
schools                    22
Name: Category, dtype: int64

### Tokenizing and Stemming

In [10]:
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re


In [11]:
# Fetch the NLTK data packages used below: 'stopwords' backs
# stopwords.words() and 'punkt' backs the sentence/word tokenizers.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\drago\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\drago\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# English stop words collected into a set.
# NOTE(review): not referenced by any of the cells visible here -- confirm it
# is still needed.
stopset = set(stopwords.words('english'))

In [13]:
def remove_apostrophe(data):
    """Delete every ASCII apostrophe (') from ``data``.

    ``data`` may be a single string or an array of strings. The result is
    whatever ``np.char.replace`` returns, i.e. a NumPy string array (0-d for
    a single string input).
    """
    apostrophe, nothing = "'", ""
    stripped = np.char.replace(data, apostrophe, nothing)
    return stripped

def remove_punctuation(data):
    """Replace punctuation in ``data`` with spaces and delete commas.

    Every character in the symbol list (ASCII punctuation other than the
    apostrophe and the comma, plus the newline) is replaced by one space.
    After each replacement, pairs of spaces are collapsed to a single space
    once. Commas are removed last, without leaving a space, so "1,000"
    becomes "1000".

    ``data`` may be a single string or an array of strings. The result is
    whatever ``np.char.replace`` returns, i.e. a NumPy string array.
    """
    # The backslash is spelled "\\": a lone "\]" is an invalid escape sequence
    # that newer Python versions warn about. The runtime string is unchanged.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        # Collapse the double space a replacement may have created.
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def convert_numbers(data):
    """Spell out integer tokens in ``data`` as words (best effort).

    ``data`` is converted with ``str()`` and split with ``word_tokenize``.
    Each token that parses as an integer is passed through ``num2words``;
    every other token is kept unchanged. The tokens are re-joined with one
    space in front of each token (so a non-empty result starts with a space)
    and hyphens in the joined text are then replaced by spaces.

    Returns the value produced by ``np.char.replace`` (a NumPy string array).
    """
    words = []
    for token in word_tokenize(str(data)):
        try:
            token = num2words(int(token))
        except Exception:
            # Best effort: a token that is not an integer (ValueError) or that
            # cannot be converted is kept as it is. `Exception` rather than a
            # bare `except` so KeyboardInterrupt/SystemExit still propagate.
            # NOTE(review): `num2words` is not imported in the cells visible
            # here; if it is undefined, the NameError is swallowed as well and
            # numbers stay as digits -- confirm it is imported somewhere.
            pass
        words.append(token)
    # One join instead of repeated string concatenation inside the loop.
    new_text = "".join(" " + word for word in words)
    return np.char.replace(new_text, "-", " ")

In [14]:
# Clean the headlines in three passes: drop apostrophes, turn punctuation into
# spaces, then run the number-conversion step.
df['Heading'] = (
    df['Heading']
    .apply(remove_apostrophe)
    .apply(remove_punctuation)
    .apply(convert_numbers)
)

In [15]:
from nltk.stem.snowball import SnowballStemmer
# Shared English Snowball stemmer, used by tokenize_and_stem().
stemmer = SnowballStemmer("english")

In [16]:
def tokenize_and_stem(text):
    """Return the stems of the word tokens found in ``text``.

    The text is split into sentences and each sentence into word tokens.
    Tokens that contain no ASCII letter (e.g. pure numbers or punctuation)
    are dropped; the remaining tokens are stemmed with the module-level
    ``stemmer`` and returned as one flat list in reading order.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if re.search('[a-zA-Z]', word):
                stems.append(stemmer.stem(word))
    return stems


def tokenize_only(text):
    """Return the lower-cased word tokens found in ``text``, without stemming.

    The text is split into sentences and each sentence into word tokens,
    which are lower-cased. Tokens that contain no ASCII letter (e.g. pure
    numbers or punctuation) are dropped. The result is one flat list in
    reading order.
    """
    lowered = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [word for word in lowered if re.search('[a-zA-Z]', word)]

In [17]:
# Flat vocabularies across all headings, in heading order: one list of stems
# and one list of lower-cased, unstemmed tokens.
Heading_stemmed = [stem for heading in Heading for stem in tokenize_and_stem(heading)]
Heading_tokenized = [token for heading in Heading for token in tokenize_only(heading)]

## Algorithm: TF-IDF features + Multinomial Naive Bayes

In [18]:
# Rows whose Category is 'news' go to Z; every remaining row goes to X.
Z = df.loc[df['Category'] == 'news']
X = df.drop(index=Z.index)

#Y = Y.reset_index().drop(['index'],axis=1)

In [19]:
# Training inputs (headline text) and targets (category labels), both from X.
X_train, Y_train = X['Heading'], X['Category']

In [20]:
# Headlines of the 'news' rows; these are the texts the model predicts on.
Test = Z['Heading']

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Fit the TF-IDF vocabulary and weights on the training headlines only, then
# map the 'news' headlines onto that same fitted vocabulary.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(Test)
# Sanity check: both matrices should have the same number of columns.
print(train_vectors.shape, test_vectors.shape)

(473, 2235) (104, 2235)


In [23]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, fitted on the TF-IDF training
# matrix and the category labels.
clf = MultinomialNB().fit(train_vectors, Y_train)

In [24]:
# Predict a category for each 'news' headline.
predicted = clf.predict(test_vectors)

In [25]:
# Show the predicted labels.
predicted

array(['jobs-and-careers', 'jobs-and-careers', 'jobs-and-careers',
       'admission news', 'admission news', 'jobs-and-careers',
       'jobs-and-careers', 'admission news', 'jobs-and-careers',
       'jobs-and-careers', 'jobs-and-careers', 'jobs-and-careers',
       'jobs-and-careers', 'admission news', 'jobs-and-careers',
       'admission news', 'jobs-and-careers', 'jobs-and-careers',
       'jobs-and-careers', 'entrance exams', 'entrance exams',
       'jobs-and-careers', 'jobs-and-careers', 'jobs-and-careers',
       'admission news', 'jobs-and-careers', 'jobs-and-careers',
       'exams result news', 'exams result news', 'jobs-and-careers',
       'jobs-and-careers', 'entrance exams', 'jobs-and-careers',
       'jobs-and-careers', 'admission news', 'jobs-and-careers',
       'exams result news', 'admission news', 'entrance exams',
       'exams result news', 'jobs-and-careers', 'jobs-and-careers',
       'jobs-and-careers', 'exams result news', 'jobs-and-careers',
       'jobs-a

In [26]:
# Show the 'news' headlines that were classified.
Test

0       ICAR AIEEA 2019 admit card to be released by ...
1       UGC move to thwart ‘ pay and publish trash ’ ...
2       NEST Result 2019 to be declared today at nest...
3       Government gives 3 month grant to 28 DU colleges
4       Lack of BSc courses in evening colleges disap...
5       Delhi government schools to invite parents fo...
6       IGNOU to offer Yoga certification course from...
7       Delhi University ’ s revised admission bullet...
8       HSSC Junior Engineers Recruitment 2019 1624 v...
9       IBPS RRB Recruitment 2019 Application begins ...
10      Private schools still a pipe dream for poor i...
11      Yog to be a subject at school level AYUSH sec...
12      Space Education Centre and Innovation Hub ina...
13      DU registrations closed despite high court ’ ...
14      A friend in need Two friends secure second an...
15        DU likely to release first cut off by June end
16      College of Vocational Studies may soon offer ...
17      DU Admissions 2019 HC o