In [1]:
from nltk.tokenize import sent_tokenize 
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.tag import StanfordNERTagger
import os
import json


import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Change the following environment varaible and classifier file path to match target computer details.

os.environ['JAVAHOME'] = 'C:/Program Files/Java/jdk1.8.0_60/bin/java.exe'
os.environ['CLASSPATH'] = './stanford-ner.jar'
os.environ['STANFORD_MODELS'] = './stanford-corenlp-caseless-2015-04-20-models/edu/stanford/nlp/models/ner'
stanford_classifier = './stanford-corenlp-caseless-2015-04-20-models/edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz' 


In [2]:
orig_text = input("Text to process:")

Text to process:A Virat Kohli Tweet That Thrilled Pakistani Fans. Recently, Kohli was trolled for his Teacher's Day post due to the absence of former India coach Anil Kumble's name in the photo he had shared. Interestingly, Kohli was applauded for the very same post by his Pakistani fans who were thrilled with the star batsman's tribute to ex-cricketers. There are no two ways about it - Virat Kohli is a huge fan favorite in Pakistan. Through the years, the Indian cricket captain's batting brilliance has captivated fans from across the border. The full force of the Kohli phenomenon was witnessed after the Champions Trophy final, with his remarks at the post-match press conference being widely appreciated in Pakistan. Recently, Kohli was trolled for his Teacher's Day post due to the absence of former India coach Anil Kumble's name in the photo he had shared. Interestingly, Kohli was applauded for the very same post by his Pakistani fans who were thrilled with the star batsman's tribute t

In [3]:
# category list for classification
category_list = ["sport", "world", "us", "business", "health", "entertainment", "Science and Technology"]

tok_and_tag = lambda x: pos_tag(word_tokenize(x))

count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()



def get_model():
    """
    
    Returns a Multinomial Naive Bayes model which is pre-fitted with a news classification train data
    stored in train_data.csv in same folder.
    
    The model can also be exported as serialized object with pickel in order to reuse it if training model doesn't get changed.
    
    """
    # above doc string for doumentation of get model function 
    
    # found a training data based on news details and category, above arry cateory_list can categorize
    # 7 differnt category using that data.
    training_data = pd.read_csv('train_data.csv')
    
    # vectorize to convert text into usable numerical form
    X_train_counts = count_vect.fit_transform(training_data.data)

    # transforming for better feature extration
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # using naive bayes algorithm for classification 
    model = MultinomialNB()

    clf = model.fit(X_train_tfidf,training_data.flag)
    return clf



classifier = get_model()



def process_text(text):
    """
    
    Process the text in order to find following in text:
    
    * Text Category
    * People names
    * Place names
    * Organization names
    * Tiragrams Nouns
    * Biagrams Nouns
    * Proper Noun
    * Singular Noun, Plural Noun
    * Nouns

    Returns two values first is python dictionary and second is JSON object.
    
    * First return value python dictionary can be used in python code
    * Second return value json makes it easier for result to be parsed on other programming languages easily like jsvascirpt.
    * To fetch only JSON or Dictionary, wrap the function in other function and return single value in that function.
    
    """
    pos_arr = []
    # divide in sentences 
    for sentence in sent_tokenize(text):
        # part of speech tagging after dividing sentence in words
        pos_arr.append(tok_and_tag(sentence))
        
    # removing 's from the text in order to match nouns properly with biagram and triagrams
    orig_minus_s = text.replace('\'s','')
    
    # loading Stnadord library for place, location and organization tagging
    st = StanfordNERTagger(stanford_classifier)
    
    # collect needed part of speech in different categories.
    adjective = []
    noun = []
    singular_plural_noun = []
    proper_noun = []
    
    for a in pos_arr:
        for b in a:
            if(b[1] in ('JJ','JJR','JJS','VBD','VBN')):
                adjective.append(b[0])
            if(b[1] in ('NN','NNS','NNP','NNPS')):
                noun.append(b[0])
            if(b[1] in ('NN','NNS')):
                singular_plural_noun.append(b[0])
            if(b[1] in ('NNP','NNPS')):
                proper_noun.append(b[0])
    
    # getting biagram and triagrams manually by concatinating nouns, 
    # nltk libraries can also be used but they will require 
    # separating other biagrams with noun biagrams
    
    biagram_noun = []
    triagram_noun = []
    
    for n1 in noun:
        for j1 in adjective:
            j1_n1=j1+" "+n1
            if j1_n1 in orig_minus_s:
                biagram_noun.append(j1_n1)

                for n2 in noun:
                    j1_n1_2 = j1_n1+" "+n2
                    if j1_n1_2 in orig_minus_s:
                        triagram_noun.append(j1_n1_2)

        for n2 in noun:
            n1_n2 = n1+" "+n2
            if n1_n2 in orig_minus_s:
                biagram_noun.append(n1_n2)

            for n3 in noun:
                n1_n2_n3=n1_n2+" "+n3
                if n1_n2_n3 in orig_minus_s:
                    triagram_noun.append(n1_n2_n3)

    # Finding people, location and organizations
    tagged = st.tag(noun)
    
    psudo_person = [] # single name first name or last name but we need person's full name.
    
    place = [] # This will contain location
    person = [] # This will contain persons full name
    organization = [] 

    for b in tagged:
        if b[1] == 'LOCATION':
            place.append(b[0])
        if b[1] == 'PERSON':
            psudo_person.append(b[0])
        if b[1] == 'ORGANIZATION':
            organization.append(b[0])
            
    for b in biagram_noun:
        # Assume every biagram as person
        is_person = 1
        for x in b.split():
            # if any part of biagram is not a person, it is not person.
            if x not in psudo_person:
                is_person=0
        # if both part of biagram is person add biagram in person list.
        if is_person == 1:
            person.append(b)
        
    # checking if any noun which is categorized as person is not in full name and adding it separately
    for pp in psudo_person:
        found = 0
        for p in person:
            # if single name is not contained in any person full name, it is not found
            if pp in p:
                found = 1
        # As a single name is not in person (biagram list) but classified as person, so add it separately.
        if found == 0:
            person.append(pp)
            
    # classify news
    
    doc_news = [text]
    X_news_counts = count_vect.transform(doc_news)
    X_news_tfidf = tfidf_transformer.transform(X_news_counts)
    
    news_class_num = classifier.predict(X_news_tfidf)
    
    # convering into sets to remove douplicates and make list again
    result = {}
    result['Text Topic']=list(set([category_list[news_class_num[0]]]))
    result['People']=list(set(person))
    result['Place']=list(set(place))
    result['Organization']=list(set(organization))
    result['Trigrams - Noun']=list(set(triagram_noun))
    result['Biagrams - Noun']=list(set(biagram_noun))
    result['Proper Noun']=list(set(proper_noun))
    result['Singular Noun, Plural Noun']=list(set(singular_plural_noun))
    result['Noun']=list(set(noun))
    
    # converting into JSON so that it can easily be integrated with other programming languages or web pages
    json_result = json.dumps(result, indent=4)
    return json_result, result

In [4]:
json_res,res  = process_text(orig_text)

In [5]:
# Visualize the results in result dictionary res returned
for k,v in res.items():
    print(k)
    print(v)
    print('-'*100)

Text Topic
['sport']
----------------------------------------------------------------------------------------------------
People
['Javed Miandad', 'Virat Kohli', 'Imzamam-ul-Haq', 'Imran Khan', 'Anil Kumble']
----------------------------------------------------------------------------------------------------
Place
['Pakistan', 'India']
----------------------------------------------------------------------------------------------------
Organization
[]
----------------------------------------------------------------------------------------------------
Trigrams - Noun
['captain batting brilliance', 'Anil Kumble name', 'coach Anil Kumble', 'cricket captain batting', 'Indian cricket captain', 'Virat Kohli Tweet', 'Teacher Day post', 'post-match press conference', 'Thrilled Pakistani Fans', 'India coach Anil', 'huge fan favorite', 'former India coach', 'star batsman tribute']
----------------------------------------------------------------------------------------------------
Biagrams - Noun


In [6]:
# See JSON result
print(json_res)

{
    "Text Topic": [
        "sport"
    ],
    "People": [
        "Javed Miandad",
        "Virat Kohli",
        "Imzamam-ul-Haq",
        "Imran Khan",
        "Anil Kumble"
    ],
    "Place": [
        "Pakistan",
        "India"
    ],
    "Organization": [],
    "Trigrams - Noun": [
        "captain batting brilliance",
        "Anil Kumble name",
        "coach Anil Kumble",
        "cricket captain batting",
        "Indian cricket captain",
        "Virat Kohli Tweet",
        "Teacher Day post",
        "post-match press conference",
        "Thrilled Pakistani Fans",
        "India coach Anil",
        "huge fan favorite",
        "former India coach",
        "star batsman tribute"
    ],
    "Biagrams - Noun": [
        "Thrilled Pakistani",
        "captivated fan",
        "captivated fans",
        "batsman tribute",
        "batting brilliance",
        "Kumble name",
        "same post",
        "Indian cricket",
        "Kohli Tweet",
        "cricket captain",
  

In [7]:
help(process_text)

Help on function process_text in module __main__:

process_text(text)
    Process the text in order to find following in text:
    
    * Text Category
    * People names
    * Place names
    * Organization names
    * Tiragrams Nouns
    * Biagrams Nouns
    * Proper Noun
    * Singular Noun, Plural Noun
    * Nouns
    
    Returns two values first is python dictionary and second is JSON object.
    
    * First return value python dictionary can be used in python code
    * Second return value json makes it easier for result to be parsed on other programming languages easily like jsvascirpt.
    * To fetch only JSON or Dictionary, wrap the function in other function and return single value in that function.



In [8]:
help(get_model)

Help on function get_model in module __main__:

get_model()
    Returns a Multinomial Naive Bayes model which is pre-fitted with a news classification train data
    stored in train_data.csv in same folder.
    
    The model can also be exported as serialized object with pickel in order to reuse it if training model doesn't get changed.

