# JNL Classification App

References:

* https://www.w3schools.com/colors/colors_picker.asp
* https://stackoverflow.com/questions/47949173/deploy-a-python-app-to-heroku-using-conda-environments-instead-of-virtualenv

When deploying, remember to change Bs4 `html5lib` to `html.parser`.

In [1]:
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import punkt
from nltk.corpus.reader import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
from bs4 import BeautifulSoup
import numpy as np
import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_table
import dash_renderer
from dash.dependencies import Input, Output, State
import plotly.graph_objs as go
import re

## 1. Importing inputs

### 1.1. Trained Model

The best performing model is the SVM. We'll use it in the app.

In [2]:
path_models = "../../2_Machine_Learning/3_Model_Training/Models/"

# SVM
path_svm = path_models + 'best_svc.pickle'
with open(path_svm, 'rb') as data:
    svc_model = pickle.load(data)

### 1.2. TF-IDF object

In [3]:
path_tfidf = "../../2_Machine_Learning/2_Feature_Engineering/Pickles/tfidf.pickle"

with open(path_tfidf, 'rb') as data:
    tfidf = pickle.load(data)

### 1.3. Category mapping dictionary

In [4]:
category_codes = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'tech': 4,
    'other':5
}

## 2. Definition of functions

### 2.1. Feature Engineering Functions

In [5]:
punctuation_signs = list("?:!.,;")
stop_words = list(stopwords.words('english'))

def create_features_from_text(text):
    
    # Dataframe creation
    data = [[text]]
    df = pd.DataFrame(data, columns = ['Content'])
    df['Content_Parsed_1'] = df['Content'].str.replace("\r", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("\n", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("    ", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace('"', '')
    df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()
    df['Content_Parsed_3'] = df['Content_Parsed_2']
    for punct_sign in punctuation_signs:
        df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '')
    df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "")
    lemmatized_text_list = []
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_list = []
    text = df.loc[0]['Content_Parsed_4']
    text_words = text.split(" ")
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
    lemmatized_text = " ".join(lemmatized_list)    
    lemmatized_text_list.append(lemmatized_text)
    df['Content_Parsed_5'] = lemmatized_text_list[0]
    df['Content_Parsed_6'] = df['Content_Parsed_5']
    for stop_word in stop_words:
        regex_stopword = r"\b" + stop_word + r"\b"
        df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '')
    
    df = df.rename(columns={'Content_Parsed_6': 'Content_Parsed'})
    df = df['Content_Parsed']
    # TF-IDF
    features = tfidf.transform(df).toarray()
    
    return features

In [6]:
def get_category_name(category_id):
    for category, id_ in category_codes.items():    
        if id_ == category_id:
            return category

### 2.2. Prediction Functions

In [7]:
def predict_from_text(text):
    
    # Predict using the input model
    pred = svc_model.predict(create_features_from_text(text))
    prediction_svc = pred[0]
    prediction_svc_proba = svc_model.predict_proba(create_features_from_text(text))[0]
    
    print("pred: %s." %(pred))
    print("prediction_svc: %s." %(prediction_svc))
    print("prediction_svc_proba: %s." %(prediction_svc_proba))
    
    # Return result
    category_svc = get_category_name(prediction_svc)
    
    print("The predicted category using the SVM model is %s." %(category_svc) )
    print("The conditional probability is: %a" %(prediction_svc_proba.max()*100))
    
    returnStatement = "The predicted category using the SVM model is: "
    returnStatement += str(category_svc)
    returnStatement += ".\nThe conditional probability is: "
    returnStatement += str(prediction_svc_proba.max()*100)
    
    return returnStatement, category_svc

In [8]:
def complete_df(df, categories):
    df['Prediction'] = categories
    return df

Finally, the whole process can be written in these 4 lines of code:

```python
# Predict
predictions = predict_from_text(value)
    
#Output
if n_clicks > 0:
    return 'You have entered: \n{}'.format(predictions)
```

## 3. Dash App

In [9]:
# Stylesheet
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

# Colors
colors = {
    'background': '#ECECEC',  
    'text': '#696969',
    'titles': '#599ACF',
    'blocks': '#F7F7F7',
    'graph_background': '#F7F7F7',
    'banner': '#C3DCF2'

}

# Markdown text
markdown_text1 = '''

This application takes in TC Journal File Logs, predicts their category between **Politics**, **Business**, **Entertainment**, **Sport**, **Tech** and **Other** and then shows a summary.

The logs are converted into a numeric feature vector with *TF-IDF vectorization*. Then, a *Support Vector Classifier* is applied to predict each category.

This app is meant for POC purposes.

Please enter the logs and press the **Classify** button.

'''

markdown_text2 = '''

 Created by Tejas Bhatia. Visit the [github repo](https://github.com/imp924/jnlPrediction).

 *Disclaimer: this app is a Proof-of-Concept Side Project. Only a small section of Teamcenter logs are tested.*

'''



app.layout = html.Div(style={'backgroundColor':colors['background']}, children=[
    
    # Space before title
    html.H1(children=' ',
            style={'padding': '10px'}
           ),
    
    # Title
    html.Div(
        [
            html.H3(children='JNL Classification App',
                    style={"margin-bottom": "0px"}
                   ),
            html.H6(children='A Machine Learning based app')
        ],
        style={
            'textAlign': 'center',
            'color': colors['text'],
            #'padding': '0px',
            'backgroundColor': colors['background']
              },
        className='banner',
            ),
    

    # Space after title
    html.H1(children=' ',
            style={'padding': '1px'}),


    # Text boxes
    html.Div(
        [
            html.Div(
                [
                    html.H6(children='What does this app do?',
                            style={'color':colors['titles']}),
                    
                    html.Div(
                        [dcc.Markdown(children=markdown_text1),],
                        style={'font-size': '12px',
                               'color': colors['text']}),
                    
                    html.Div([
                        dcc.Textarea(
                            id='textarea-state-example',
                            value='Textarea content initialized\nwith multiple lines of text',
                            style={'width': '100%', 'height': 200},
                        ),
                        html.Button('Submit', id='textarea-state-example-button', n_clicks=0)
                        
                    ])                                                  
                ],
                     style={'backgroundColor': colors['blocks'],
                            'padding': '20px',
                            'border-radius': '5px',
                            'box-shadow': '1px 1px 1px #9D9D9D'},
                     className='one-half column'),
            
            html.Div(
                [
                    html.H6("Graphic summary",
                            style={'color': colors['titles']}),
                    
                    html.Div(id='textarea-state-example-output', style={'whiteSpace': 'pre-line'}),
                    
                    html.Div([
                         dcc.Graph(id='graph2', style={'height': '300px'})
                         ],
                         style={'backgroundColor': colors['blocks'],
                                'padding': '20px'}
                    )
                ],
                     style={'backgroundColor': colors['blocks'],
                            'padding': '20px',
                            'border-radius': '5px',
                            'box-shadow': '1px 1px 1px #9D9D9D'},
                     className='one-half column')

        ],
        className="row flex-display",
        style={'padding': '20px',
               'margin-bottom': '0px'}
    ),
    
        
    # Space
    html.H1(id='space2', children=' '),
        
    
    # Final paragraph
    html.Div(
            [dcc.Markdown(children=markdown_text2),],
            style={'font-size': '12px',
                   'color': colors['text']}),

    
    # Hidden div inside the app that stores the intermediate value
    html.Div(id='intermediate-value', style={'display': 'none'})
    

])

@app.callback(
    Output('textarea-state-example-output', 'children'),
    Output('graph2', 'figure'),
    Input('textarea-state-example-button', 'n_clicks'),
    State('textarea-state-example', 'value')
)
def scrape_and_predict(n_clicks, value):
            

    # Predict
    predStatement, predictions = predict_from_text(value)
    
    #Output
    statement = ""
    if n_clicks > 0:
        statement = 'You have entered: \n{}'.format(predStatement)
    
    data = [[predictions]]
    df = pd.DataFrame(data, columns = ['Prediction'])
    
    #dfJson = df.to_json(date_format='iso', orient='split'), ' '
    figure = update_piechart(df)
    
    return statement, figure
        
def update_piechart(df):
    
    # Create a summary df
    df_sum = df['Prediction'].value_counts()

    # Create x and y arrays for the bar plot
    x = ['Politics', 'Business', 'Entertainment', 'Sport', 'Tech', 'Other']
    y = [[df_sum['politics'] if 'politics' in df_sum.index else 0][0],
         [df_sum['business'] if 'business' in df_sum.index else 0][0],
         [df_sum['entertainment'] if 'entertainment' in df_sum.index else 0][0],
         [df_sum['sport'] if 'sport' in df_sum.index else 0][0],
         [df_sum['tech'] if 'tech' in df_sum.index else 0][0],
         [df_sum['other'] if 'other' in df_sum.index else 0][0]]
    
    # Create plotly figure
    figure = {
        'data': [
            {'values': y,
             'labels': x, 
             'type': 'pie',
             'hole': .4,
             'name': '% of news articles',
             'marker': {'colors': ['rgb(62, 137, 195)',
                                   'rgb(167, 203, 232)',
                                   'rgb(197, 223, 242)',
                                   'rgb(51, 113, 159)',
                                   'rgb(64, 111, 146)',
                                   'rgb(31, 84, 132)']},

            }
        ],
        
        'layout': {
            'title': 'News articles by newspaper',
            'plot_bgcolor': colors['graph_background'],
            'paper_bgcolor': colors['graph_background'],
            'font': {
                    'color': colors['text'],
                    'size': '10'
            }
        }
        
    }
    
    return figure
    
    
    
# Loading CSS
app.css.append_css({"external_url": "https://codepen.io/chriddyp/pen/bWLwgP.css"})
app.css.append_css({"external_url": "https://codepen.io/chriddyp/pen/brPBPO.css"})


In [None]:
app.run_server(debug=False)

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
If you added this file with `app.scripts.append_script` or `app.css.append_css`, use `external_scripts` or `external_stylesheets` instead.
See https://dash.plot.com/external-resources
If you added this file with `app.scripts.append_script` or `app.css.append_css`, use `external_scripts` or `external_stylesheets` instead.
See https://dash.plot.com/external-resources
127.0.0.1 - - [23/Feb/2021 21:55:13] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [23/Feb/2021 21:55:15] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [23/Feb/2021 21:55:15] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [23/Feb/2021 21:55:16] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


pred: [3].
prediction_svc: 3.
prediction_svc_proba: [0.13606028 0.32575855 0.04956321 0.42831871 0.06029925].
The predicted category using the SVM model is sport.
The conditional probability is: 42.83187113511851


127.0.0.1 - - [23/Feb/2021 21:55:24] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -


pred: [2].
prediction_svc: 2.
prediction_svc_proba: [7.80802608e-02 9.69389503e-04 9.19216468e-01 5.03485054e-04
 1.23039642e-03].
The predicted category using the SVM model is politics.
The conditional probability is: 91.92164681964248


In [None]:
predictions = "business"
#df_show_info = pd.DataFrame()
#df = complete_df(df_show_info, predictions)

data = [[predictions]]
df = pd.DataFrame(data, columns = ['Prediction'])
#df = pd.read_json(jsonified_df, orient='split')
print(df.head())
# Create a summary df
df_sum = df['Prediction'].value_counts()
print("------------------")
print(df_sum.head())
# Create x and y arrays for the bar plot
x = ['Politics', 'Business', 'Entertainment', 'Sport', 'Tech', 'Other']
y = [[df_sum['politics'] if 'politics' in df_sum.index else 0][0],
     [df_sum['business'] if 'business' in df_sum.index else 0][0],
     [df_sum['entertainment'] if 'entertainment' in df_sum.index else 0][0],
     [df_sum['sport'] if 'sport' in df_sum.index else 0][0],
     [df_sum['tech'] if 'tech' in df_sum.index else 0][0],
     [df_sum['other'] if 'other' in df_sum.index else 0][0]]

print(x)
print(y)
# Create plotly figure
figure = {
    'data': [
        {'values': y,
         'labels': x, 
         'type': 'pie',
         'hole': .4,
         'name': '% of news articles',
         'marker': {'colors': ['rgb(62, 137, 195)',
                               'rgb(167, 203, 232)',
                               'rgb(197, 223, 242)',
                               'rgb(51, 113, 159)',
                               'rgb(64, 111, 146)',
                               'rgb(31, 84, 132)']},

        }
    ],
    
    'layout': {
        'title': 'News articles by newspaper',
        'plot_bgcolor': colors['graph_background'],
        'paper_bgcolor': colors['graph_background'],
        'font': {
                'color': colors['text'],
                'size': '10'
        }
    }
    
}