In [13]:
import os
import re
import warnings
# Suppress every Python warning globally (this also hides library deprecation notices).
warnings.filterwarnings('ignore')

In [14]:
# Core data-handling, plotting and JSON libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

# NLTK: VADER sentiment analyzer and the word tokenizer used for word counts.
# The two nltk.download calls fetch the 'vader_lexicon' and 'punkt' data packages.
# `analyzer` is a module-level instance used by the scoring pipeline below.
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# scikit-learn TF-IDF utilities used to score the words of a sentence
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# spaCy pipeline used for the text / class-name similarity score.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'spacy'" - spaCy and the
# 'en_core_web_sm' model must be installed for `nlp` to be defined.
import spacy
nlp = spacy.load('en_core_web_sm')

# Bokeh imports used to build and save the bar charts
from bokeh.io import output_file,save
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.palettes import Spectral6

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/c1907708/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/c1907708/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


ModuleNotFoundError: No module named 'spacy'

In [15]:
#Reading Data
# function to convert the text file to dataframe
def txt_to_df(position_with_text_file, class_file, mapping_file):
    """
    Build a dataframe from three UTF-8 text files.

    Parameters
    ----------
    position_with_text_file : str
        Tab-separated file (e.g. test.data.txt). The first field of each line
        is stored as the token position and the second field as the text;
        any further fields are ignored. Empty lines are skipped.
    class_file : str
        File with one class label per line (e.g. test.gold.txt).
    mapping_file : str
        File holding a JSON object that maps class labels to class names
        (e.g. classes_map.txt). Every non-blank line is parsed as JSON and
        the last one parsed is used; an empty file yields an empty mapping.

    Returns
    -------
    pandas.DataFrame
        Columns 'Token_Position', 'Text List', 'Class' and 'Class_name',
        where 'Class_name' is 'Class' with the mapping applied (labels
        missing from the mapping are left unchanged).

    Raises
    ------
    IndexError
        If a non-empty data line contains no tab character.
    ValueError
        Raised by pandas when the data file and the class file do not
        contain the same number of records.
    """
    num_list = []
    text_list = []

    # Read the token position and the text from the tab-separated data file
    with open(position_with_text_file, 'r', encoding='utf8') as f:
        for line in f:
            # A trailing empty line must not abort the whole load
            if not line.strip('\r\n'):
                continue
            fields = line.split('\t')
            num_list.append(fields[0])
            text_list.append(fields[1].strip('\n'))

    # Read the class label that belongs to each text
    with open(class_file, 'r', encoding='utf8') as f:
        class_list = [line.strip('\n') for line in f]

    # Read the label -> class name mapping; blank lines are not valid JSON
    # and are skipped, and an empty file leaves the mapping empty
    class_map = {}
    with open(mapping_file, 'r', encoding='utf8') as f:
        for line in f:
            if line.strip():
                class_map = json.loads(line)

    # Creating dataframe from extracted lists
    df = pd.DataFrame({'Token_Position': num_list, 'Text List': text_list, 'Class': class_list})

    # Creating a new column to store the class name
    df['Class_name'] = df['Class'].replace(class_map)
    return df

In [16]:
def word_count_calculation(text):
    """
    Return how many tokens NLTK's word_tokenize produces for the given text
    """
    tokens = word_tokenize(text)
    return len(tokens)

In [17]:
def similarity_calculation(text1, text2):
    """
    Return the spaCy similarity score between two texts (text and class name).

    Both strings are processed with the module-level `nlp` pipeline and the
    first resulting document is compared with the second via its
    `similarity` method.
    """
    doc_a, doc_b = nlp(text1), nlp(text2)
    return doc_a.similarity(doc_b)

In [18]:
def jaccard_similarity(query, document):
    """
    Function to calculate the Jaccard similarity score of two iterables.

    Each argument is converted to a set of its elements, so a plain string
    is compared as a set of characters and a list of tokens as a set of
    tokens.

    Returns
    -------
    float
        len(intersection) / len(union), or 0.0 when both inputs are empty
        (the ratio is undefined there and would otherwise raise
        ZeroDivisionError).
    """
    query_items = set(query)
    document_items = set(document)
    union = query_items | document_items
    if not union:
        return 0.0
    return len(query_items & document_items) / len(union)

In [19]:
def tf_idf_dict(doc):
    """
    Fit a TfidfVectorizer on `doc` and return the tf-idf scores of its
    FIRST document as a dict.

    Parameters
    ----------
    doc : iterable of str
        The corpus; each element is one document.

    Returns
    -------
    dict
        Maps every term of the fitted vocabulary to its tf-idf weight in
        doc[0]. Only row 0 of the tf-idf matrix is read, so terms that do
        not occur in the first document map to 0.0.
        The dict is meant to be passed to tfidf_score_calculator.
    """
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)
    tfidf_matrix = tfidf_vectorizer.fit_transform(doc)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); use whichever the installed version offers
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    # Dense scores of the first document, as plain Python floats
    first_doc_scores = tfidf_matrix[0].toarray().ravel().tolist()
    return dict(zip(feature_names, first_doc_scores))

In [20]:
def tfidf_score_calculator(lis, tfidf_dict):
    """
    Function to calculate a tfidf score for every text in `lis`.

    Each text is split with word_tokenize and its score is the sum of
    tfidf_dict[token] over the tokens found in the dict; unknown tokens
    contribute 0. Tokens are matched exactly (case-sensitively).
    NOTE(review): tf_idf_dict builds its keys with TfidfVectorizer, which
    lowercases by default, so capitalised tokens would never match -
    confirm whether tokens should be lowercased before the lookup.

    Parameters
    ----------
    lis : list of str
        Texts to score.
    tfidf_dict : dict
        Maps a token to its tf-idf weight.

    Returns
    -------
    tuple
        (lis, scores) where scores[i] is the score of lis[i].
    """
    tf_idf_score_list = [
        # Direct dict lookup instead of scanning a list of keys per token
        sum(tfidf_dict.get(token, 0) for token in word_tokenize(text))
        for text in lis
    ]
    return (lis, tf_idf_score_list)

In [21]:
def save_table(df, group_by_col, agg_col_list, report_name):
    """
    Function to calculate summary statistics (min, max, mean, median, std)
    for the input column(s) grouped by group_by_col.
    Also, the output will be written as a csv file with the given report_name.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    group_by_col : column label
        Column to group by.
    agg_col_list : column label or list of column labels
        Column(s) to aggregate. A single label is accepted as before; a
        list of labels (previously a TypeError, as a list cannot be a dict
        key) summarises every listed column.
    report_name : str
        Output path without extension; '.csv' is appended.
    """
    five_num_summary = ['min', 'max', 'mean', 'median', 'std']
    if isinstance(agg_col_list, list):
        agg_columns = agg_col_list
    else:
        agg_columns = [agg_col_list]
    table = df.groupby(group_by_col).agg({col: five_num_summary for col in agg_columns})

    table.to_csv(report_name+'.csv')

In [22]:
def generate_bokeh_bar_plot(df, group_by_col, agg_col, metric, folder_path):
    """
    Function to generate the Bokeh Bar chart for the metric and save it as html.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to aggregate.
    group_by_col : str
        Column whose values become the bars (x axis categories).
    agg_col : str
        Column that is aggregated per group.
    metric : str
        Aggregation name passed to pandas (e.g. 'min', 'mean'); it is also
        used in the plot title and the file name.
    folder_path : str
        Output folder INCLUDING its trailing path separator; the file name
        '<agg_col>_<metric>_plot.html' is appended to it as-is.
    """
    from bokeh.core.validation import silence
    from bokeh.core.validation.warnings import EMPTY_LAYOUT
    silence(EMPTY_LAYOUT, True)

    #Generating agg_df
    groupdf = df.groupby(group_by_col).agg({agg_col:metric})
    groupdf = groupdf.reset_index()
    groupdf[agg_col] = round(groupdf[agg_col], 2)
    plot_file_name = folder_path+agg_col+"_"+metric+"_plot"+'.html'
    plot_title = metric.capitalize()+" Plot of "+agg_col

    # One colour per bar: every ColumnDataSource column needs the same
    # length, so the 6-colour palette is cycled to the number of groups
    # (with exactly 6 groups this is the palette itself)
    bar_colors = [Spectral6[i % len(Spectral6)] for i in range(len(groupdf))]
    source = ColumnDataSource(data=dict(x_col=groupdf[group_by_col], y_col=groupdf[agg_col], color=bar_colors))

    # NOTE(review): render_mode, plot_width and plot_height look like
    # pre-3.0 Bokeh arguments - confirm against the installed Bokeh version
    labels = LabelSet(x='x_col', y='y_col', text='y_col', level='glyph',x_offset=-13.5, y_offset=0, 
                      source=source, render_mode='canvas')
    # Set the x_range to the list of categories above
    p = figure(x_range=groupdf[group_by_col], plot_width=600, plot_height=300, title=plot_title)
    # Categorical values can also be used as coordinates
    p.vbar(x='x_col', top='y_col', color='color', width=0.5, source=source)
    p.add_layout(labels)
    # Set some properties to make the plot look better
    p.xgrid.grid_line_color = None
    p.y_range.start = 0

    #Saving the plot
    output_file(plot_file_name)
    save(p)

In [23]:
#End-to-end pipeline: build the dataframe from the input text files, score every text, then write the summary tables and plots
def end_function_ouput(text_file, class_file, class_map_file, word, sub_folder):
    """
    Function to process the given three text files and generate tables and plots out of it.

    Parameters
    ----------
    text_file, class_file, class_map_file : str
        Paths handed to txt_to_df (data, class labels, class name mapping).
    word : str
        Name of the folder created under '<cwd>/Output'.
    sub_folder : str
        Name of the folder created under '<word>/Tables' and '<word>/Plots'.

    Output layout (relative to the current working directory):
        Output/<word>/Tables/<sub_folder>/<score>.csv
        Output/<word>/Plots/<sub_folder>/<score>/<score>_<metric>_plot.html
    """
    df = txt_to_df(text_file, class_file, class_map_file)

    # Calculating Vader sentiment score and storing it in a column
    df['vader_sentiment_score'] = df['Text List'].apply(analyzer.polarity_scores)

    #Calculating word count for Text
    df['Text_Word_Count'] = df['Text List'].map(word_count_calculation)

    #Creating a column and use it to store cosine similarity score
    df['Cosine_Similarity_Score'] = df[['Text List', 'Class_name']].apply(lambda x:similarity_calculation(*x), axis=1)

    #Creating a column and use it to store Jaccard similarity score
    # NOTE(review): raw strings are passed, so jaccard_similarity compares
    # them character by character - confirm that token-level is not intended
    df['Jaccard_similarity_score'] = df[['Class_name', 'Text List']].apply(lambda x:jaccard_similarity(*x), axis=1)

    #Code block to calculate the tfidf scores for a text and store it as column
    text_list = list(df['Text List'])
    tfidf_score_dict = tf_idf_dict(text_list)
    _, score_list = tfidf_score_calculator(text_list, tfidf_score_dict)
    df['tf_idf_score'] = score_list

    # Code block to store all the parameters in the dict in a separate column
    df['Vader_Positive_Score'] = df['vader_sentiment_score'].apply(lambda x:x['pos'])
    df['Vader_Negative_Score'] = df['vader_sentiment_score'].apply(lambda x:x['neg'])
    df['Vader_Neutral_Score'] = df['vader_sentiment_score'].apply(lambda x:x['neu'])
    df['Vader_Compound_Score'] = df['vader_sentiment_score'].apply(lambda x:x['compound'])

    scores_list = ["Text_Word_Count", "Cosine_Similarity_Score", "Jaccard_similarity_score", "tf_idf_score", 
                    "Vader_Positive_Score","Vader_Neutral_Score", "Vader_Negative_Score", "Vader_Compound_Score"]
    metrics_list = ['min', 'max', 'mean', 'median', 'std']

    # Paths are built with os.path.join so the separators are right on every
    # platform. The trailing "" makes each folder path end with a separator,
    # because save_table and generate_bokeh_bar_plot append file names by
    # plain string concatenation.
    output_root = os.path.join(os.getcwd(), "Output", word)

    #Table path derivation
    Table_path = os.path.join(output_root, "Tables", sub_folder, "")
    os.makedirs(Table_path, exist_ok=True)

    for score in scores_list:
        #Plots path derivation (one folder per score)
        plot_path = os.path.join(output_root, "Plots", sub_folder, score, "")
        os.makedirs(plot_path, exist_ok=True)

        save_table(df, group_by_col="Class_name", agg_col_list=score, report_name=Table_path+score)
        for metric in metrics_list:
            generate_bokeh_bar_plot(df, 'Class_name', score, metric, plot_path)