In [None]:
#setup imports and paths
import os
import sys
from os.path import expanduser


# Make the user's ~/packages directory importable.
# The append is guarded so that re-running this cell does not keep adding
# duplicate entries to sys.path.
HOME_DIR = expanduser("~")
PACKAGES_DIR = HOME_DIR+'/packages'
if PACKAGES_DIR not in sys.path:
    sys.path.append(PACKAGES_DIR)

In [None]:
#load custom Midas tools
# NOTE(review): Midas is presumably a project-local package resolved through the
# ~/packages entry added to sys.path in the setup cell — confirm.
from Midas import Midas_helper
# Shared helper instance; a later cell calls helper.cd_main_data() on it.
helper = Midas_helper()

In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
# NOTE(review): cd_main_data() presumably switches the working directory to the
# folder that holds the CSV read below — confirm against Midas_helper.
helper.cd_main_data()
#load main Midas labelled data table
df = pd.read_csv('midas_labeled_data_Q12018.csv')
# Notes without a final grade become their own 'No Score' class.
# The result is assigned back to the column rather than using inplace=True on the
# selected column, which is not guaranteed to modify df under pandas copy-on-write.
df['midas_final_unstructured'] = df['midas_final_unstructured'].fillna('No Score')
df.head()

In [None]:
def make_wordcloud(txt):
    """Render a word cloud of ``txt`` on a 9x9 inch figure and show it.

    The cloud uses a white background, at most 100 words and a maximum
    font size of 50. Nothing is returned; the figure is displayed.
    """
    cloud_builder = WordCloud(max_font_size=50, max_words=100, background_color="white")
    cloud = cloud_builder.generate(txt)

    # Draw the generated image on its own figure, without axis ticks or frame.
    fig, ax = plt.subplots(figsize=(9, 9))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis("off")
    plt.show()

In [None]:
# Concatenate every cleaned note into one space-separated string.
all_notes = df['cleaned_note_unstructured'].tolist()
whole_txt = ' '.join(all_notes)
    

# Class Distribution - number of texts in each grade

In [None]:
# Count the notes in each class with a single pass over the label column.
# class_names / row_nums stay as parallel lists, in order of first appearance,
# because a later cell zips them into a dict.
label_counts = df['midas_final_unstructured'].value_counts()
class_names = df['midas_final_unstructured'].unique().tolist()
row_nums = [int(label_counts.get(name, 0)) for name in class_names]

fig, ax = plt.subplots()
width = 0.75 # the width of the bars
ind = np.arange(len(row_nums))  # the y locations for the groups
ax.barh(ind, row_nums, width, color="blue")
# barh centres each bar on its y position by default, so the ticks belong at
# `ind` itself rather than at ind + width/2.
ax.set_yticks(ind)
ax.set_yticklabels(class_names, minor=False)
for i, v in enumerate(row_nums):
    # Print the count just to the right of each bar, vertically centred on it.
    ax.text(v + 3, i, str(v), color='blue', fontweight='bold', va='center')
ax.set_title('Distribution of Classes in the training data')
ax.set_xlabel('Number of samples (Texts)')
ax.set_ylabel('Classes -Grades')
plt.show()

# Word representation of the whole data

In [None]:
# Use the shared make_wordcloud helper instead of repeating its body inline,
# so every cloud in the notebook is drawn with the same settings.
make_wordcloud(whole_txt)

# Word representation 'No Score'

In [None]:
# Concatenate every note labelled 'No Score' into one string.
no_txt = ' '.join(df[df['midas_final_unstructured'] == "No Score"]['cleaned_note_unstructured'].values.tolist())

# Display the generated image. make_wordcloud builds the cloud itself, so the
# separate WordCloud(...).generate(no_txt) call was redundant work and is removed.
make_wordcloud(no_txt)

# Word representation 'Invalid'

In [None]:
# All notes labelled 'Invalid', joined into one string and drawn as a cloud.
invalid = ' '.join(
    df.loc[df['midas_final_unstructured'] == "Invalid", 'cleaned_note_unstructured'].tolist()
)
make_wordcloud(invalid)

# Word representation 'Grade 1'

In [None]:
# All notes labelled 'Grade I', joined into one string and drawn as a cloud.
grade1 = ' '.join(
    df.loc[df['midas_final_unstructured'] == "Grade I", 'cleaned_note_unstructured'].tolist()
)
make_wordcloud(grade1)

# Word representation 'Grade 2'

In [None]:
# All notes labelled 'Grade II', joined into one string and drawn as a cloud.
grade2 = ' '.join(
    df.loc[df['midas_final_unstructured'] == "Grade II", 'cleaned_note_unstructured'].tolist()
)
make_wordcloud(grade2)

# Word representation 'Grade 3'

In [None]:
# All notes labelled 'Grade III'. The filter previously selected "Grade II",
# so this cloud was a copy of the Grade II one.
grade3 = ' '.join(df[df['midas_final_unstructured'] == "Grade III"]['cleaned_note_unstructured'].values.tolist())
make_wordcloud(grade3)

# Word representation 'Grade 4'

In [None]:
# All notes labelled 'Grade IV'. The filter previously selected "Grade II",
# so this cloud was a copy of the Grade II one.
grade4 = ' '.join(df[df['midas_final_unstructured'] == "Grade IV"]['cleaned_note_unstructured'].values.tolist())
make_wordcloud(grade4)

# Word clouds after removing common words

# Unique words

In [None]:
# Split each class's concatenated text into a list of whitespace-separated words.
(no_score_words, invalid_words, grade1_words,
 grade2_words, grade3_words, grade4_words) = (
    class_txt.split()
    for class_txt in (no_txt, invalid, grade1, grade2, grade3, grade4)
)


# Average number of words in each class

In [None]:
# Map each class label to the number of texts carrying that label.
classs_num_texts_dict = {
    label: n_texts for label, n_texts in zip(class_names, row_nums)
}
classs_num_texts_dict

In [None]:
# Mean number of whitespace-separated words per text, for each class.
noscore_avg = len(no_score_words)/classs_num_texts_dict['No Score']
invalid_avg = len(invalid_words)/classs_num_texts_dict['Invalid']
grade1_avg = len(grade1_words)/classs_num_texts_dict['Grade I']
grade2_avg = len(grade2_words)/classs_num_texts_dict['Grade II']
grade3_avg = len(grade3_words)/classs_num_texts_dict['Grade III']
grade4_avg = len(grade4_words)/classs_num_texts_dict['Grade IV']

# Labels are listed in the same order as the values. Taking them from the dict
# keys (order of appearance in the data) could attach a bar to the wrong class.
x = ['No Score', 'Invalid', 'Grade I', 'Grade II', 'Grade III', 'Grade IV']
y = [noscore_avg, invalid_avg, grade1_avg, grade2_avg, grade3_avg, grade4_avg]
y = [int(i) for i in y]  # truncate to whole words for display
fig, ax = plt.subplots()
width = 0.75 # the width of the bars
ind = np.arange(len(y))  # the y locations for the groups
ax.barh(ind, y, width, color="orange", alpha=0.5)
# barh centres each bar on its y position by default, so the ticks belong at
# `ind` itself rather than at ind + width/2.
ax.set_yticks(ind)
ax.set_yticklabels(x, minor=False)
for i, v in enumerate(y):
    # Print the average just to the right of each bar, vertically centred on it.
    ax.text(v + 3, i, str(v), color='blue', fontweight='bold', va='center')
plt.title('Average number of words in each of the 6 classes')
# The x axis shows words per text, not a number of samples.
plt.xlabel('Average number of words per text')
plt.ylabel('Classes -Grades')
plt.show()

# Unique words in 'No Score'

In [None]:
# Words that occur in 'No Score' notes and in no other class.
unique_noscore = set(no_score_words) - set().union(
    invalid_words, grade1_words, grade2_words, grade3_words, grade4_words
)
# Guard the empty case like the Grade II-IV cells do: a word cloud cannot be
# generated from empty text.
if not unique_noscore:
    print('No unique words in No Score')
else:
    unique_noscore_txt = ' '.join(unique_noscore)
    make_wordcloud(unique_noscore_txt)

# Unique words in 'Invalid'

In [None]:
# Words that occur in 'Invalid' notes and in no other class.
unique_invalid = set(invalid_words) - set().union(
    no_score_words, grade1_words, grade2_words, grade3_words, grade4_words
)
# Guard the empty case like the Grade II-IV cells do: a word cloud cannot be
# generated from empty text.
if not unique_invalid:
    print('No unique words in Invalid')
else:
    unique_invalid_txt = ' '.join(unique_invalid)
    make_wordcloud(unique_invalid_txt)

# Unique words in 'Grade I'

In [None]:
# Words that occur in 'Grade I' notes and in no other class.
unique_grade1 = set(grade1_words) - set().union(
    no_score_words, invalid_words, grade2_words, grade3_words, grade4_words
)
# Guard the empty case like the Grade II-IV cells do: a word cloud cannot be
# generated from empty text.
if not unique_grade1:
    print('No unique words in Grade I')
else:
    unique_grade1_txt = ' '.join(unique_grade1)
    make_wordcloud(unique_grade1_txt)

# Unique words in 'Grade II'

In [None]:
# Words that occur in 'Grade II' notes and in no other class.
unique_grade2 = set(grade2_words) - set().union(
    no_score_words, invalid_words, grade1_words, grade3_words, grade4_words
)
if unique_grade2:
    unique_grade2_txt = ' '.join(unique_grade2)
    make_wordcloud(unique_grade2_txt)
else:
    # Nothing to draw when the class has no words of its own.
    print('No unique words in Grade II')

# Unique words in 'Grade III'

In [None]:
# Words that occur in 'Grade III' notes and in no other class.
unique_grade3 = set(grade3_words) - set().union(
    no_score_words, invalid_words, grade1_words, grade2_words, grade4_words
)
if unique_grade3:
    unique_grade3_txt = ' '.join(unique_grade3)
    make_wordcloud(unique_grade3_txt)
else:
    # Nothing to draw when the class has no words of its own.
    print('No unique words in Grade III')

# Unique words in 'Grade IV'

In [None]:
# Words that occur in 'Grade IV' notes and in no other class.
unique_grade4 = set(grade4_words) - set().union(
    no_score_words, invalid_words, grade1_words, grade2_words, grade3_words
)
if unique_grade4:
    unique_grade4_txt = ' '.join(unique_grade4)
    make_wordcloud(unique_grade4_txt)
else:
    # Nothing to draw when the class has no words of its own.
    print('No unique words in Grade IV')