## Text visualisation in a numerical 3-dimensional space for articles in the schizophrenia area

In [1]:
# Term Frequency – Inverse Document Frequency (TF – IDF).
from sklearn.feature_extraction.text import TfidfVectorizer

# t-distributed Stochastic Neighbor Embedding.
from sklearn.manifold import TSNE

# Used for text preprocessing and part of speech tagging.
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used below (a no-op when already present).
# "stopwords" and "wordnet" back the stop-word list and the WordNet lemmatizer;
# "punkt" / "punkt_tab" hold the tokenizer models that word_tokenize() loads.
# Which of the two punkt packages is needed depends on the installed NLTK
# release; nltk.download() reports (rather than raises on) an id it does not
# know, so requesting both is safe.
nltk.download("stopwords")
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

import os
import string

# Used for data representation.
import pandas as pd
import numpy as np

# Used for visualisation.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import proj3d
import pylab as pyl
from mpl_toolkits.mplot3d import axes3d


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ksenia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ksenia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Directory that holds the corpus; the loading cell reads every "*.txt" file found in it.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
PATH_TEST_SET = 'C:/Users/Ksenia/texts_analysing_system/UPDATED_schizophrenia_area/schizophrenia_dataset/'

In [3]:
# Load the corpus: one document per "*.txt" file in PATH_TEST_SET.
#   file_name_mapping: document index -> file name
#   test_set:          document texts, stored at the same index
file_name_mapping = {}
test_set = []
count = 0

# Sorted so that the index -> file assignment does not depend on the
# directory-listing order of the operating system.
for file_name in sorted(os.listdir(PATH_TEST_SET)):
    if not file_name.endswith(".txt"):
        continue

    # Read raw bytes and decode explicitly: this works on Python 2 and 3 alike
    # (a Python 3 text-mode line is a str and has no .decode()).
    # Bytes outside ASCII are silently dropped.
    with open(os.path.join(PATH_TEST_SET, file_name), 'rb') as f:
        document = f.read().decode("ascii", "ignore")

    # Exactly one entry per file - even for an empty or multi-line file - so
    # test_set[i] always belongs to file_name_mapping[i].
    file_name_mapping[count] = file_name
    test_set.append(document)
    count += 1

print('***** File Name Mapping *****')
for k, v in sorted(file_name_mapping.items()):
    print('{}: "{}"'.format(k, v))

***** File Name Mapping *****
0: "Association of Hormonal Contraception With Depression.txt"
1: "Behavioral Interventions for Antipsychotic Medication Associated Obesity.txt"
2: "Care for Adolescents with Depression in Primary Care Settings.txt"
3: "Cigarette Smoking and the Onset and Persistence of Panic Attacks During Mid-Adulthood in the United States.txt"
4: "Efficacy of Topiramate in the Treatment of Crack Cocaine Dependence.txt"
5: "Efficacy, Acceptability, and Tolerability of Antipsychotics in Treatment-Resistant Schizophrenia.txt"
6: "Exaggerated Acquisition and Resistance to Extinction of Avoidance Behavior in Treated Heroin-Dependent Men.txt"
7: "Short-term Suicide Risk After Psychiatric Hospital Discharge.txt"
8: "Treatment Preferences of Psychotherapy Patients with Chronic PTSD.txt"
9: "Use of Acetaminophen (Paracetamol) During Pregnancy .txt"


In [4]:
# Scatter colour per document: the position of a colour in the list below is
# the document index it is stored under in the resulting dict.
colors_mapping = dict(enumerate([
    'navy',
    'darkmagenta',
    'chartreuse',
    'blue',
    'plum',
    'gold',
    'dimgrey',
    'red',
    'darkgreen',
    'darkgoldenrod',
]))

In [5]:
def text_preprocessing(test_set):
    """Tokenize, stop-word filter and lemmatize every document.

    Args:
        test_set: iterable of document strings.

    Returns:
        A list with one string per input document: the surviving,
        lemmatized tokens joined by single spaces.
    """
    # Loop-invariant work hoisted out of the loops: the stop-word list is
    # loaded once (as a set, for O(1) membership tests) instead of once per
    # token, and a single lemmatizer is shared by all documents.
    english_stopwords = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    prepared_test_set = []
    for document in test_set:
        # tokenization - process of converting a text into tokens
        tokens = word_tokenize(document)

        # remove stop-words (the comparison is case-sensitive, as before)
        filtered_doc_words = [word for word in tokens if word not in english_stopwords]

        # lemmatization - reduce each word to its dictionary form
        lemmatization_words = [wordnet_lemmatizer.lemmatize(word) for word in filtered_doc_words]

        prepared_test_set.append(' '.join(lemmatization_words))

    return prepared_test_set

# Pre-processed corpus: one cleaned string per document, in the order of test_set.
prepared_test_set = text_preprocessing(test_set)

In [6]:
# Shared Porter stemmer instance; tokenize() passes it to stem_tokens().
stemmer = nltk.stem.porter.PorterStemmer()

def stem_tokens(tokens, stemmer):
    """Return a list with ``stemmer.stem(token)`` for every token, in input order."""
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split text into word tokens, drop punctuation tokens and stem the rest.

    Args:
        text: the document string to tokenize.

    Returns:
        A list of stemmed tokens, produced by stem_tokens() with the
        module-level ``stemmer``.
    """
    # A token counts as punctuation when every one of its characters is a
    # punctuation character. The previous test, `token not in
    # string.punctuation`, was a substring test against one long string and
    # therefore let multi-character punctuation tokens such as "``", "''",
    # "..." or "--" slip through.
    punctuation_chars = set(string.punctuation)
    tokens = [
        token for token in nltk.word_tokenize(text)
        if not all(char in punctuation_chars for char in token)
    ]
    return stem_tokens(tokens, stemmer)


In [7]:
def get_ngramms_repr(ngramm_value, documents=None):
    """Build the TF-IDF representation of the corpus for the given n-gram range.

    Args:
        ngramm_value: ``(min_n, max_n)`` tuple passed to TfidfVectorizer as
            ``ngram_range``, e.g. ``(4, 4)`` to extract 4-grams only.
        documents: optional iterable of document strings; defaults to the
            module-level ``prepared_test_set``.

    Returns:
        A ``(ngramm_representation, features)`` pair:
        ngramm_representation is the dense TF-IDF matrix transposed, i.e. one
        row per n-gram and one column per document; features is the list of
        n-gram strings, where features[i] names row i.
    """
    if documents is None:
        documents = prepared_test_set

    corpus_tfidf = TfidfVectorizer(
        ngram_range=ngramm_value, stop_words='english', tokenizer=tokenize)
    corpus_representation = corpus_tfidf.fit_transform(documents)

    # get_feature_names() was removed from scikit-learn in favour of
    # get_feature_names_out(); use whichever the installed release provides.
    if hasattr(corpus_tfidf, 'get_feature_names_out'):
        features = list(corpus_tfidf.get_feature_names_out())
    else:
        features = corpus_tfidf.get_feature_names()

    # fit_transform() yields documents x n-grams; transpose so that every row
    # describes one n-gram.
    ngramm_representation = corpus_representation.toarray().transpose()

    return ngramm_representation, features

In [8]:
def visualize3DData(X, mappingfeatures, features, ngramm):
    """Show the rows of X as an interactive 3-D scatter plot.

    Points are coloured by document (module-level ``colors_mapping``) and
    labelled in the legend with the document's file name (module-level
    ``file_name_mapping``). Moving the mouse annotates the data point closest
    to the cursor with its entry from ``features``.

    Args:
        X: 2-D array; columns 0, 1 and 2 are used as the x, y and z coordinates.
        mappingfeatures: dict mapping a document index to the row indices of X
            that belong to that document.
        features: sequence of labels; features[i] is shown for row i of X.
        ngramm: value inserted into the plot title.
    """
    fig = plt.figure(figsize=(50, 50))
    ax = fig.add_subplot(111, projection='3d')
    point_count = int(X.shape[0])

    # Assign every point to exactly one document. When a point is listed for
    # several documents the highest document index wins, which is what the
    # former chain of independent `if` statements did.
    point_owner = {}
    for doc_id in sorted(mappingfeatures):
        for point_index in mappingfeatures[doc_id]:
            point_owner[point_index] = doc_id

    # Group ALL rows (the previous loop stopped one short and never drew the
    # last point) so each document needs a single scatter() call.
    rows_by_doc = {}
    unassigned_rows = []
    for point_index in range(point_count):
        if point_index in point_owner:
            rows_by_doc.setdefault(point_owner[point_index], []).append(point_index)
        else:
            unassigned_rows.append(point_index)

    marker_style = dict(marker='o', alpha=0.5, s=120)
    for doc_id in sorted(rows_by_doc):
        rows = rows_by_doc[doc_id]
        ax.scatter(
            X[rows, 0], X[rows, 1], X[rows, 2],
            # .get(): a document without a configured colour falls back to
            # matplotlib's default instead of raising KeyError.
            c=colors_mapping.get(doc_id),
            label='{}'.format(file_name_mapping[doc_id]),
            **marker_style)
    if unassigned_rows:
        # Points that belong to no document: drawn without colour or label.
        ax.scatter(
            X[unassigned_rows, 0], X[unassigned_rows, 1], X[unassigned_rows, 2],
            **marker_style)

    ax.set_title('{N}-grams document visualisation in 3d space,\n based on TF-IDF matrix \n \n'.format(N=ngramm))
    if point_count:
        ax.set_xlim(X[:, 0].min(), X[:, 0].max())
        ax.set_ylim(X[:, 1].min(), X[:, 1].max())
        ax.set_zlim(X[:, 2].min(), X[:, 2].max())
    ax.legend(loc='lower left', ncol=2, fontsize=10, bbox_to_anchor=(0, 0))

    def distance(point, event):
        # return distance (in screen pixels) between mouse position and given data point

        # The shape is wrapped in a 1-tuple so that the message also formats
        # for shapes with more than one dimension.
        assert point.shape == (3,), "distance: point.shape is wrong: %s, must be (3,)" % (point.shape,)

        # Project 3d data space to 2d data space
        x2, y2, _ = proj3d.proj_transform(point[0], point[1], point[2], ax.get_proj())
        # Convert 2d data space to 2d screen space
        x3, y3 = ax.transData.transform((x2, y2))

        return np.sqrt((x3 - event.x) ** 2 + (y3 - event.y) ** 2)

    def calcClosestDatapoint(X, event):
        # calculate which data point is closest to the mouse position
        distances = [distance(X[i, 0:3], event) for i in range(X.shape[0])]
        return np.argmin(distances)

    # Holds the annotation currently shown (None before the first one); a dict
    # is used because the nested function has to rebind the value.
    annotation_state = {'label': None}

    def annotatePlot(X, index):
        # create popover label in 3d chart
        # if we have previously displayed another label, remove it first
        if annotation_state['label'] is not None:
            annotation_state['label'].remove()
        # Get data point from array of points X, at position index
        x2, y2, _ = proj3d.proj_transform(X[index, 0], X[index, 1], X[index, 2], ax.get_proj())
        annotation_state['label'] = ax.annotate(
            '{}'.format(features[index]),
            xy=(x2, y2), xytext=(-60, 60), textcoords='offset points', ha='right',
            va='top', fontsize=20,
            bbox=dict(boxstyle='round,pad=0.8', fc='yellow', alpha=1, ec=(1., .5, .5)),
            arrowprops=dict(arrowstyle='fancy', connectionstyle='arc3,rad=0'))
        fig.canvas.draw()

    def onMouseMotion(event):
        # event that is triggered when mouse is moved
        # shows text annotation over data point closest to mouse
        closestIndex = calcClosestDatapoint(X, event)
        annotatePlot(X, closestIndex)

    if point_count:
        # With no points there is nothing to annotate (argmin of an empty
        # list would raise), so the handler is only attached when needed.
        fig.canvas.mpl_connect('motion_notify_event', onMouseMotion)
    plt.show()

In [9]:
def make_visualisation(get_ngramm_repr, features, reduced_matr, ngramm):
    """Work out which n-grams occur in which document and plot the reduced points.

    Args:
        get_ngramm_repr: 2-D TF-IDF array with one row per n-gram and one
            column per document.
        features: sequence of n-gram labels, passed through to visualize3DData().
        reduced_matr: array of 3-D coordinates, one row per n-gram, passed to
            visualize3DData() as the points to draw.
        ngramm: value passed through to visualize3DData() for the plot title.
    """
    tfidf_matrix = np.asarray(get_ngramm_repr)
    _, texts_number = tfidf_matrix.shape

    # document index -> row indices of the n-grams with a positive TF-IDF
    # weight in that document
    mappingfeatures = {}
    for doc_index in range(int(texts_number)):
        doc_weights = tfidf_matrix[:, doc_index]
        mappingfeatures[doc_index] = np.flatnonzero(doc_weights > 0.0).tolist()

    visualize3DData(reduced_matr, mappingfeatures, features, ngramm)
        

### ----------------------------- Extraction of 4-grams and Visualisation -----------------------------

In [10]:
# TF-IDF representation restricted to 4-grams. The call returns a
# (matrix, feature names) pair, which is unpacked into its two parts.
get_4gramm_repr = get_ngramms_repr((4, 4))
repr_4gramms, features_4gramm = get_4gramm_repr

repr_4gramms.shape

(2231L, 10L)

In [11]:
# t-SNE is, like PCA, a dimensionality-reduction method: every n-gram row is
# mapped to 3 coordinates. t-SNE is stochastic, so the seed is fixed to make
# the layout reproducible across runs.
newtsne = TSNE(n_components=3, random_state=42)

reduced_matr_4gramm = newtsne.fit_transform(repr_4gramms)
reduced_matr_4gramm.shape

(2231L, 3L)

In [12]:
# Interactive 3-D scatter plot of the 4-gram points (the last argument is the n shown in the title).
make_visualisation(repr_4gramms, features_4gramm, reduced_matr_4gramm, 4)

  if self._edgecolors == str('face'):


### ----------------------------- Extraction of 3-grams and Visualisation -----------------------------

In [13]:
# TF-IDF representation restricted to 3-grams. The call returns a
# (matrix, feature names) pair, which is unpacked into its two parts.
get_3gramm_repr = get_ngramms_repr((3, 3))
repr_3gramms, features_3gramm = get_3gramm_repr

repr_3gramms.shape

(2156L, 10L)

In [14]:
# t-SNE is, like PCA, a dimensionality-reduction method: every n-gram row is
# mapped to 3 coordinates. t-SNE is stochastic, so the seed is fixed to make
# the layout reproducible across runs.
newtsne = TSNE(n_components=3, random_state=42)

reduced_matr_3gramm = newtsne.fit_transform(repr_3gramms)
reduced_matr_3gramm.shape

(2156L, 3L)

In [15]:
# Interactive 3-D scatter plot of the 3-gram points (the last argument is the n shown in the title).
make_visualisation(repr_3gramms, features_3gramm, reduced_matr_3gramm, 3)

### ----------------------------- Extraction of 5-grams and Visualisation -----------------------------

In [16]:
# TF-IDF representation restricted to 5-grams. The call returns a
# (matrix, feature names) pair, which is unpacked into its two parts.
get_5gramm_repr = get_ngramms_repr((5, 5))
repr_5gramms, features_5gramm = get_5gramm_repr

repr_5gramms.shape

(2254L, 10L)

In [17]:
# t-SNE is, like PCA, a dimensionality-reduction method: every n-gram row is
# mapped to 3 coordinates. t-SNE is stochastic, so the seed is fixed to make
# the layout reproducible across runs.
newtsne = TSNE(n_components=3, random_state=42)

reduced_matr_5gramm = newtsne.fit_transform(repr_5gramms)
reduced_matr_5gramm.shape

(2254L, 3L)

In [18]:
# Interactive 3-D scatter plot of the 5-gram points (the last argument is the n shown in the title).
make_visualisation(repr_5gramms, features_5gramm, reduced_matr_5gramm, 5)