In [5]:
import os
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [6]:
working_dir = os.getcwd()
os.listdir(working_dir)

['.ipynb_checkpoints',
 'accepted_projects',
 'corpus',
 'corpus.zip',
 'google-play-store-apps',
 'google-play-store-apps.zip',
 'helpline-of-all-sorts',
 'helpline-of-all-sorts.zip',
 'logistic_regression.ipynb',
 'MURA-v1.1',
 'MURA-v1.1.zip',
 'nltk',
 'opencv-text-detection',
 'opencv-text-detection.zip',
 'Papers',
 'Papers.zip',
 'Papers_Association_Rule',
 'sentiment_analysis_from_story_preprocessing.ipynb',
 'sklearn_test.ipynb',
 'Slides',
 'Tan.pdf',
 'Testing',
 'videodata.csv']

In [7]:
data_dir = os.path.join(working_dir, 'helpline-of-all-sorts')

In [15]:
data = pd.read_csv(data_dir + '/' + 'helpline_datasets.csv', header=None, usecols=[0,1])

In [16]:
data

Unnamed: 0,0,1
0,"Just like any other day, employees arrived in ...",0
1,My so-called ‘friends’ in middle school used t...,1
2,i have been called hurtful names and i have be...,1
3,at my old school kids would hit me and call me...,1
4,I had debilitating migraines for three years b...,0
5,"I love my work, but hate going each day becaus...",0
6,I have a chronic illness which was doing well ...,0
7,The other part is that sense of worthlessness....,0
8,I feel my whole body hurting. My mental health...,0
9,"As a librarian, I've been threatened with stal...",2


In [24]:
cleaned_data = data.rename(columns = {0:'story', 1: 'category'})

In [25]:
cleaned_data

Unnamed: 0,story,category
0,"Just like any other day, employees arrived in ...",0
1,My so-called ‘friends’ in middle school used t...,1
2,i have been called hurtful names and i have be...,1
3,at my old school kids would hit me and call me...,1
4,I had debilitating migraines for three years b...,0
5,"I love my work, but hate going each day becaus...",0
6,I have a chronic illness which was doing well ...,0
7,The other part is that sense of worthlessness....,0
8,I feel my whole body hurting. My mental health...,0
9,"As a librarian, I've been threatened with stal...",2


In [26]:
categories = ['work_stress', 'bullying', 'sexual_harassment']

In [34]:
work_stress = 0
bullying = 0
sexual_harassment = 0

for d in cleaned_data.index:
    if cleaned_data.category[d] == 0:
        work_stress += 1
    elif cleaned_data.category[d] == 1:
        bullying += 1
    else:
        sexual_harassment += 1

In [35]:
print('Number of Work Stress data: ', work_stress)
print('Number of Bullying data: ', bullying)
print('Number of Sexual Harassment data: ', sexual_harassment)

Number of Work Stress data:  6
Number of Bullying data:  17
Number of Sexual Harassment data:  6


In [36]:
#counting list of text ot word also converting to ascii from unicode
def text_to_word_list(text, convert_to_ascii):
    ''' Pre process and convert texts to a list of words 
    method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb'''
    text = convert_to_ascii(text)
    text = str(text)
    text = text.lower()

    # Clean the text
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

In [39]:
cleaned_data.story = cleaned_data.story.apply(lambda x: text_to_word_list(x, unidecode))

In [40]:
cleaned_data

Unnamed: 0,story,category
0,"[just, like, any, other, day, employees, arriv...",0
1,"[my, so, called, friends, in, middle, school, ...",1
2,"[i, have, been, called, hurtful, names, and, i...",1
3,"[at, my, old, school, kids, would, hit, me, an...",1
4,"[i, had, debilitating, migraines, for, three, ...",0
5,"[i, love, my, work, but, hate, going, each, da...",0
6,"[i, have, a, chronic, illness, which, was, doi...",0
7,"[the, other, part, is, that, sense, of, worthl...",0
8,"[i, feel, my, whole, body, hurting, my, mental...",0
9,"[as, a, librarian, i, ve, been, threatened, wi...",2


In [41]:
file_model = cleaned_data.copy()

In [45]:
para = [row for row in file_model.story]
para

[['just',
  'like',
  'any',
  'other',
  'day',
  'employees',
  'arrived',
  'in',
  'the',
  'workplace',
  'sparingly',
  'filling',
  'the',
  'cubicles',
  'and',
  'getting',
  'their',
  'coffees',
  'ready',
  'once',
  'more',
  'the',
  'manager',
  'was',
  'already',
  'sitting',
  'at',
  'his',
  'desk',
  'grumbling',
  'and',
  'shouting',
  'you',
  're',
  'way',
  'too',
  'slow',
  'again',
  '!',
  'how',
  'am',
  'i',
  'supposed',
  'to',
  'get',
  'my',
  'work',
  'done',
  'with',
  'you',
  'slowing',
  'me',
  'down',
  'every',
  'day',
  '?',
  'the',
  'other',
  'employees',
  'were',
  'staring',
  'at',
  'each',
  'other',
  'they',
  'were',
  'embarrassed',
  'by',
  'his',
  'outburst',
  'but',
  'deep',
  'inside',
  'they',
  'knew',
  'he',
  'was',
  'right',
  'satisfied',
  'by',
  'the',
  'nods',
  'in',
  'the',
  'assembly',
  'the',
  'manager',
  'calmed',
  'down',
  'as',
  'usual',
  'he',
  'quickly',
  'got',
  'absorbed',
  'b

In [46]:
phrases = Phrases(para, min_count=1, progress_per=50000)

INFO - 00:02:09: collecting all words and their counts
INFO - 00:02:09: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 00:02:09: collected 3398 word types from a corpus of 3090 words (unigram + bigrams) and 29 sentences
INFO - 00:02:09: using 3398 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>


In [47]:
bigram = Phraser(phrases)

INFO - 00:02:12: source_vocab length 3398
INFO - 00:02:12: Phraser built with 157 phrasegrams


In [48]:
paragraphs = bigram[para]
paragraphs[1]

['my',
 'so',
 'called',
 'friends',
 'in',
 'middle_school',
 'used_to',
 'call_me',
 'samara',
 'yes',
 'the',
 'dead',
 'demon',
 'girl',
 'in',
 'the',
 'ring',
 'apparently',
 'i',
 'looked',
 'like',
 'her',
 '!']