In [1]:
import os
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [2]:
working_dir = os.getcwd()
os.listdir(working_dir)

['opencv-text-detection.zip',
 '.ipynb_checkpoints',
 'accepted_projects',
 'Andrew W. Trask - Grokking Deep Learning-Manning Publications (2019).pdf',
 'Collective_Dataset',
 'corpus',
 'corpus.zip',
 'doc2vec.ipynb',
 'doc2vec_with_kfold.html',
 'doc2vec_with_kfold.ipynb',
 'Final Slides and Books',
 'google-play-store-apps',
 'google-play-store-apps.zip',
 'helpline-of-all-sorts',
 'helpline-of-all-sorts.zip',
 'kmeans_clustering.ipynb',
 'labels.txt',
 'logistic_regression.ipynb',
 'MURA-v1.1',
 'MURA-v1.1.zip',
 'nltk',
 'opencv-text-detection',
 'Papers',
 'Papers.zip',
 'Papers_Association_Rule',
 'processed_dataset.csv',
 'Results',
 'reviews.txt',
 'sentiment_analysis_from_story_preprocessing.ipynb',
 'sentiment_dictionary.csv',
 'sklearn_test.ipynb',
 'Slides',
 'Story-categorization-using-NLP',
 'Tan.pdf',
 'Testing',
 'Udacity',
 'videodata.csv',
 'word2vec.model',
 'word2vector_test.ipynb']

In [3]:
data_dir = os.path.join(working_dir, 'Collective_Dataset')

In [4]:
data = pd.read_csv(data_dir + '/' + '419_data - Sheet1.csv', usecols=[0,1])

In [5]:
data

Unnamed: 0,story,category
0,"Just like any other day, employees arrived in ...",0
1,My so-called ‘friends’ in middle school used t...,1
2,i have been called hurtful names and i have be...,1
3,at my old school kids would hit me and call me...,1
4,I had debilitating migraines for three years b...,0
...,...,...
184,This happened when I was about 23 and working ...,2
185,During the holidays I’d wear a santa hat to wo...,2
186,"I’m currently working at Starbucks, and as a 2...",2
187,I’ve had to deal with male customers who stop ...,2


In [6]:
cleaned_data = data.rename(columns = {0:'story', 1: 'category'})

In [7]:
cleaned_data

Unnamed: 0,story,category
0,"Just like any other day, employees arrived in ...",0
1,My so-called ‘friends’ in middle school used t...,1
2,i have been called hurtful names and i have be...,1
3,at my old school kids would hit me and call me...,1
4,I had debilitating migraines for three years b...,0
...,...,...
184,This happened when I was about 23 and working ...,2
185,During the holidays I’d wear a santa hat to wo...,2
186,"I’m currently working at Starbucks, and as a 2...",2
187,I’ve had to deal with male customers who stop ...,2


In [8]:
categories = ['work_stress', 'bullying', 'sexual_harassment']

In [9]:
work_stress = 0
bullying = 0
sexual_harassment = 0

for d in cleaned_data.index:
    if cleaned_data.category[d] == 0:
        work_stress += 1
    elif cleaned_data.category[d] == 1:
        bullying += 1
    else:
        sexual_harassment += 1

In [10]:
print('Number of Work Stress data: ', work_stress)
print('Number of Bullying data: ', bullying)
print('Number of Sexual Harassment data: ', sexual_harassment)

Number of Work Stress data:  26
Number of Bullying data:  94
Number of Sexual Harassment data:  69


In [11]:
#counting list of text ot word also converting to ascii from unicode
def text_to_word_list(text, convert_to_ascii):
    ''' Pre process and convert texts to a list of words 
    method inspired by method from eliorc github repo: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb'''
    text = convert_to_ascii(text)
    text = str(text)
    text = text.lower()

    # Clean the text
    text = sub(r"[^A-Za-z0-9^,!?.\/'+]", " ", text)
    text = sub(r"\+", " plus ", text)
    text = sub(r",", " ", text)
    text = sub(r"\.", " ", text)
    text = sub(r"!", " ! ", text)
    text = sub(r"\?", " ? ", text)
    text = sub(r"'", " ", text)
    text = sub(r":", " : ", text)
    text = sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

In [12]:
cleaned_data.story = cleaned_data.story.apply(lambda x: text_to_word_list(x, unidecode))

In [13]:
cleaned_data

Unnamed: 0,story,category
0,"[just, like, any, other, day, employees, arriv...",0
1,"[my, so, called, friends, in, middle, school, ...",1
2,"[i, have, been, called, hurtful, names, and, i...",1
3,"[at, my, old, school, kids, would, hit, me, an...",1
4,"[i, had, debilitating, migraines, for, three, ...",0
...,...,...
184,"[this, happened, when, i, was, about, 23, and,...",2
185,"[during, the, holidays, i, d, wear, a, santa, ...",2
186,"[i, m, currently, working, at, starbucks, and,...",2
187,"[i, ve, had, to, deal, with, male, customers, ...",2


In [14]:
file_model = cleaned_data.copy()

In [15]:
para = [row for row in file_model.story]
para

[['just',
  'like',
  'any',
  'other',
  'day',
  'employees',
  'arrived',
  'in',
  'the',
  'workplace',
  'sparingly',
  'filling',
  'the',
  'cubicles',
  'and',
  'getting',
  'their',
  'coffees',
  'ready',
  'once',
  'more',
  'the',
  'manager',
  'was',
  'already',
  'sitting',
  'at',
  'his',
  'desk',
  'grumbling',
  'and',
  'shouting',
  'you',
  're',
  'way',
  'too',
  'slow',
  'again',
  '!',
  'how',
  'am',
  'i',
  'supposed',
  'to',
  'get',
  'my',
  'work',
  'done',
  'with',
  'you',
  'slowing',
  'me',
  'down',
  'every',
  'day',
  '?',
  'the',
  'other',
  'employees',
  'were',
  'staring',
  'at',
  'each',
  'other',
  'they',
  'were',
  'embarrassed',
  'by',
  'his',
  'outburst',
  'but',
  'deep',
  'inside',
  'they',
  'knew',
  'he',
  'was',
  'right',
  'satisfied',
  'by',
  'the',
  'nods',
  'in',
  'the',
  'assembly',
  'the',
  'manager',
  'calmed',
  'down',
  'as',
  'usual',
  'he',
  'quickly',
  'got',
  'absorbed',
  'b

In [16]:
phrases = Phrases(para, min_count=1, progress_per=50000)

INFO - 02:39:39: collecting all words and their counts
INFO - 02:39:39: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 02:39:39: collected 22555 word types from a corpus of 36069 words (unigram + bigrams) and 189 sentences
INFO - 02:39:39: using 22555 counts as vocab in Phrases<0 vocab, min_count=1, threshold=10.0, max_vocab_size=40000000>


In [17]:
bigram = Phraser(phrases)

INFO - 02:39:40: source_vocab length 22555
INFO - 02:39:40: Phraser built with 1425 phrasegrams


In [18]:
paragraphs = bigram[para]
paragraphs[1]

['my',
 'so',
 'called',
 'friends',
 'in',
 'middle_school',
 'used_to',
 'call_me',
 'samara',
 'yes',
 'the',
 'dead',
 'demon',
 'girl',
 'in',
 'the',
 'ring',
 'apparently',
 'i',
 'looked_like',
 'her',
 '!']

In [19]:
model = Word2Vec(min_count = 2,
                window = 10,
                size = 300,
                sample = 1e-5,
                alpha = 0.03,
                min_alpha = 0.0007,
                negative = 20,
                workers = multiprocessing.cpu_count()-1)

In [20]:
start = time()
model.build_vocab(paragraphs, progress_per = 50000)
print('Time to build vocabularies: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 02:39:47: collecting all words and their counts
INFO - 02:39:47: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 02:39:47: collected 4607 word types from a corpus of 31133 raw words and 189 sentences
INFO - 02:39:47: Loading a fresh vocabulary
INFO - 02:39:47: effective_min_count=2 retains 2610 unique words (56% of original 4607, drops 1997)
INFO - 02:39:47: effective_min_count=2 leaves 29136 word corpus (93% of original 31133, drops 1997)
INFO - 02:39:47: deleting the raw counts dictionary of 4607 items
INFO - 02:39:47: sample=1e-05 downsamples 2610 most-common words
INFO - 02:39:47: downsampling leaves estimated 4108 word corpus (14.1% of prior 29136)
INFO - 02:39:47: estimated required memory for 2610 words and 300 dimensions: 7569000 bytes
INFO - 02:39:47: resetting layer weights


Time to build vocabularies: 0.01 mins


In [21]:
model.train(paragraphs, total_examples = model.corpus_count, epochs = 30, report_delay = 1)
print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 02:39:49: training model with 5 workers on 2610 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=20 window=10
INFO - 02:39:50: worker thread finished; awaiting finish of 4 more threads
INFO - 02:39:50: worker thread finished; awaiting finish of 3 more threads
INFO - 02:39:50: worker thread finished; awaiting finish of 2 more threads
INFO - 02:39:50: worker thread finished; awaiting finish of 1 more threads
INFO - 02:39:50: worker thread finished; awaiting finish of 0 more threads
INFO - 02:39:50: EPOCH - 1 : training on 31133 raw words (4022 effective words) took 0.1s, 49231 effective words/s
INFO - 02:39:50: worker thread finished; awaiting finish of 4 more threads
INFO - 02:39:50: worker thread finished; awaiting finish of 3 more threads
INFO - 02:39:50: worker thread finished; awaiting finish of 2 more threads
INFO - 02:39:50: worker thread finished; awaiting finish of 1 more threads
INFO - 02:39:50: worker thread finished; awaiting finish of 0 more threads


INFO - 02:39:51: worker thread finished; awaiting finish of 1 more threads
INFO - 02:39:51: worker thread finished; awaiting finish of 0 more threads
INFO - 02:39:51: EPOCH - 17 : training on 31133 raw words (4104 effective words) took 0.1s, 59203 effective words/s
INFO - 02:39:51: worker thread finished; awaiting finish of 4 more threads
INFO - 02:39:51: worker thread finished; awaiting finish of 3 more threads
INFO - 02:39:51: worker thread finished; awaiting finish of 2 more threads
INFO - 02:39:51: worker thread finished; awaiting finish of 1 more threads
INFO - 02:39:51: worker thread finished; awaiting finish of 0 more threads
INFO - 02:39:51: EPOCH - 18 : training on 31133 raw words (4112 effective words) took 0.1s, 59886 effective words/s
INFO - 02:39:51: worker thread finished; awaiting finish of 4 more threads
INFO - 02:39:51: worker thread finished; awaiting finish of 3 more threads
INFO - 02:39:51: worker thread finished; awaiting finish of 2 more threads
INFO - 02:39:51: w

Time to train the model: 0.08 mins


In [22]:
model.init_sims(replace = True)

INFO - 02:39:57: precomputing L2-norms of word weight vectors


In [23]:
model.save("word2vec.model")

INFO - 02:40:00: saving Word2Vec object under word2vec.model, separately None
INFO - 02:40:00: not storing attribute vectors_norm
INFO - 02:40:00: not storing attribute cum_table
INFO - 02:40:00: saved word2vec.model


In [24]:
file_to_export = file_model.copy()
file_to_export['story'] = file_to_export.story

In [25]:
file_to_export.story = file_to_export.story.str.join(' ')
file_to_export.story = file_to_export.story.apply(lambda x: ' '.join(bigram[x]))
file_to_export.category = file_to_export.category.astype('int8')

In [None]:
file_to_export[['story', 'category']].to_csv('processed_dataset.csv', index = False)