In [2]:
from nltk import word_tokenize
from nltk import wordnet as wn
import re
import pandas as pd
import os
import nltk

In [145]:
'''Program Objective


1.) Target                Industry of company that is the subject of the EEOC article. 


2.) Challenges            So far, it has been challenging to identify the industry of a company that has been 
                          the subject of the EEOC Articles for which we are text mining. A general list of industry
                          names has not proven successful as the articles either do not include the specific name
                          of the industry, include a derivation, or no match at all.  In the last case, there are 
                          certain clues in the articles such as the profession of the aggrieved individual and/or 
                          an abstract mention of what the company does.

3.) Approach              Using the list of SIC Industry keys, create a hierarchical dictionary that captures 
                          superior, lateral as well as inferior related words.  This hierarchical approach should 
                          have a better chance of capturing the industry value from the articles. 

4.) Program Components    a.) Industry list, b.) Dictionary tree of industries, c.) function that mines text to 
                          find a match. 

5.) Tree Structure        a.) Obtain SIC Industry list
                          b.) Create Dict lvls:  Starting value is the industry. 
                              i. root_hypernyms, ii. hypernyms, iii. synsets of industry. 

6.) Matching Function     The matching will need to take place at the synsets lvl.  If we were to start with the 
                          hypernym, we could lose meaning.  Important - In light of the fact that multiple matches
                          may be obtained from any one industry, the matching function should include a voting
                          sub-function.  Take the count of the highest matches. In the event of a draw, maybe defer
                          to the industry with the highest frequency in your matches so far, else arbitrarily take
                          the first one in the list. 

7.) Result                a.) Object type = dataframe, b.) att1 = article, c.) att2 = root_hypernym, 
                          d.) att3 = hypernym, e.) att4 = target / industry. 

'''



# Add to those industries 

'Program Objective\n\n\n1.) Target                Industry of company that is the subject of the EEOC article. \n\n\n2.) Challenges            So far, it has been challenging to identify the industry of a company that has been \n                          the subject of the EEOC Articles for which we are text mining. A general list of industry\n                          names has not proven successful as the articles either do not include the specific name\n                          of the industry, include a derivation, or no match at all.  In the last case, there are \n                          certain clues in the articles such as the profession of the agreeved individual and or \n                          an abstract mention of what the company does.\n\n3.) Approach              Using the list of SIC Industry keys, create a hierarchical dictionary that captures \n                          superior, lateral as well as inferior related words.  This hierarchical approach should \n     

In [146]:
''' Stages

1. Obtain list of industries
2. Obtain synset for each industry
3. For each synset of each industry (assume multiple synsets per industry), obtain the hypernym for each and 
   root_hypernym. 
4. Create matching function
5. Create dataframe

'''

' Stages\n\n1. Obtain list of industries\n2. Obtain synset for each industry\n3. For each synset of each industry (assume multiple synsets per industry), obtain the hypernym for each and \n   root_hypernym. \n4. Create matching function\n5. Create dataframe\n\n'

In [147]:
# Definitions
'''
Synset        Synonyms for a given word.  The wn.wordnet.synsets command takes a 'string' and returns the synsets.
              If you call the wn.wordnet.synset(synset) then you have access to other functions like Lemmas
Lemmas        Synonymous words.  Object structure = word.n.01.lemma; ex = Lemma('car.n.01.auto')  
              If you call lemma_names() on wn.wordnet.synset(synset) it will give you a list 
              of those names. 

https://plot.ly/python/tree-plots/
'''



"\nSynset        Synonyms for a given word.  The wn.wordnet.synsets command takes a 'string' and returns the synsets.\n              If you call the wn.wordnet.synset(synset) then you have access to other functions like Lemmas\nLemmas        Synonomous words.  Object structure = word.n.01.lemma; ex = Lemma('car.n.01.auto')  \n              If you call lemmas_names on wn.wordnet.synset(synset) it will give you a list \n              of those names. \n\nhttps://plot.ly/python/tree-plots/\n"

In [3]:
# Load the industry key/value spreadsheet.
# The working directory is switched to a hard-coded absolute path so the
# relative file name below resolves; this only works on the author's machine.
os.chdir(r'/Users/ccirelli2/Public/Python Programing Docs/Projects/Web Scraping Project - EEOC Articles')
Url = r'EEOC Article Study - Industry Values.xlsx'
df1 = pd.read_excel(Url)
# Use the 'Keys' column as the row index (one row per major industry group).
df2 = df1.set_index('Keys')
df3 = df2.iloc[0:22, :]  # Keep only the first 22 rows: rows beyond that contain NaN values that
                         # caused an error in the dictionary-building function.


In [7]:
# Dictionary of Industry Keys and Values

def Create_Industry_Dictionary(dataframe):
    """Build a lookup of industry major groups to WordNet lemma names.

    Each row of ``dataframe`` (indexed by major group) is turned into one
    dictionary entry.  The key is ``"<index> <Division> - <Columna2>"`` and the
    value is a list of unique lemma names, in first-seen order, gathered from
    the WordNet synsets of every string cell in the row after the first column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'Division' and 'Columna2'.  Non-string cells
        (e.g. None / NaN) are ignored when collecting keywords.

    Returns
    -------
    dict
        ``{industry identifier: [lemma name, ...]}``
    """
    industry_lookup = {}

    for major in dataframe.index:
        # One row of the sheet: the descriptive levels plus the keyword cells.
        row = dataframe.loc[major]
        division = row['Division']
        sub_division = row['Columna2']

        unique_lemmas = []

        # Skip the first cell of the row; keep only cells that hold text.
        text_cells = (cell for cell in row[1:] if isinstance(cell, str))
        for cell in text_cells:
            # WordNet lookups are done on the lower-cased cell value.
            for synset in wn.wordnet.synsets(cell.lower()):
                for lemma in synset.lemma_names():
                    # Keep the list free of duplicates while preserving order.
                    if lemma not in unique_lemmas:
                        unique_lemmas.append(lemma)

        # Descriptive key: index label, then the two industry levels.
        identifier = str(major) + ' ' + division + ' - ' + sub_division
        industry_lookup[identifier] = unique_lemmas

    return industry_lookup



In [None]:
'''Get Industry Function

Functionality           a.) Function needs to be able to take a list of tokens from a specimen text, 
                        b.) Compare those tokens to the Industry dictionary values, 
                        c.) generate a match count by major group
                        d.) return the major group with the highest count. 
'''    
    

In [5]:
# Import Texts

# Switch to the folder holding the scraped article files (hard-coded absolute
# path).  os.chdir returns None, so `chdir` is simply bound to None.
# NOTE(review): this path spells 'Web Scraping project' with a lower-case 'p',
# while the spreadsheet cell uses 'Web Scraping Project' - confirm which is
# correct on a case-sensitive file system.
chdir = os.chdir(r'/Users/ccirelli2/Public/Python Programing Docs/Projects/Web Scraping project - EEOC Articles/EEOC Articles/')
# Remember the article directory; get_Dir_list reads this module-level name.
Cdw = os.getcwd()
# Names (not full paths) of every entry in the article directory.
File_name_list = os.listdir()

def get_Dir_list(File, base_dir=None):
    """Return full paths for the text files named in ``File``.

    Parameters
    ----------
    File : iterable of str
        File names (as returned by ``os.listdir``).
    base_dir : str, optional
        Directory the names live in.  When omitted, the notebook-level
        ``Cdw`` variable is used, which matches the original behaviour.

    Returns
    -------
    list of str
        ``base_dir`` joined with every name that ends in ``.txt``
        (case-insensitive), in input order.

    Notes
    -----
    The path is built with ``os.path.join`` instead of a hard-coded
    backslash, which produced unusable paths with the POSIX-style
    directories used in this notebook.  The filter checks the extension
    rather than looking for the substring 'txt' anywhere in the name.
    """
    if base_dir is None:
        # Looked up lazily so the function still relies on the notebook's Cdw.
        base_dir = Cdw

    return [
        os.path.join(base_dir, name)
        for name in File
        if name.lower().endswith('.txt')
    ]

# Paths of the article text files, built from the directory listing above.
Dir_list = get_Dir_list(File_name_list)



In [6]:
# Move into the folder that holds the helper module so the import below can
# find it (hard-coded absolute path; `chdir` itself is bound to None).
# NOTE(review): presumably the module is not installed or on sys.path, which is
# why the working directory is changed first - confirm.
chdir = os.chdir(r'/Users/ccirelli2/Public/Python Programing Docs/Modules/')
# Project-local text-cleaning helpers, referenced as `cc` in later cells.
import ccirelli2_text_analysis_module as cc



In [65]:
def get_industry_count(Token_list, Industry_dict, discount_key='81', discount_factor=0.5):
    """Rank industry major groups by how many tokens of a text match their keywords.

    Every token in ``Token_list`` that appears among the keywords of a major
    group adds one to that group's match count (repeated tokens count each
    time they occur).  The group with the most matches is the classifier's
    vote for the industry the text is about.

    Parameters
    ----------
    Token_list : iterable of str
        Tokens / words of one text.
    Industry_dict : dict
        Maps a major-group key to an iterable of keyword strings.
    discount_key : str, optional
        Groups whose key contains this substring have their count scaled by
        ``discount_factor``.  Defaults to '81' (legal services) to compensate
        for the source texts being legal documents.  Pass '' or None to
        disable the adjustment.
    discount_factor : float, optional
        Multiplier applied to the discounted groups (default 0.5).

    Returns
    -------
    pandas.DataFrame
        Indexed by major-group key with a single column labelled ``1``
        holding the (possibly discounted) match count, sorted from highest
        to lowest.  Groups with no match are omitted; the frame is empty when
        nothing matched.  The ranking is returned rather than a single
        prediction so the results can be inspected.
    """
    # Sets give constant-time membership tests; built once, not per token.
    keyword_sets = {group: set(words) for group, words in Industry_dict.items()}

    # major group -> number of matching tokens seen so far.
    match_counts = {}
    for token in Token_list:
        for group, keywords in keyword_sets.items():
            if token in keywords:
                match_counts[group] = match_counts.get(group, 0) + 1

    # Down-weight the selected group(s), e.g. legal services for legal texts.
    if discount_key:
        for group in match_counts:
            if discount_key in group:
                match_counts[group] = match_counts[group] * discount_factor

    if not match_counts:
        # Nothing matched: return an empty ranking with the usual column.
        return pd.DataFrame(columns=[1])

    ranking = pd.DataFrame(
        list(match_counts.values()),
        index=list(match_counts.keys()),
        columns=[1],
    )
    # A stable sort keeps tied groups in the order they were first matched.
    return ranking.sort_values(1, ascending=False, kind='mergesort')


In [15]:
def Industry_prediction(Obj_from_industry_classifier):
    """Return the index label of the first row of the ranking frame.

    The ranking produced by the industry classifier is sorted with the best
    match first, so the label of row 0 is taken as the predicted industry.
    """
    return Obj_from_industry_classifier.iloc[0].name


In [11]:
def predict_industry_pipeline(Text, Industry_dict=None):
    """Tokenize, clean and classify one text, returning its predicted industry.

    Parameters
    ----------
    Text : str
        Raw text of one article.
    Industry_dict : dict, optional
        Industry keyword dictionary to match against.  When omitted, the
        notebook-level ``Dict_Industries_dict`` is used, as in the original
        code.
        NOTE(review): no visible cell defines ``Dict_Industries_dict``;
        passing the dictionary explicitly avoids relying on hidden kernel
        state - confirm which global is intended.

    Returns
    -------
    The index label of the top-ranked major group, as returned by
    ``Industry_prediction``.
    """
    if Industry_dict is None:
        Industry_dict = Dict_Industries_dict

    tokens = word_tokenize(Text)
    clean_tokens = cc.text_cleaning_pipeline(tokens)

    # get_industry_count is the ranking function defined in this notebook;
    # the previous call target, get_industry, is not defined here.
    ranking = get_industry_count(clean_tokens, Industry_dict)

    # The result must not be bound to a local called Industry_prediction:
    # that made the function name local and raised UnboundLocalError.
    predicted_industry = Industry_prediction(ranking)
    return predicted_industry




In [66]:
# Load one sample article and run it through the cleaning steps.
# The working directory is a hard-coded absolute path; `chdir` is bound to None.
chdir = os.chdir(r'/Users/ccirelli2/Public/Python Programing Docs/Projects/Web Scraping project - EEOC Articles/EEOC Articles/')
Url_name_list = os.listdir()
# NOTE(review): os.listdir returns entries in arbitrary order, so index 2 is
# not guaranteed to be the same file on every run or machine.
Url = Url_name_list[2]
# The context manager closes the file handle, which was previously left open.
with open(Url, 'rb') as File:
    # str() on a bytes object yields its repr (b'...' with escape sequences
    # such as \r\n and \x96 spelled out), not decoded text.  This is kept
    # as-is because the cleaning helpers below were written against that form
    # (presumably - verify before switching to .decode()).
    Text = str(File.read()).lower()


# Successive cleaning steps from the project text-analysis module; each step
# consumes the output of the previous one.
Text_tokenized = nltk.wordpunct_tokenize(Text)
Text_nopunct = cc.strip_punctuation(Text_tokenized)
Text_strip_slashes = cc.strip_tokens_forwardSlash_x2(Text_nopunct)
Text_strip_two_vars = cc.strip_two_variable_tokens(Text_strip_slashes)
Text_strip_stop_words = cc.strip_stop_words(Text_strip_two_vars)
Text_get_isalpha = cc.get_isalpha(Text_strip_stop_words)



In [67]:
# Rank the industry groups for the cleaned sample article.
# NOTE(review): `Industry_dict` is not assigned in any visible cell; presumably
# it is the result of Create_Industry_Dictionary(df3) - confirm and define it
# explicitly so the notebook survives Restart & Run All.
Get_industry_count = get_industry_count(Text_get_isalpha, Industry_dict)


In [68]:
# Display the ranking table (major group vs. match count).
Get_industry_count

Unnamed: 0,1
Major Group 81=> Services-Law firm,2.0
Major Group 73=> Services-Business Services,1.0
Major Group 82=> Services-Education,1.0
Major Group 87=> Services-Engineering,1.0
Major Group 91-99=> Services-Government,1.0


In [57]:
# Top-ranked major group for the sample article.
# NOTE(review): the saved output of this cell ('Major Group 73=> ...') does not
# match the first row of the saved ranking table ('Major Group 81=> ...'), and
# this cell's execution count (57) is lower than the ranking cell's (67), so
# the stored output is probably stale - re-run to confirm.
Industry_prediction(Get_industry_count)

'Major Group 73=>  Services-Business Services'

In [69]:
# Dump the full lower-cased article text for manual inspection; the output
# starts with b'...' because Text is the str() of a bytes object.
print(Text)

b'<div class="cs_control" id="cs_control_4119">\r\n<div align="left">\r\n<p><em><strong>press release</strong></em><br/>\r\r\n\t1-27-10</p>\r\n</div>\r\n<div align="center">\r\n<div class="caption">\r\n<p></p>\r\n</div>\r\n<h1>$428,500 decree ends suit against eagle wings for sexual harassment, retaliation and disability bias</h1>\r\n</div>\r\n<div align="left"><div align="center"><em><strong>eeoc case challenged treatment of women at rantoul facility</strong></em><br/>\r\n</div>\r\n<p><br/>\r\nurbana, ill. \x96 the federal district court in urbana, ill., today entered a consent decree under which eagle wings industries, inc. will pay $428,500 to a class of female employees who, the u.s. equal employment opportunity commission (eeoc) alleged, encountered sexual harassment at the company\x92s rantoul, ill., facility.\xa0 this amount includes the attorney\x92s fees for one of the class members.<br/>\r\n\xa0<br/>\r\naccording to the eeoc suit, the automobile parts manufacturer discriminat

In [58]:
# chdir = os.chdir(r'/Users/ccirelli2/Public/Python Programing Docs/Projects/Web Scraping project - EEOC Articles/EEOC Articles/')
# File_name_list = os.listdir()

# for x in File_name_list[:1]:
#     File = open(x, 'rb')
#     Text_bytes = File.read()
#     Text_text = str(Text_bytes)
#     predict_industry_pipeline(Text_bytes)
