# Import Packages

In [1]:
import pandas as pd
import numpy as np
import glob
import re

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
stop_words=set(stopwords.words("english"))

# Read Data and Construct Corpus

In [4]:
company_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/BusinessInsiderNLP/labels/companies.csv', 
                         names = ['Company'], dtype = {'Company' : str})
percent_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/BusinessInsiderNLP/labels/percentage.csv',
                         names = ['percent'], dtype = {'percent':str}, engine = 'python')
ceo_df = pd.read_csv('/Users/aarij/Desktop/python-projects/iems-308/BusinessInsiderNLP/labels/ceo.csv',
                         names = ['first', 'last'], dtype = {'ceo':str}, engine = 'python')

In [5]:
files = glob.glob('/Users/aarij/Desktop/python-projects/iems-308/BusinessInsiderNLP/articles/*/*.txt')

In [6]:
# Creating the corpus 
corpus = []
for file in files:
   # open the file and then call .read() to get the text 
    with open(file, encoding="ISO-8859-1") as f:
        text = f.read()
    corpus.append(text)

In [7]:
# Tokenizing by sentence 
sentences =[]
for file in range(len(corpus)):
    corpus[file] = re.sub(r'[^\x00-\x7f]|[*]',r'', corpus[file])
    sentences.append(sent_tokenize(corpus[file]))

In [8]:
# Flattening a list 
flat = [item for sublist in sentences for item in sublist]
sentences = flat

# CEOs


## Feature Construction for CEO Name Recognition

### Sentence Specific  Features

In [9]:
# Determines the length of a sentence
def length_sentence(sentence):
    return len(sentence)

In [10]:
# Checks if CEO is in a sentence
def ceo_in_sentence(sentence):
    ceo = 0
    if re.search(r'ceo', sentence) != None or re.search(r'chief executive officer', sentence.lower()) != None: 
        ceo = 1
    return ceo

### Candidate Name Specific Features

In [11]:
# Finds the length of a candidate
def length_candidate(item):
    return len(item)

In [12]:
# Checks if CEO is close to a candidate
def ceo_close(item, sent):
    x = re.search('ceo', sent)
    if x == None: return 0
    y = re.search(re.compile(item), sent)
    if y == None: return 0
    if abs(x.start() - y.start()) < 20:
        return 1
    return 0

### Constiuent Names Features

In [13]:
# Checks if a list of words contains any stop words 
def stop_in_name(words):
    stop = 0
    for word in words:
        if word.lower() in stop_words:
            stop = 1
            return stop
    return stop

### Feature Construction

In [14]:
# Function which extracts candidates and creates features for CEO names
def feature_creator_ceo (sentences):
    candidates = []
    for sent in range(len(sentences)):
        curr = sentences[sent]
        # Change CEO to ceo for better RegEx Parsing
        curr = re.sub(r'CEO', 'ceo', curr) 
        # Extract all candidates
        extract = re.findall(r'[A-Z][a-z]+ [A-Z][a-z]+', curr)
        if extract != []:
            # Extract sentence features
            sentence_length = length_sentence(curr)
            ceo = ceo_in_sentence(curr)
            for item in extract:
                # Extract candidate specific features
                candidate_length = length_candidate(item)
                close = ceo_close(item, curr)
                # Extract name specific features
                names = re.split(r'[ ]', item)
                stop = stop_in_name(names)
                # Append all values to a list
                candidates.append([item, sentence_length, ceo, candidate_length, close, stop, curr, sent])
    return candidates

In [15]:
candidates = feature_creator_ceo(sentences)

In [16]:
# Create the dataframe for candidates and feature space
df = pd.DataFrame(candidates, columns=['candidate', 'sentence_length', 'ceo_in_sentence', 
                                       'candidate_length', 'ceo_close', 'stopword_in_candidate', 
                                       'sentence', 'sentence_index'])

## Model Training and Testing

In [17]:
# Add labels to the dataframe 
ceo_df['name'] = ceo_df['first'] + ' ' + ceo_df['last']
labels = []
values = set(ceo_df['name'].tolist())
for candidate in range(df.shape[0]):
    if df.loc[candidate, 'candidate'] in values:labels.append(1)
    else: labels.append(0)
df['label'] = labels

In [18]:
# Fitting model to training data
# Select features
features = ['sentence_length', 'ceo_in_sentence', 
            'candidate_length', 'ceo_close', 'stopword_in_candidate']
scaler = StandardScaler()
X = df[features].copy()
# Standardize features
scaler.fit(X)
X[features] = scaler.fit_transform(X[features])
y = df['label']

In [19]:
# Randomly partioning training and testing data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.50) 
log = LogisticRegression()
# Fitting model to training partition
log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# Predicting Training Data
results = log.predict(X_test)
metrics.accuracy_score(results, y_test)

0.9730454296716151

In [21]:
correct = X_test.copy()
correct['predict'] = pd.Series(results)
correct['label'] = y_test
label1 = correct[correct['label'] == 1].shape[0]
result1 = correct[correct['predict'] == 1].shape[0]
print(f'There are {label1} CEOs according to the labels and {result1} candiates were classified as CEOs')

There are 6091 CEOs according to the labels and 989 candiates were classified as CEOs


## Extracting Entities

In [22]:
# Predicting on entire dataset
feature_space = df.copy()
feature_space[features] = scaler.fit_transform(feature_space[features])
predictions = log.predict(feature_space[features])

In [23]:
# Getting unique entities
df['predict'] = predictions
results = set(df[df['predict'] == 1].candidate.tolist())

In [24]:
# Writing to file
with open('ceo_names', 'w') as f:
    for ceo in results:
        f.write(ceo+'\n')

# Companies 

### Sentence Specific 

In [25]:
# Determines if the word 'company' is in a sentence
def company_in_sentence(sentence):
    ret = 0
    if re.search(r'company', sentence.lower()) != None:
        ret = 1
    return ret

In [26]:
# Determines if the word 'stock' is in a sentence
def stock_in_sentence(sentence):
    ret = 0
    if re.search(r'stock', sentence.lower()) != None:
        ret = 1
    return ret

In [27]:
# Determine if the word 'shares' is in a sentence 
def shares_in_sentence(sentence):
    ret = 0
    if re.search(r'share', sentence.lower()) != None:
        ret = 1
    return ret

In [28]:
# Determines if a variation of 'trade' is in a sentence
def trade_in_sentence(sentence):
    ret = 0
    if re.search(r'trad', sentence.lower()) != None: #trad bc looking for 'trade', 'trading', 'trades', etc. 
        ret = 1
    return ret

### Company Specific 

In [29]:
# Removes a space from the end of a word
def space_remove(item):
    if item[len(item) - 1] == ' ':
        return item[0:len(item) - 1]
    return item

In [30]:
# Returns the length of a word
def length_of_company(item):
    return len(item)

In [31]:
# Determines if a word is plural
def plural_word(item):
    plural = 0
    if item[len(item) - 1] == 's':
        plural = 1
    return plural

In [32]:
# Returns the number of words a list of words
def number_of_words(words):
    return len(words)

In [33]:
# Returns where a word occurs in a sentence
def location_in_sentence(sentence, item):
    return re.search(re.compile(item), sentence).start()    

In [34]:
# Determines if a stop word is in a list of words
def stop_word(words):
    stop = 0
    for word in words:
        if word.lower() in stop_words:
            stop = 1
            return stop
    return stop

In [35]:
def company_words(words):
    values = set(['corp', 'corp.', 'corporation', 'group', 'holding', 'inc', 'inc.', 'company', 'association', 'foundation', 'ltd'])
    for item in words:
        word = item.lower()
        if word in values:
            return 1
    return 0

In [36]:
# Extracting candidates and feature space for companies 
def feature_creator_companies (sentences):
    candidates = []
    for sent in range(len(sentences)):
        curr = sentences[sent]
        extract = re.findall(r'(([A-Z][a-z]+ ?)+)', curr)
        if extract != []:
            # Getting only the candidate
            flat = [item[0] for item in extract]
            extract = flat
            # Extracting sentence specific features
            company = company_in_sentence(curr)
            stock = stock_in_sentence(curr)
            shares = shares_in_sentence(curr)
            trade = trade_in_sentence(curr) 
            for item in extract:
                new = space_remove(item)
                words = re.split(r' ', new)
                # Extracting candidate specific features
                length = length_of_company(new)
                plural = plural_word(new)
                location = location_in_sentence(curr, new)
                # Extracting constiuent specific features
                number_words = number_of_words(words)
                stop = stop_word(words)
                corp = company_words(words)
                candidates.append([new, company, stock, shares, trade, length, plural, location, number_words,
                                   stop, corp])
    return candidates

In [37]:
candidates = feature_creator_companies(sentences)

In [38]:
df = pd.DataFrame(candidates, columns = ['candidate', 'company', 'stock', 
                                         'shares', 'trade', 'length', 'plural', 'location',
                                         'number_words', 'stop', 'corp'])

## Model Training and Testing

In [39]:
# Adding Labels to company df
labels = []
candidate = df['candidate'].tolist()
values = set(company_df['Company'].tolist())
for person in candidate:
    if person in values: 
        labels.append(1)
    else: labels.append(0)
df['label'] = labels

In [40]:
# Standardizing data
features = ['company', 'stock', 'shares', 'trade', 'length',
            'number_words', 'stop', 'corp']
scaler = StandardScaler()
X = df[features].copy()
scaler.fit(X)
X[features] = scaler.fit_transform(X[features])
y = df['label']

In [41]:
# Fitting model to training data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.50) 
log = LogisticRegression()
log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
# Predicting testing data
results = log.predict(X_test)
metrics.accuracy_score(results, y_test)

0.9305887533023022

In [43]:
correct = X_test.copy()
correct['predict'] = pd.Series(results)
correct['label'] = y_test
label1 = correct[correct['label'] == 1].shape[0]
result1 = correct[correct['predict'] == 1].shape[0]
print(f'There are {label1} companies according to the labels and {result1} candiates were classified as companies')

There are 52540 companies according to the labels and 663 candiates were classified as companies


## Extracting Entities

In [44]:
# Predicting on entire dataset
feature_space = df.copy()
feature_space[features] = scaler.fit_transform(feature_space[features])
predictions = log.predict(feature_space[features])

In [45]:
# Getting unique entities
df['predict'] = predictions
results = set(df[df['predict'] == 1].candidate.tolist())

In [46]:
# Writing to file
with open('company_names', 'w') as f:
    for company in results:
        f.write(company+'\n')

# Percentages 

### Sentence Specific Features

In [47]:
# Determines if the word 'percent' follows another word in a sentence
def percent_word(word, sentence):
    ret = 0
    try:
        percent_starts = re.search(re.compile(word), sentence).start() + len(word) + 1
        if percent_starts == re.search(r'percent', sentence.lower()).start():
            ret = 1
    except: pass
    return ret

In [48]:
# Determines if the '%' symbole follows another word in a sentence
def percent_symbol(word, sentence):
    ret = 0
    try:
        pot = re.search(re.compile(word), sentence).start() + len(word)
        if sentence[pot] == '%' or sentence[pot+1] == '%':
            ret = 1
    except: pass
    return ret

In [49]:
# Creating regex patterns for numbers
numbers = re.compile('[0-9]+\.?[0-9]+')
number_word = re.compile('one|two|three|four|five')
number_word_teens = re.compile('fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|zero|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen')
number_word_tens = re.compile(r'((twenty|thirty|fourty|fifty|sixty|seventy|eighty|ninety)(-|\s)?(one|two|three|four|five|six|seven|eight|nine)?)')

In [50]:
# Extracting candidates and creating feature space
def feature_creator_percentage(sentences):
    candidates = []
    for sent in sentences:
        extract = []
        extract = re.findall(numbers, sent) + re.findall(number_word, sent) + re.findall(number_word_teens, sent) + [item[0] for item in re.findall(number_word_tens, sent)]
        if extract != []:
            for number in extract:
                word_after = percent_word(number, sent) 
                symbol_after = percent_symbol(number, sent)
                candidates.append([number, word_after, symbol_after])
    return candidates

In [51]:
candidates = feature_creator_percentage(sentences)

In [52]:
df = pd.DataFrame(candidates, columns=['candidate','percent', '%'])

In [53]:
# Adding Labels to percentage df
labels = []
lst = df['candidate'].tolist()
values = set(percent_df['percent'].tolist())
for item in range(df.shape[0]):
    if lst[item] in values: labels.append(1)
    else: labels.append(0)
df['label'] = labels

In [54]:
# Standardizing training data
features = ['percent', '%']
scaler = StandardScaler()
X = df[features].copy()
scaler.fit(X)
X[features] = scaler.fit_transform(X[features])
y = df['label']

In [55]:
# Partioning training/testing data and training model
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.50) 
log = LogisticRegression()
log.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [56]:
# Predicting model on testing data
results = log.predict(X_test)
metrics.accuracy_score(results, y_test)

0.7727492416389502

In [57]:
correct = X_test.copy()
correct['predict'] = pd.Series(results)
correct['label'] = y_test
label1 = correct[correct['label'] == 1].shape[0]
result1 = correct[correct['predict'] == 1].shape[0]
print(f'There are {label1} percentages according to the labels and {result1} candiates were classified as percentages')

There are 96518 percentages according to the labels and 14531 candiates were classified as percentages


## Extracting Entities

In [58]:
# Predicting model on all data
feature_space = df.copy()
feature_space[features] = scaler.fit_transform(feature_space[features])
predictions = log.predict(feature_space[features])

In [59]:
# Extracting 'all' entities (as opposed to just unique ones)
df['predict'] = predictions
results = df[df['predict'] == 1].candidate.tolist()

In [60]:
# Writing to file
with open('percentages', 'w') as f:
    for percent in results:
        f.write(percent+'\n')