In [1]:
import pandas as pd
import spacy
import string
from tqdm import tqdm

In [2]:
org_df = pd.read_csv('data/labeled_org_data.csv')
person_df = pd.read_csv('data/labeled_person_data.csv')

In [3]:
from sklearn.model_selection import train_test_split
# train test split data
X_org_train, X_org_test, y_org_train, y_org_test = train_test_split(org_df.drop('y', axis=1), 
                                                                    org_df.y, test_size=0.25)

In [77]:
X_per_train, X_per_test, y_per_train, y_per_test = train_test_split(person_df.drop('y', axis=1), person_df.y, 
                                                                    test_size=0.25)

In [234]:
org_df.describe()

Unnamed: 0,y,num_cap,num_words,has_quote,best_company,contain_company
count,303707.0,303707.0,303707.0,303707.0,303707.0,303707.0
mean,0.380752,6.821621,25.135446,0.407854,0.020638,0.042837
std,0.485573,9.594531,18.284143,0.491437,0.14217,0.202491
min,0.0,0.0,1.0,0.0,0.0,0.0
25%,0.0,3.0,16.0,0.0,0.0,0.0
50%,0.0,5.0,23.0,0.0,0.0,0.0
75%,1.0,8.0,32.0,1.0,0.0,0.0
max,1.0,1863.0,2205.0,1.0,1.0,1.0


In [235]:
person_df.describe()

Unnamed: 0,y,num_cap,num_words,has_quote,best_ceo,contain_ceo
count,199827.0,199827.0,199827.0,199827.0,199827.0,199827.0
mean,0.12765,7.37013,25.84367,0.471052,0.007186,0.027969
std,0.333701,10.587218,21.795162,0.499163,0.084467,0.164885
min,0.0,0.0,1.0,0.0,0.0,0.0
25%,0.0,3.0,16.0,0.0,0.0,0.0
50%,0.0,5.0,24.0,0.0,0.0,0.0
75%,0.0,9.0,33.0,1.0,0.0,0.0
max,1.0,1863.0,2205.0,1.0,1.0,1.0


## Modeling Companies

### TF-IDF Vectorization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_org = TfidfVectorizer(ngram_range=(1,3), 
                           stop_words='english',
                           max_features=10000).fit(X_org_train.sentences)

In [6]:
# tfidf transformation for org train and test data
from scipy.sparse import hstack
org_text_train = tfidf_org.transform(X_org_train.sentences)
org_text_test = tfidf_org.transform(X_org_test.sentences)

In [7]:
X_org_train = hstack([X_org_train.drop("sentences", axis=1), org_text_train])
X_org_test = hstack([X_org_test.drop("sentences",axis=1), org_text_test])

In [8]:
X_org_train.shape

(227780, 10005)

In [9]:
X_org_test.shape

(75927, 10005)

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
lr_org = LogisticRegression(max_iter=5000, class_weight='balanced')
lr_org.fit(X_org_train, y_org_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=5000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [64]:
y_train_pred = lr_org.predict(X_org_train)

In [65]:
print(classification_report(y_org_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95    141048
           1       0.94      0.90      0.92     86732

    accuracy                           0.94    227780
   macro avg       0.94      0.93      0.93    227780
weighted avg       0.94      0.94      0.94    227780



In [66]:
y_test_pred = lr_org.predict(X_org_test)
print(classification_report(y_org_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95     47022
           1       0.94      0.88      0.91     28905

    accuracy                           0.93     75927
   macro avg       0.93      0.92      0.93     75927
weighted avg       0.93      0.93      0.93     75927



In [13]:
y_org_test[y_org_test == y_test_pred].index

Int64Index([272316, 263443,  21431,  45020, 278645,  81076,  63552,  68047,
            160333,  31317,
            ...
              4201, 186587,  69422, 132796,  85487, 155410,   7081, 293815,
            199699, 167087],
           dtype='int64', length=70726)

### True Positives

In [169]:
companies_train = y_org_train[(y_org_train == y_train_pred) & (y_org_train ==1)]

In [170]:
companies_test = y_org_test[(y_org_test == y_test_pred) & (y_org_test == 1)]

In [171]:
true_pos_indices = companies_train.index.append(companies_test.index).to_list()

In [175]:
company_df = org_df.iloc[true_pos_indices]

In [176]:
company_sentences = company_df.sentences.to_list()

In [178]:
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])
company_processed = [nlp(sentence) for sentence in company_sentences]

In [233]:
from spacy import displacy
displacy.render(company_processed[5], style='ent')

In [181]:
def return_companies(sentence_spacified):
    company = [ent.text for ent in sentence_spacified.ents if ent.label_ == "ORG"]
    return company

In [182]:
company_lst = []
for sent in company_processed:
    companies = return_companies(sent)
    company_lst.extend(companies)

In [193]:
unique_companies = list(set(company_lst))

In [225]:
## writing these to a csv and calling it good. 
companies_final = pd.DataFrame({'companies': unique_companies})
companies_final.to_csv("results/company_matches.csv",index=False)

In [255]:
companies_final.shape

(31955, 1)

### False Positives

In [217]:
conf_train = lr_org.decision_function(X_org_train)
compFP_train = y_org_train[(y_org_train != y_train_pred) & (y_train_pred ==1) & (conf_train >= 2.0)]

In [218]:
conf_test = lr_org.decision_function(X_org_test)
compFP_test = y_org_test[(y_org_test != y_test_pred) & (y_test_pred == 1) & (conf_test > 2.0)]

In [219]:
company_FP = compFP_train.index.append(compFP_test.index).to_list()

In [220]:
company_FP_df = org_df.iloc[company_FP]

In [221]:
company_FP_processed = [nlp(sentence) for sentence in company_FP_df.sentences.to_list()]

In [222]:
company_FP_lst = []
for sent in company_FP_processed:
    company_FP = return_companies(sent)
    company_FP_lst.extend(company_FP)

In [267]:
company_FP_lst = list(set(company_FP_lst))

In [268]:
len(company_FP_lst)

839

In [269]:
companyDF_fp = pd.DataFrame({'company': company_FP_lst})
companyDF_fp.to_csv('results/companies_hiconf_FP.csv', index=False)

### Random Forest Classifier

In [14]:
# from sklearn.ensemble import RandomForestClassifier
# rf_org = RandomForestClassifier(n_jobs=-1, n_estimators=128,
#                                 min_samples_split=0.001,
#                                 max_samples=0.5,
#                                verbose=1)
# rf_org.fit(X_org_train, y_org_train)
# orgPredsTrain_rf = rf_org.predict(X_org_train)

In [15]:
# print(classification_report(orgPredsTrain_rf, y_org_train))

In [16]:
# orgPredsTest_rf = rf_org.predict(X_org_test)
# print(classification_report(y_org_test, orgPredsTest_rf))

## Modeling CEOs

In [78]:
tfidf_per =  TfidfVectorizer(ngram_range=(1,3), 
                           stop_words='english',
                            max_features=10000).fit(X_per_train.sentences)

In [79]:
per_text_train = tfidf_per.transform(X_per_train.sentences)
per_text_test = tfidf_per.transform(X_per_test.sentences)
X_per_train = hstack([X_per_train.drop('sentences',axis=1), per_text_train])
X_per_test = hstack([X_per_test.drop('sentences',axis=1), per_text_test])

In [80]:
lr_per = LogisticRegression(max_iter=5000, class_weight='balanced')
lr_per.fit(X_per_train, y_per_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=5000, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [81]:
ceo_train_pred = lr_per.predict(X_per_train)

In [82]:
print(classification_report(y_per_train, ceo_train_pred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97    130807
           1       0.75      0.90      0.82     19063

    accuracy                           0.95    149870
   macro avg       0.87      0.93      0.90    149870
weighted avg       0.96      0.95      0.95    149870



In [83]:
ceo_test_pred = lr_per.predict(X_per_test)
print(classification_report(y_per_test, ceo_test_pred))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96     43512
           1       0.72      0.86      0.78      6445

    accuracy                           0.94     49957
   macro avg       0.85      0.90      0.87     49957
weighted avg       0.95      0.94      0.94     49957



In [84]:
y_per_test.mean()

0.12901094941649818

### Gradient Boosting Classifier

In [106]:
from sklearn.ensemble import GradientBoostingClassifier

In [154]:
gbc = GradientBoostingClassifier(verbose=1, learning_rate=0.6, n_estimators=100)
gbc.fit(X_per_train, y_per_train)

      Iter       Train Loss   Remaining Time 
         1           0.6507            1.80m
         2           0.5998            1.68m
         3           0.5622            1.63m
         4           0.5305            1.58m
         5           0.5075            1.54m
         6           0.4916            1.49m
         7           0.4781            1.47m
         8           0.4635            1.43m
         9           0.4508            1.41m
        10           0.4403            1.38m
        20           0.3511            1.19m
        30           0.2979            1.03m
        40           0.2681           52.78s
        50           0.2531           44.00s
        60           0.2450           35.42s
        70           0.2381           26.49s
        80           0.2332           17.70s
        90           0.2292            9.11s
       100           0.2251            0.00s


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.6, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=1,
                           warm_start=False)

In [155]:
y_train_gbm = gbc.predict(X_per_train)

In [156]:
print(classification_report(y_per_train, y_train_gbm))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98    130807
           1       0.93      0.83      0.88     19063

    accuracy                           0.97    149870
   macro avg       0.95      0.91      0.93    149870
weighted avg       0.97      0.97      0.97    149870



In [157]:
y_test_gbm = gbc.predict(X_per_test)
print(classification_report(y_per_test, y_test_gbm))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98     43512
           1       0.91      0.80      0.85      6445

    accuracy                           0.96     49957
   macro avg       0.94      0.89      0.91     49957
weighted avg       0.96      0.96      0.96     49957



### True Positives

In [244]:
ceos_train = y_per_train[(y_per_train == y_train_gbm) & (y_per_train == 1)]

In [245]:
ceos_test= y_per_test[(y_per_test == y_test_gbm) & (y_per_test == 1)]

In [246]:
ceos_idx = ceos_train.index.append(ceos_test.index).to_list()

In [247]:
ceo_df = person_df.iloc[ceos_idx]

In [249]:
ceo_sentences = [nlp(sent) for sent in ceo_df.sentences.to_list()]

In [250]:
def return_ceos(sentence_spacified):
    ceo = [ent.text for ent in sentence_spacified.ents if ent.label_ == "PERSON"]
    return ceo

In [251]:
ceo_lst = []
for sent in ceo_sentences:
    ceo = return_ceos(sent)
    ceo_lst.extend(ceo)

In [253]:
ceo_lst = list(set(ceo_lst))
ceo_df = pd.DataFrame({'ceo': ceo_lst})
ceo_df.to_csv('results/ceo_matches.csv')

In [254]:
ceo_df.shape

(9374, 1)

### False Positives

In [256]:
ceo_conf_train = gbc.decision_function(X_per_train)
ceosFP_train = y_per_train[(y_per_train != y_train_gbm) & (y_train_gbm==1) & (ceo_conf_train >= 2.0)]

In [257]:
ceo_conf_test = gbc.decision_function(X_per_test)
ceosFP_test = y_per_test[(y_per_test != y_test_gbm) & (y_test_gbm==1) & (ceo_conf_test >= 2.0)]

In [258]:
ceosFP_idx = ceosFP_train.index.append(ceosFP_test.index).to_list()

In [259]:
ceoFP_df = person_df.iloc[ceosFP_idx]

In [261]:
ceoFP_sent = [nlp(sent) for sent in ceoFP_df.sentences.to_list()]

In [264]:
ceoFP_lst =[]
for sent in ceoFP_sent:
    ceoFP = return_ceos(sent)
    ceoFP_lst.extend(ceoFP)

In [271]:
ceoFP_lst = list(set(ceoFP_lst))

In [272]:
len(ceoFP_lst)

225

In [273]:
ceo_hiconf_FP = pd.DataFrame({'ceo': ceoFP_lst})
ceo_hiconf_FP.to_csv('results/ceo_hiconf_FP.csv', index=False)