# TS SCI Classification Model 

## edited: 5 July 23

## author: Angus Ferrell

### 1. Initiliaze functions

In [587]:
# Load general utilities
# ----------------------
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.axes as ax
import datetime
import numpy as np
import pickle
import time
import seaborn as sns
import statistics as stat
import random
import glob
import os
import re

# Load sklearn utilities
# ----------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve, brier_score_loss, mean_squared_error, r2_score, recall_score, precision_score,f1_score
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report

# Load classifiers
# ----------------
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier

# Other Packages
# --------------
from scipy.stats import kendalltau
from sklearn.cluster import KMeans
from io import StringIO
from IPython.display import Image  
from sklearn.tree import export_graphviz
from scipy.interpolate import BSpline

# Load debugger, if required
#import pixiedust
pd.options.mode.chained_assignment = None #'warn'

# suppress all warnings
import warnings
warnings.filterwarnings("ignore")

#set random_state
random_state=2

In [506]:
# Check Job Title
# Imported from TECH VIP file

file_name = 'TECH VIP results.csv'
column_to_filter = 'title_1'

def check_jobtitle(data):
    #data = pd.read_csv(file_name, engine = 'python', error_bad_lines = False)
    
    pos = pd.read_csv('TECH VIP/positive ts.csv').dropna(subset=['Keyword'])
    neg = pd.read_csv('TECH VIP/negative.csv').dropna(subset=['Keyword'])

    pos_keys = pos.get('Keyword').tolist()
    pos_keys = [item.lower().strip() for item in pos_keys]


    neg_keys = neg.get('Keyword').tolist()
    neg_keys = [item.lower().strip() for item in neg_keys]


    it_keys = ['izzt ', ' izzt', ' izzt '] #Get IT titles
    
    pos_keys.append('izzt ')
    pos_keys.append(' izzt')
    pos_keys.append(' izzt ')

    pos_keys.append('cto ')
    pos_keys.append(' cto')
    pos_keys.append(' cto ')

    pp = "|".join(pos_keys)
    nn = "|".join(neg_keys)
    
    df = data
    specified_column_1 = column_to_filter #CHANGE SPECIFIED COLUMN HEADER FOR FILTER
    df[specified_column_1] = df[specified_column_1].fillna('')
    #df = df.dropna(subset=[specified_column_1])

    pos_index = []
    neg_index = []

    for col in df[specified_column_1].items():
        pattern = re.compile(pp)
        stri = col[1].lower()
        find_pat = pattern.findall(stri,0,len(stri))
        if len(find_pat) >= 1:
            pos_index.append(col[0])

    #df['job title_index'] = df.loc[pos_index] 
    
    data['job_index'] = 0  # Create a new column 'job_index' initialized with 0
    data.loc[pos_index, 'job_index'] = 1 
    
    for col in df[specified_column_1].items():
        pattern = re.compile(nn)
        stri = col[1].lower()
        find_pat = pattern.findall(stri,0,len(stri))
        if len(find_pat) >= 1:
            neg_index.append(col[0])
            
    data.loc[neg_index, 'job_index'] = -1 
    
    print('Job title - number of pos matches: ', len(pos_index))
    #print('Job title - number of neg matches: ', len(neg_index))
     

In [507]:
# Check company

company_list = ['General Dynamics Information Technology', 'Gridiron IT Solutions', 'SAIC', 'SPA', 
                'Institute For Defense Analyses', 'IC-CAP', 'Peraton', 'ManTech', 'Tyto Athene LLC',
                'Boeing Intelligence & Analytics', 'Arcfield', 'Leidos', 'Raytheon Technologies', 'CACI', 
                'Booz Allen Hamilton', 'Lockheed Martin', 'ALTA IT Services', 'Cyber Defense Technologies', 
                'TekMasters', 'Emerald Technical Solutions', 'Modern Technology Solutions Inc.', 'VTG', 
                'MITRE Corporation', 'SRG Government Services', 'ManTech International', 'MAXAR Technologies',
                'Leidos', 'Parsons', 'BAE Systems', 'Marathon TS Inc', 'Guidehouse', 'SilverEdge', 'MC Dean Inc', 
                'Trevity LLC', 'teKnoluxion Consulting', 'Northrop Grumman', 'Sentar Inc', 'Stellar Solutions', 
                'Criterion Systems Inc', 'IntelliGenesis LLC', 'LMI Government Consulting', 'MAG Aero', 
                'HII Mission Technologies', 'Enlighten', 'In Technology Group Limited', 'ProSync Technology Group',
                'Connsci', 'Signature Federal Systems', 'Apex Systems', 'Buchanan and Edwards',
                'Radiance Technologies', 'Noblis', 'SRG Government Services', 'Xcelerate Solutions', 
                'DCS Corporation', 'Associates Systems LLC', 'BlueHalo', 'Strategic Alliance Consulting Inc',
                'IT Concepts Inc', 'Barbaricum', 'Forcepoint', 'Federal IT Consulting', 'Canvas Inc', 
                'TEKsystems c/o Allegis Group', 'iota IT', 'Illuminate Operations Inc', 'Compass Inc', 
                'Applied Research Solutions', 'Avineon Inc', 'I3 LLC', 'Assured Information Security Inc',
                'hyrUP', 'Millennium Corporation', 'Core One', 'TekSynap', 'Base-2 Solutions LLC', 'AERMOR', 
                'Amentum', 'Cornerstone Defense', 'Base One Technologies', 'ALTA IT Services', 'Sentar Inc', 
                'Ennoble First', 'Core4ce', 'Syntelligent Analytic Solutions LLC', 'Serco Inc', 
                'Space Dynamics Laboratory', 'RDR Inc', 'MicroTech LLC', 'North Point Technology LLC', 
                'LinQuest Corporation', 'Kavaliro', 'SOSi', 'Allen Integrated Solutions LLC',
                'Columbia Technology Partners', 'Dark Wolf Solutions', 'Strategic Alliance Consulting Inc',
                'MAXAR Technologies', 'AT&T Government Solutions', 'Invictus International Consulting', 
                'Secure Halo', 'Octo', 'Lumen', 'Applied Insight', 'PeopleTec', 'NASK', 'Highlight Technologies',
                'JCTM Joint Computer Technologies & Training Management', 'ClearEdge IT Solutions LLC',
                'Echelon Services LLC', 'Clear Resolution Consulting LLC', 'American Systems Corporation', 
                'Abile Group', 'KBR', 'Abbtech Professional Resources Inc', 'Stanley Reid & Company', 
                'Stillwater Human Capital', 'Fluor Corporation', 'Chenega Corporation', 'RMGS Inc', 
                'Moseley Technical Services Inc', 'Personnel Impact Inc', 'Rividium Inc', 'Edgesource',
                'Sunayu LLC', 'SSATI', 'Distributed Solutions Inc', 'Information Gateways Inc', 'ENSCO Inc',
                'Independent Software', 'Avantus Federal', 'Strategic ASI', 'IOMAXIS LLC', 
                'StraCon Services Group LLC', 'Railhead Inc', 'CONNEXIONS FEDERAL SERVICES', 
                'Valiant Integrated Services', 'Intelligent Waves', 'Maximus Inc', 'Thresher Corporation',
                'Market Street Talent', 'Odyssey Systems Consulting Group', 'Quantum Research International Inc',
                'Dezign-Concepts', 'The DarkStar Group', 'Red River Technology LLC', 'Mission Box Solutions',
                'Executive Management Services LLC', 'The Global Edge Consultants', 'TRIAEM LLC',
                'Delta Solutions and Strategies LLC', 'Intrepid Solutions and Services LLC', 'FiveTwelve LLC',
                'CACI International Inc', 'Raytheon', 'Deloitte','Oshkosh', 'Boeing','Accenture Federal Services',
                'Palantir Technologies','US Army','US Navy','United States Air Force','Department of Defense','USAF']


#'Microsoft','Amazon', 'Google','Amazon Web Services (AWS)']

military_pattern = r'US Army|US Navy|air force|usaf|marines|special operations|special forces| socom |Department of Defense| DoD '

#gov_pattern = r'Central Intelligence Agency|CIA|National Security Agency|NSA|Federal Bureau of Investigation|FBI|Department of State|Department of Homeland Security|DHS|Defense Intelligence Agency|DIA|National Reconnaissance Office|NRO|National Geospatial-Intelligence Agency|NGA|United States Secret Service|Drug Enforcement Administration|DEA|National Counterterrorism Center|NCTC|Bureau of Alcohol, Tobacco, Firearms, and Explosives|ATF|Department of Energy|DOE|Department of Justice|DOJ'
                    # FBI | NSA | DOJ | CIA | DHS | DIA 


def check_company(df):

    # Fill missing values with an empty string
    df['company_1'] = df['company_1'].fillna('NaN')
    df['company_2'] = df['company_2'].fillna('NaN')
    df['company_1'] = df['company_1'].replace('U.S. Navy', 'US Navy')
    df['company_2'] = df['company_2'].replace('U.S. Navy', 'US Navy')
    df['company_1'] = df['company_1'].replace('U.S. Army', 'US Army')
    df['company_2'] = df['company_2'].replace('U.S. Army', 'US Army')
    df['company_1'] = df['company_1'].replace('CACI', 'CACI International Inc')
    df['company_2'] = df['company_2'].replace('CACI', 'CACI International Inc')
    df['company_1'] = df['company_1'].replace('Raytheon', 'Raytheon Technologies')
    df['company_2'] = df['company_2'].replace('Raytheon', 'Raytheon Technologies')
    df['company_1'] = df['company_1'].replace('Northrop Grumman Corporation', 'Northrop Grumman')
    df['company_2'] = df['company_2'].replace('Northrop Grumman Corporation', 'Northrop Grumman')
    df['company_1'] = df['company_1'].replace('Palantir', 'Palantir Technologies')
    df['company_2'] = df['company_2'].replace('Palantir', 'Palantir Technologies')
    
    
    #Apply regex pattern to filter DataFrame
    df['company_index'] = df.apply(lambda row: int(1) if row['company_1'] in company_list or 
                                   re.search(military_pattern, row['company_1'], re.IGNORECASE) 
                                   else int(0), axis=1)
    
    #df['company_index'] = df.apply(calculate_company_index, axis=1)
    print('Company - Number of matches: ', np.sum(df.company_index==1))

In [508]:
# Check skills

# Regex pattern
skills_pattern = r'TS CLEARANCE|TS/SCI|TOP SECRET CLEARANCE|ACTIVE TS CLEARANCE| TS SCI |TS SCI CLEARANCE|POLYGRAPGH| CI POLY|FULL SCOPE POLY'

def check_skills(df):

    # Fill missing values with an empty string
    df['Skills'] = df['Skills'].fillna('')
    
     # Apply regex pattern to filter DataFrame
    df['skills_index'] = df.apply(lambda row: 1 if re.search(skills_pattern, row['Skills'], 
                                                             re.IGNORECASE) else 0, axis=1)

    print('Skills - Number of matches: ', np.sum(df.skills_index==1))

In [509]:
# Check summary

def check_summary(df):
    
    # Fill missing values with an empty string
    df['Summary'] = df['Summary'].fillna('')

    # Apply regex pattern to filter DataFrame
    df['summary_index'] = df.apply(lambda row: 1 if re.search(skills_pattern, row['Summary'], 
                                                              re.IGNORECASE) else 0, axis=1)

    print('Summary - Number of matches: ', np.sum(df.summary_index==1))
    
    

In [510]:
# Check job description


def check_desc(df):
    
    # Fill missing values with an empty string
    df['Job_Description_1'] = df['Job_Description_1'].fillna('')
    #df['Job_Description_2'] = df['Job_Description_2'].fillna('')

    # Apply regex pattern to filter DataFrame
    df['desc_index'] = df.apply(lambda row: 1 if re.search(skills_pattern, row['Job_Description_1'], 
                                                              re.IGNORECASE) else 0, axis=1)

    print('Description - Number of matches: ', np.sum(df.desc_index==1))
    
    

In [511]:
# Check location based on states with highest liklihood of TS 

popular_states = ['Virginia','Maryland','California','District of Columbia','Florida','Colorado','Utah','Texas',
                 'Alabama','New Mexico','Ohio','Georgia','Hawaii', 'North Carolina','South Carolina', 'Arizona',
                  'Washington']


def check_location(df):
    
     # Fill missing values with an empty string
    df['State'] = df['State'].fillna('')

    # Apply regex pattern to filter DataFrame
    df['location_index'] = df.apply(lambda row: 1 if row['State'] in popular_states else 0, axis=1)
    
   
    print('Location - Number of matches: ', np.sum(df.location_index==1))

    

In [512]:
# Check names
# using 2000 most common US census data names

common_names = pd.read_csv('common_names.csv').dropna(subset=['Keyword'])

name_list = []

for name in common_names.Keyword:
    name_list.append(name)


def check_name(df):
    
     # Fill missing values with an empty string
    df['First Name'] = df['First Name'].fillna('')

    # Apply regex pattern to filter DataFrame
    df['name_index'] = df.apply(lambda row: 1 if row['First Name'].upper() in name_list else 0, axis=1)
    
   
    print('Names - Number of matches: ', np.sum(df.name_index==1))

In [637]:

#this code creates an equal number of positive and random cases from the imported dataset
#then creates the respective class labels
#returns a training data dataframe and labels array

def create_trainingdata(data,random_state = random_state):
    
    #check_skills(data)
    #check_summary(data)
    #check_desc(data)

    #summary_df = data[data.summary_index == 1]
    #skills_df = data[data.skills_index == 1]
    #desc_df = data[data.desc_index == 1]

    #training_data = pd.concat([summary_df, skills_df, desc_df])
    #training_data = training_data[columns]
    
    columns = ['Linkedin_url','Summary', 'Skills', 'title_1',
       'company_1', 'time_duration_1', 'Job_Description_1', 'title_2',
       'company_2', 'time_duration_2', 'Job_Description_2', 
       'First Name','Middle Name', 'Surname', 'City', 'State', 'Country']
    
    training_data = check_for_keywords(data)
    training_data = training_data[columns]
    
    val_labels = np.ones(len(training_data))

    num_numbers = len(training_data)
    min_num = 1
    max_num = len(data)

    training_data_indices = training_data.index
    
    random_generator = random.Random(random_state)

    '''
    random_numbers = []
    while len(random_numbers) < num_numbers:
        random_num = random_generator.randint(min_num, max_num)
        if random_num not in training_data_indices:
            random_numbers.append(random_num)
     '''       

    random_numbers = []
    target_length = 4 * num_numbers

    while len(random_numbers) < target_length:
        random_numbers.append(random_generator.randint(min_num, max_num))
        

    random_data_df = data.loc[random_numbers, columns]
    random_data_indices = random_data_df.index

    random_labels = np.zeros(len(random_numbers))

    training_data_df = pd.concat([training_data, random_data_df])
    training_data_labels = np.append(val_labels, random_labels)

    x_unlabeled_indices = data.index.difference(training_data_indices.union(random_data_indices))
    x_unlabeled_df = data.loc[x_unlabeled_indices, columns]

    return training_data_df, training_data_labels, x_unlabeled_df


In [514]:

def check_for_keywords(df):
    
    keywords = ['TS CLEARANCE','TS/SCI','TOP SECRET CLEARANCE','ACTIVE TS CLEARANCE', 'TS SCI','TS SCI CLEARANCE','POLYGRAPH','CI POLY','FULL SCOPE POLY']
    keyword_rows = []

    for index, row in df.iterrows():
        for column in df.columns:
            cell_value = str(row[column]).upper()
            if any(keyword in cell_value for keyword in keywords):
                keyword_rows.append(index)
                break  # Move to the next row if keyword found in any column

    print('# matches: ',len(keyword_rows))   
    
    output = df.iloc[keyword_rows]
    
    return output



In [515]:
# Define a function that, given a CVGridSearch object, finds the
# percentage difference between the best and worst scores
def find_score_variation(cv_model):
    all_scores = cv_model.cv_results_['mean_test_score']
    return( np.abs((max(all_scores) - min(all_scores))) * 100 / max(all_scores) )


# Define a function that checks, given a CVGridSearch object,
# whether the optimal parameters lie on the edge of the search
# grid
def find_opt_params_on_edge(cv_model):
    out = False
    
    for i in cv_model.param_grid:
        if cv_model.best_params_[i] in [ cv_model.param_grid[i][0], cv_model.param_grid[i][-1] ]:
            out = True
            break
            
    return out


default_seed = 0
output_file = "output_sample"

# Create a function to print a line to our output file

def dump_to_output(key, value):
    with open(output_file, "a") as f:
        f.write(",".join([str(default_seed), key, str(value)]) + "\n")

In [516]:
def fit_classification(model, X_train,y_train, X_test, y_test,
                          cv_parameters = {},
                          model_name = None,
                          random_state = 0,
                          output_to_file = True,
                          print_to_screen = True):
    
    np.random.seed(random_state)

    # --------------------------
    #   Step 2 - Fit the model
    # --------------------------

    cv_model = GridSearchCV(model, cv_parameters)
    
    start_time = time.time()
    cv_model.fit(X_train, y_train)
    end_time = time.time()
    
    best_model = cv_model.best_estimator_
    
    if print_to_screen:

        if model_name != None:
            print("=========================================================")
            print("  Model: " + model_name)
            print("=========================================================")

        print("Fit time: " + str(round(end_time - start_time, 2)) + " seconds")
        print("Optimal parameters:")
        print(cv_model.best_params_)
        print("")
    
    # -------------------------------
    #   Step 3 - Evaluate the model
    # -------------------------------
    
    # If possible, make probability predictions
    try:
        y_pred_probs = best_model.predict_proba(X_test)[:,1]
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_probs)
        
        probs_predicted = True
    except:
        probs_predicted = False
    
    # Make predictions; if we were able to find probabilities, use
    # the threshold that maximizes the accuracy in the training set.
    # If not, just use the learner's predict function
    if probs_predicted:
        y_train_pred_probs = best_model.predict_proba(X_train)[:,1]
        fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_pred_probs)
        
        true_pos_train = tpr_train*(y_train.sum())
        true_neg_train = (1 - fpr_train) *(1-y_train).sum()
        
        best_threshold_index = np.argmax(true_pos_train + true_neg_train)
        best_threshold = 1 if best_threshold_index == 0 else thresholds_train[ best_threshold_index ]
        
        if print_to_screen:
            print("Accuracy-maximizing threshold was: " + str(best_threshold))
        
        y_pred = (y_pred_probs > best_threshold)
    else:
        y_pred = best_model.predict(X_test)
    
    if print_to_screen:
        print("Accuracy: ", accuracy_score(y_test, y_pred))
        print(classification_report(y_test, y_pred, target_names =['No default', 'Default'], digits = 4))

    if print_to_screen:
        if probs_predicted:        
            plt.figure(figsize = (13, 4.5))
            plt.subplot(2, 2, 1)

            plt.title("ROC Curve (AUC = %0.2f)"% roc_auc_score(y_test, y_pred_probs))
            plt.plot(fpr, tpr, 'b')
            plt.plot([0,1],[0,1],'r--')
            plt.xlim([0,1]); plt.ylim([0,1])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')

            plt.subplot(2, 2, 3)

            plt.plot(thresholds, tpr, 'b', label = 'Sensitivity')
            plt.plot(thresholds, 1 -fpr, 'r', label = 'Specificity')
            plt.legend(loc = 'lower right')
            plt.xlim([0,1]); plt.ylim([0,1])
            plt.xlabel('Threshold')

            plt.subplot(2, 2, 2)

            fp_0, mpv_0 = calibration_curve(y_test, y_pred_probs, n_bins = 10)
            plt.plot([0,1], [0,1], 'k:', label='Perfectly calibrated')
            plt.plot(mpv_0, fp_0, 's-')
            plt.ylabel('Fraction of Positives')
            plt.xlim([0,1]); plt.ylim([0,1])
            plt.legend(loc ='upper left')
            
            plt.subplot(2, 2, 4)
            plt.hist(y_pred_probs, range=(0, 1), bins=10, histtype="step", lw=2)
            plt.xlim([0,1]); plt.ylim([0,20000])
            plt.xlabel('Mean Predicted Probability')
            plt.ylabel('Count')
            
            #plt.tight_layout()
            plt.show()
        
    # Additional Score Check
    if probs_predicted:
        y_train_score = y_train_pred_probs
    else:
        y_train_score = best_model.decision_function(X_train)
        
    #tau, p_value = kendalltau(y_train_score, data.grade[filter_train])
    #if print_to_screen:
        #print("")
        #print("Similarity to LC grade ranking: ", tau)
    
    if probs_predicted:
        brier_score = brier_score_loss(y_test, y_pred_probs)
        if print_to_screen:
            print("Brier score:", brier_score)
    
    # Return the model predictions, and the
    # test set
    # -------------------------------------
    out = {'model':best_model, 'y_pred_labels':y_pred}
    
    if probs_predicted:
        out.update({'y_pred_probs':y_pred_probs})
    else:
        y_pred_score = best_model.decision_function(X_test)
        out.update({'y_pred_score':y_pred_score})
 
    return out

### 2. Build Training Dataset

In [574]:
training_files = glob.glob('updated_datasets/*.csv')
print(training_files)

['updated_datasets/updated_lifull_12.2022.csv', 'updated_datasets/updated_lifull_419M3.csv', 'updated_datasets/updated_lifull_419_M2.csv', 'updated_datasets/updated_more_training_data.csv', 'updated_datasets/updated_lifull_419M1.csv', 'updated_datasets/updated_lifull_bing2_212.csv', 'updated_datasets/updated_lifull_322_0.csv', 'updated_datasets/updated_li_full_11_22.csv', 'updated_datasets/updated_lifull_303.csv', 'updated_datasets/updated_lifull_326-001.csv', 'updated_datasets/updated_lifull_bing1_212.csv', 'updated_datasets/updated_lifull_326-000.csv', 'updated_datasets/updated_lifull_326-002.csv', 'updated_datasets/updated_lifull_reg_212.csv', 'updated_datasets/updated_lifull_1_2023.csv', 'updated_datasets/updated_lifull_309_1.csv', 'updated_datasets/updated_lifull_309_2_3.csv']


In [638]:
training_df = []
labels = []
unlabeled_list = []

for filename in training_files:
    # Call create_trainingdata function
    data = pd.read_csv(filename) 
    df, label, unlabeled = create_trainingdata(data,random_state)
    
    # Append dataframe and label to respective lists
    training_df.append(df)
    labels.append(label)
    unlabeled_list.append(unlabeled)
 
print('number of training sets: ',len(training_df))

# matches:  399
# matches:  508
# matches:  703
# matches:  4106
# matches:  616
# matches:  373
# matches:  358
# matches:  386
# matches:  481
# matches:  398
# matches:  361
# matches:  393
# matches:  326
# matches:  260
# matches:  601
# matches:  333
# matches:  473
number of training sets:  17


In [639]:
### Combine all training data ###

#training_df.append(training_data)
#labels.append(training_labels)

training_data_new = pd.concat(training_df)

training_data_new.fillna({'connections':0, 'company_1':'NaN', 'company_2':'NaN',
                    'company_3':'NaN', 'company_4':'NaN', 'Summary':'','Skills':'','Job_Description_1':'',
                    'First Name':'','Surname':'','State':'','title_1':'',
                    'title_2':'', 'Surname':'', 'City':'', 'Country':''}, inplace=True)

print('training data: ', training_data_new.shape)

training_labels_new = np.concatenate(labels)
print('training data labels: ', training_labels_new.shape)

X_unlabeled = pd.concat(unlabeled_list)
print('unlabeled data: ', X_unlabeled.shape)

training data:  (55375, 17)
training data labels:  (55375,)
unlabeled data:  (3119785, 17)


In [640]:
#Remove invalid labels 

training_labels_new[training_data_new['company_1']=='NaN'] = 0
training_labels_new[training_data_new['company_1']=='Retired'] = 0
training_labels_new[training_data_new['company_1']=='Self-employed'] = 0
training_labels_new[training_data_new['company_1']=='USAA'] = 0
training_labels_new[training_data_new['company_1']=='Wells Fargo'] = 0

countries = ['United States']
training_labels_new[~training_data_new['Country'].isin(countries)] = 0

keywords = ['fitness','gym', 'health', 'therapeutic','finance']
mask = training_data_new['company_1'].str.lower().str.contains('|'.join(keywords))
training_labels_new[mask] = 0

In [None]:
#Write out training data
'''
df_out = training_data_new
df_out['Label'] = training_labels_new
df_out.to_csv('training data.csv', index=False)



#write out all data
training_files = glob.glob('datasets/*.csv')
print(training_files)

files = []
for filename in training_files:
    data = pd.read_csv(filename)
    files.append(data)
    
filename_out = 'Combined__datasets.csv'
combined_data = pd.concat(files, ignore_index=True)
combined_data.to_csv(filename_out, index=False)
'''

### 3. Train Classifier Models

In [491]:
#create training splits for model

#random_state = 0

check_jobtitle(training_data_new)
check_company(training_data_new)
check_skills(training_data_new)
check_summary(training_data_new)
check_desc(training_data_new)
check_location(training_data_new)
check_name(training_data_new)

training_data_final = training_data_new[['job_index', 'company_index','skills_index','summary_index','location_index'
                                    ,'name_index','desc_index']]

X_train, X_test, y_train, y_test = train_test_split(training_data_final, training_labels_new, test_size=0.1, 
                                                    random_state=random_state)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

Job title - number of pos matches:  864
Company - Number of matches:  2616
Skills - Number of matches:  179
Summary - Number of matches:  5927
Description - Number of matches:  359
Location - Number of matches:  13341
Names - Number of matches:  16508
(19935, 7)
(19935,)
(2215, 7)
(2215,)


In [None]:
#check for collinearity

df = training_data_final
df['Target'] = training_labels_new

# Calculate correlation matrix
correlation_matrix = df.corr()
print("Correlation Matrix:")
print(correlation_matrix)

training_data_final = training_data_final.drop(columns=['Target'])
scaler = StandardScaler()
scaled_X = scaler.fit_transform(training_data_final)

# Calculate VIF
vif = pd.DataFrame()
vif["Variable"] = training_data_final.columns
vif["VIF"] = [variance_inflation_factor(scaled_X, i) for i in range(scaled_X.shape[1])]

print("\nVariance Inflation Factor (VIF):")
print(vif)


pca = PCA()
pca.fit(scaled_X)
explained_variance_ratio = pca.explained_variance_ratio_

print("\nExplained Variance Ratio:")
for i, ratio in enumerate(explained_variance_ratio):
    print(f"Principal Component {i+1}: {ratio:.4f}")

variable_names = training_data_final.columns

loadings = pca.components_

# Display loadings for each principal component
print("\nPrincipal Component Loadings:")
for i, loading in enumerate(loadings):
    print(f"Principal Component {i+1}:")
    for j, weight in enumerate(loading):
        print(f"{variable_names[j]}: {weight:.4f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=random_state, 
                              max_features='sqrt', min_samples_leaf=1, min_samples_split=2)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

print('rfc Accuracy: ',np.mean(y_pred == y_test))
print("Recall Score: ", recall_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))

In [None]:
# Calculate feature importances
feature_importances = rfc.feature_importances_

print("Feature Importances:")
for feature, importance in zip(X_train.columns, feature_importances):
    print(feature, ":", importance)


In [None]:
## Train and test a random forest classifier

random_forest = RandomForestClassifier(n_estimators=100,random_state=random_state)
cv_parameters = {'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2']}

random_forest = fit_classification(random_forest, X_train,y_train, X_test, y_test,
                          cv_parameters, model_name = 'Random Forest',output_to_file = False)

In [None]:
## Train and test a multi-layer perceptron classifier

mlp = MLPClassifier(random_state=random_state)
cv_parameters = {'hidden_layer_sizes': (100,500,(500,500)),
                'activation':['identity','logistic','tanh','relu'],
                 'alpha': [0.0001, 0.001, 0.01],
                 'solver': ['lbfgs','sgd','adam']}

mlp = fit_classification(mlp, X_train,y_train, X_test, y_test, cv_parameters, 'Multi-Layer Perceptron',
                       output_to_file = False)

In [None]:
## Train and test a Gradient Boosting Classifier

GradientBoostingClassifier = GradientBoostingClassifier(random_state=random_state)
cv_parameters = {
    'n_estimators': [100, 500],  
    'learning_rate': [0.01,0.1, 0.5, 1.0],  
    'max_depth': [3, 5, 10],
    'max_features': [1.0, 'sqrt'],
    'min_samples_split': [0.1,0.25,0.5,1.0]
}

GBC = fit_classification(GradientBoostingClassifier, X_train,y_train, X_test, y_test, cv_parameters, 
                         'GradientBoostingClassifier', output_to_file = False)

### 4. Train CatBoost Model

In [641]:
### Catboost Training Data ###

check_jobtitle(training_data_new)
check_company(training_data_new)
check_location(training_data_new)
check_skills(training_data_new)
check_summary(training_data_new)
check_desc(training_data_new)
#check_name(training_data_new)

catboost_columns = ['job_index', 'skills_index','summary_index','location_index','company_index'
                    ,'desc_index', 'title_1', 'company_1','State','Country']

training_data_final2 = training_data_new[catboost_columns]

#'First Name','City','title_2', 'company_2'

X_train, X_test, y_train, y_test = train_test_split(training_data_final2, training_labels_new, test_size=0.2, 
                                                    random_state=random_state)


print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

categorical_features_indices = np.where(X_train.dtypes != int)[0]
categorical_features_indices
y_train = pd.to_numeric(y_train).astype('int32')
y_test = pd.to_numeric(y_test).astype('int32')

Job title - number of pos matches:  1349
Company - Number of matches:  3515
Location - Number of matches:  31358
Skills - Number of matches:  180
Summary - Number of matches:  6221
Description - Number of matches:  375
(44300, 10)
(44300,)
(11075, 10)
(11075,)


In [642]:
#clean unlabeled data

check_jobtitle(X_unlabeled)
check_company(X_unlabeled)
check_location(X_unlabeled)
check_skills(X_unlabeled)
check_summary(X_unlabeled)
check_desc(X_unlabeled)

X_unlabeled.fillna({'connections':0, 'company_1':'NaN', 'company_2':'NaN',
                    'company_3':'NaN', 'company_4':'NaN', 'Summary':'','Skills':'','Job_Description_1':'',
                    'First Name':'','Surname':'','State':'','title_1':'',
                    'title_2':'', 'Surname':'', 'City':'', 'Country':''}, inplace=True)

X_unlabeled = X_unlabeled[catboost_columns]

#categorical_features_indices_unlabeled = np.where(X_unlabeled.dtypes != int)[0]

Job title - number of pos matches:  44965
Company - Number of matches:  78876
Location - Number of matches:  1667746
Skills - Number of matches:  0
Summary - Number of matches:  0
Description - Number of matches:  0


In [643]:
from catboost import CatBoostClassifier, Pool, metrics, cv

class_weights = [1,1.5]
#class_weights = [1,1]

'''
CatBoost = CatBoostClassifier(
    iterations=1000,
    random_seed=random_state,
    learning_rate=0.1,
    verbose=False,
    class_weights=class_weights,
    eval_metric='Precision',
    custom_loss=['Precision'])
    '''

CatBoost = CatBoostClassifier(
    iterations=1000,
    random_seed=random_state,
    learning_rate=0.1,
    verbose=False,
    class_weights=class_weights,
    eval_metric='Recall',
    custom_loss=['Recall'])

    
# Train the model on labeled data
CatBoost.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_test, y_test),
    )

cv_params = CatBoost.get_params()
cv_params.update({
        'loss_function': metrics.Logloss()
    })
cv_data = cv(
        Pool(X_train, y_train, cat_features=categorical_features_indices),
        cv_params
    )

#print(CatBoost.get_feature_importance(prettified=True))

# Generate pseudo-labels for unlabeled data using the model
pseudo_labels = CatBoost.predict(X_unlabeled)


# Combine labeled and unlabeled data with their respective pseudo-labels
X_combined = np.concatenate((training_data_final2, X_subset), axis=0)
y_combined = np.concatenate((training_labels_new, pseudo_labels), axis=0)

# Retrain the model on the combined dataset
#CatBoost.fit(X_combined, y_combined, cat_features=categorical_features_indices,verbose=False)
    
CatBoost.fit(
    X_combined, y_combined,
    cat_features=categorical_features_indices,
    eval_set=(X_test, y_test),
    )

cv_params = CatBoost.get_params()
cv_params.update({
        'loss_function': metrics.Logloss()
    })
cv_data = cv(
        Pool(X_combined, y_combined, cat_features=categorical_features_indices),
        cv_params
    )
    

# Evaluate the performance on the validation set
accuracy = CatBoost.score(X_test, y_test)
print("Accuracy on validation set: ", accuracy)
    
y_pred_val = CatBoost.predict(X_test)
precision = precision_score(y_test, y_pred_val)
recall = recall_score(y_test, y_pred_val)
print("Precision: ", precision, "Recall: ", recall)
    

Training on fold [0/3]

bestTest = 0.7066234701
bestIteration = 236

Training on fold [1/3]

bestTest = 0.6935541952
bestIteration = 132

Training on fold [2/3]

bestTest = 0.7011163126
bestIteration = 242



CatBoostError: Length of label=3175160 and length of data=3124450 is different.

In [None]:
from catboost import CatBoostClassifier, Pool, metrics, cv

class_weights = [1,1.5]


CatBoost = CatBoostClassifier(
    iterations=100,
    random_seed=random_state,
    learning_rate=0.1,
    verbose=False,
    class_weights=class_weights,eval_metric='Recall',
    custom_loss=['Recall'])

num_iterations = 5
best_precision = 0.0
best_recall = 0.0
portion = 0.25

for i in range(num_iterations):
    
    # Calculate the number of unlabeled samples to use based on the portion
    num_samples = int(portion * (i + 1) * len(X_unlabeled))
    X_subset = X_unlabeled[:num_samples]
    
    CatBoost.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_test, y_test),
    )
    cv_params = CatBoost.get_params()
    cv_params.update({
        'loss_function': metrics.Logloss()
    })
    cv_data = cv(
        Pool(X_train, y_train, cat_features=categorical_features_indices),
        cv_params
    )


    # Generate pseudo-labels for unlabeled data using the model
    
    pseudo_labels = CatBoost.predict(X_subset)

    # Combine labeled and unlabeled data with their respective pseudo-labels
    X_combined = np.concatenate((training_data_final2, X_subset), axis=0)
    y_combined = np.concatenate((training_labels_new, pseudo_labels), axis=0)

    # Retrain the model on the combined dataset
    CatBoost.fit(X_combined, y_combined, cat_features=categorical_features_indices,verbose=False)

    # Evaluate the performance on the validation set
    accuracy = CatBoost.score(X_test, y_test)
    print("Iteration", i+1, "- Accuracy on validation set:", accuracy)
    
    y_pred_val = CatBoost.predict(X_test)
    precision = precision_score(y_test, y_pred_val)
    recall = recall_score(y_test, y_pred_val)
    print("Iteration", i+1, "- Precision:", precision, "- Recall:", recall)
    
    # Check for improvement in precision and recall
    if precision > best_precision and recall > best_recall:
        best_precision = precision
        best_recall = recall
    '''    
    else:
        # If precision and recall do not improve, break the loop
        print("No improvement in precision and recall. Stopping the iterations.")
        break
    '''

# Print the best precision and recall achieved
print("Best Precision:", best_precision, "- Best Recall:", best_recall)

In [None]:
calibrated_probs = CatBoost.predict_proba(X_test)[:, 1]

# Plot the calibration curve
true_probs, predicted_probs = calibration_curve(y_test, calibrated_probs, n_bins=10)
plt.plot(predicted_probs, true_probs, marker='o', linewidth=1, label='Calibration Curve')
plt.plot([0, 1], [0, 1], linestyle='--', color='r', label='Ideal Calibration')
plt.xlabel('Predicted probability')
plt.ylabel('True probability')
plt.legend()
plt.show()

In [None]:
#check performance
predictions = CatBoost.predict(X_test)

y_true = y_test.copy()
target_names = ['Non TS', 'TS']

print(classification_report(y_true, predictions, target_names=target_names))

In [None]:
final_predictions = CatBoost.predict(X_combined)

In [None]:
total = len(final_predictions)
print(calc)

calc = np.sum(final_predictions==1)
print(calc/total)

### 5. Make predictions on full dataset

In [None]:
# input .csv file and outputs TS predictions in the 'predictions' folder

def predictions(filename):
    
    testing_data = pd.read_csv('updated_datasets/'+filename)
    test_dataset = testing_data
    print('Filename: ',filename)
    check_jobtitle(test_dataset)
    check_company(test_dataset)
    check_skills(test_dataset)
    check_summary(test_dataset)
    check_desc(test_dataset)
    check_location(test_dataset)
    check_name(test_dataset)
    print('Results:')
    
    test_dataset.fillna({'connections':0, 'company_1':'', 'company_2': '',
                    'company_3':'', 'company_4':'NA', 'Summary':'','Skills':'','Job_Description_1':'',
                    'First Name':'','Surname':'','State':'','title_1':'',
                    'title_2':'', 'Surname':'', 'City':'', 'Country':''}, inplace=True)

    test_dataset_final = test_dataset[['job_index', 'company_index','skills_index','summary_index','location_index'
                                    ,'name_index','desc_index']]
    
    
    #pred_ts = rfc.predict(test_dataset_final)
    #pred_ts = best_model.predict(test_dataset_final)
    #pred_ts = mlp['model'].predict(test_dataset_final)
    #pred_ts = random_forest['model'].predict(test_dataset_final)
    #pred_ts = GBC['model'].predict(test_dataset_final)
    
    ### For CatBoost Classifiers ###
    test_dataset_final2 = test_dataset[catboost_columns]
    
    pred_ts = CatBoost.predict(test_dataset_final2)
    proba = CatBoost.predict_proba(test_dataset_final2)
    

    
    print('number of predicted TS:', np.sum(pred_ts == 1))
    rate = np.sum(pred_ts == 1)/len(pred_ts)
    print('percentage of predicted TS:', rate)
    print()

    # write out predictions to csv
    output = test_dataset.loc[pred_ts == 1]
    proba_df = pd.DataFrame(proba)
    output_proba = proba_df.loc[pred_ts == 1, 1]
    output['confidence'] = output_proba

    output_columns = ['Linkedin_url','Summary', 'Skills', 'title_1',
           'company_1', 'time_duration_1', 'Job_Description_1',
           'First Name','Middle Name', 'Surname', 'City', 'State', 'Country',
           'confidence']

    output = output[output_columns]
    filename_out = 'predictions/'+filename[:-4]+'__predictions.csv'
    output.to_csv(filename_out, index=False)
    
    return rate, output

In [629]:
dataset_files = glob.glob('updated_datasets/*.csv')
dataset_files = [os.path.basename(file) for file in dataset_files]
print(dataset_files)

['updated_lifull_12.2022.csv', 'updated_lifull_419M3.csv', 'updated_lifull_419_M2.csv', 'updated_more_training_data.csv', 'updated_lifull_419M1.csv', 'updated_lifull_bing2_212.csv', 'updated_lifull_322_0.csv', 'updated_li_full_11_22.csv', 'updated_lifull_303.csv', 'updated_lifull_326-001.csv', 'updated_lifull_bing1_212.csv', 'updated_lifull_326-000.csv', 'updated_lifull_326-002.csv', 'updated_lifull_reg_212.csv', 'updated_lifull_1_2023.csv', 'updated_lifull_309_1.csv', 'updated_lifull_309_2_3.csv']


In [None]:
output_df = []
total_rate = []
for idx, file in enumerate(dataset_files):
    print(idx,' out of ',len(dataset_files)-1)
    rate, output = predictions(file)
    output_df.append(output)
    total_rate.append(rate)

# combine all predictions into one file    
filename_out = 'predictions/Combined__predictions.csv'
combined_df = pd.concat(output_df, ignore_index=True)
print(combined_df.shape)
print(sum(total_rate)/len(total_rate))
#combined_df.to_csv(filename_out, index=False)  

In [None]:
# Averaged Frequency results across all datasets

freq_sum = pd.Series()
count = 0

# Accumulate frequencies and associated companies
for output in output_df:
    freq = output['company_1'].value_counts().head(25)
    freq_sum = freq_sum.add(freq, fill_value=0)
    count += 1

sorted_freq = freq_sum.sort_values(ascending=False)
average_freq = sorted_freq / (count)
associated_company = average_freq.idxmax()

print("Average Frequency:")
print(average_freq[:25])

### CatBoost ###

In [634]:
output = output_df[0]
freq = output['company_1'].value_counts()
print(freq[:30])
print(len(output))

Booz Allen Hamilton                        142
Northrop Grumman                            94
L3Harris Technologies                       43
United States Air Force                     39
CACI International Inc                      28
Raytheon Technologies                       26
Epic                                        24
SAIC                                        18
Leidos                                      18
Raytheon Intelligence & Space               17
Lockheed Martin                             17
Amazon Web Services (AWS)                   15
Microsoft                                   14
General Dynamics Information Technology     13
Peraton                                     12
ManTech                                     12
C3 AI                                       11
MITRE                                       11
Ball Aerospace                              11
PayPal                                      10
Qualcomm                                    10
Deloitte     

In [None]:
original_freq = output[0,'company_1'].value_counts()

#print(original_freq)

print(freq['Lockheed Martin']/original_freq['Lockheed Martin'])

#print(freq['Microsoft']/original_freq['Microsoft'])
#freq['Amazon']