# The Spies Among US

### Which citizens should be placed under close surveillance?

__________________________

In [240]:
# Setup
import pandas as pd
import numpy as np
import pandas_profiling as pp
import seaborn as sns
from datetime import datetime
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
'''from model_library import baseline
from model_library import xgboost
from model_library import svc
from model_library import naive_bayes
from model_library import knn
from model_library import logistic_regression'''
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import label_binarize
import joblib
import warnings
warnings.filterwarnings("ignore")

In [241]:
# Import data
# Both CSV files are read relative to the current working directory.
# df_train carries the 'Spy' target column; df_test has the same feature
# columns but no target and is only used for the final submission predictions.
df_train = pd.read_csv('espionage_data.csv')
df_test = pd.read_csv('espionage_data_test.csv')

In [242]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       8000 non-null   int64  
 1   ID_ORIGINAL              8000 non-null   int64  
 2   Gender                   8000 non-null   object 
 3   Foreign_Citizenship      7862 non-null   object 
 4   Age                      8000 non-null   int64  
 5   Frequent_Traveler        7923 non-null   object 
 6   Cellphone_Usage          8000 non-null   object 
 7   Household_Size           7670 non-null   float64
 8   Spy                      8000 non-null   int64  
 9   Satisfaction_Level       7670 non-null   float64
 10  Occupation               7876 non-null   object 
 11  Political_Participation  7876 non-null   object 
 12  Social_Person            7924 non-null   object 
 13  Area_Residence           7924 non-null   object 
 14  Military_Service        

In [243]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       493 non-null    int64  
 1   ID_ORIGINAL              493 non-null    int64  
 2   Gender                   493 non-null    object 
 3   Foreign_Citizenship      486 non-null    object 
 4   Age                      493 non-null    int64  
 5   Frequent_Traveler        490 non-null    object 
 6   Cellphone_Usage          493 non-null    object 
 7   Household_Size           476 non-null    float64
 8   Satisfaction_Level       476 non-null    float64
 9   Occupation               489 non-null    object 
 10  Political_Participation  489 non-null    object 
 11  Social_Person            485 non-null    object 
 12  Area_Residence           485 non-null    object 
 13  Military_Service         485 non-null    object 
 14  Household_Income         4

In [244]:
# Generate profiling report
#df_train['Spy'] = df_train['Spy'].astype('float64')
#df_train['Spy'] = df_train['Spy'].astype('object')
#prof_train = pp.ProfileReport(df_train)
#prof_train.to_file(output_file='output_train.html')

#prof_test = pp.ProfileReport(df_test)
#prof_test.to_file(output_file='output_test.html')

In [245]:
df_train.head()

Unnamed: 0,ID,ID_ORIGINAL,Gender,Foreign_Citizenship,Age,Frequent_Traveler,Cellphone_Usage,Household_Size,Spy,Satisfaction_Level,Occupation,Political_Participation,Social_Person,Area_Residence,Military_Service,Household_Income
0,1000,467329,Female,No,40,Yes,Low,1.0,1,4.0,Government,No involvement,No,City,Never,7000
1,1001,461212,Female,No,30,No,Low,6.0,0,1.0,Nothing,No involvement,No,City,Never,19610
2,1002,466216,Male,No,29,No,Low,4.0,0,1.0,Private company,Strong involvement,Yes,Country-side,Never,8261
3,1003,462613,Female,Yes,35,No,Average,5.0,0,1.0,Private company,No involvement,Yes,Country-side,Never,7000
4,1004,465709,Male,Yes,68,Yes,Average,2.0,1,4.0,Private company,Strong involvement,No,City,Never,8261


In [246]:
df_test.head()

Unnamed: 0,ID,ID_ORIGINAL,Gender,Foreign_Citizenship,Age,Frequent_Traveler,Cellphone_Usage,Household_Size,Satisfaction_Level,Occupation,Political_Participation,Social_Person,Area_Residence,Military_Service,Household_Income
0,9000,460139,Male,No,26,No,Low,6.0,1.0,Private company,Unknown,Yes,City,Intervention in Libya,7566.0
1,9001,460648,Female,,29,No,Low,1.0,3.0,Nothing,No involvement,Yes,City,Intervention in Libya,10088.0
2,9002,460835,Female,Yes,28,Yes,Low,1.0,3.0,Private company,Unknown,Yes,City,Intervention in Libya,0.0
3,9003,461613,Female,No,25,Yes,Low,4.0,1.0,Private company,Some involvement,Yes,City,Intervention in Libya,10088.0
4,9004,461721,Male,No,26,Yes,Low,5.0,1.0,Private company,Strong involvement,Yes,City,Intervention in Libya,8827.0


## Exploration & Understanding

In [247]:
# Check for duplicates
# duplicated() marks rows that repeat an earlier row across all columns, so each
# print shows only the repeated rows; "Empty DataFrame" means there are none.
print(df_train[df_train.duplicated()])
print(df_test[df_test.duplicated()])

Empty DataFrame
Columns: [ID, ID_ORIGINAL, Gender, Foreign_Citizenship, Age, Frequent_Traveler, Cellphone_Usage, Household_Size, Spy, Satisfaction_Level, Occupation, Political_Participation, Social_Person, Area_Residence, Military_Service, Household_Income]
Index: []
Empty DataFrame
Columns: [ID, ID_ORIGINAL, Gender, Foreign_Citizenship, Age, Frequent_Traveler, Cellphone_Usage, Household_Size, Satisfaction_Level, Occupation, Political_Participation, Social_Person, Area_Residence, Military_Service, Household_Income]
Index: []


In [248]:
# Check for missing values
# Percentage of missing entries per column, for each data set
train_null = df_train.isna().sum() / len(df_train) * 100
test_null = df_test.isna().sum() / len(df_test) * 100

# Keep only the columns that actually have gaps
train_missing = train_null[train_null > 0]
test_missing = test_null[test_null > 0]

print('--- TRAIN ---\n', train_missing)
print('\n--- TEST ---\n', test_missing)

# Names of the affected columns
train_nulls = train_missing.index.tolist()
test_nulls = test_missing.index.tolist()

--- TRAIN ---
 Foreign_Citizenship        1.7250
Frequent_Traveler          0.9625
Household_Size             4.1250
Satisfaction_Level         4.1250
Occupation                 1.5500
Political_Participation    1.5500
Social_Person              0.9500
Area_Residence             0.9500
Military_Service           0.9500
dtype: float64

--- TEST ---
 Foreign_Citizenship        1.419878
Frequent_Traveler          0.608519
Household_Size             3.448276
Satisfaction_Level         3.448276
Occupation                 0.811359
Political_Participation    0.811359
Social_Person              1.622718
Area_Residence             1.622718
Military_Service           1.622718
Household_Income           7.302231
dtype: float64


## Pre-processing

In [249]:
# Label encoding
def encode_features(df,s):
    """Label-encode column ``s`` of ``df`` in place and return the fitted encoder.

    A new ``LabelEncoder`` is fitted on ``df[s]``; the integer codes are written
    to a new column named ``'CD_' + s`` on ``df`` itself (no copy is made).
    """
    encoder = preprocessing.LabelEncoder().fit(df[s])
    codes = encoder.transform(df[s])
    df['CD_' + s] = codes
    return encoder

# Pre-process data
def prepare_data(df):
    """Clean, label-encode and (when a target is present) split a raw data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data as loaded from CSV. It is not modified; all work is done on a copy.

    Returns
    -------
    tuple
        ``(df_prep, X, y, X_train, X_test, y_train, y_test)``.
        ``df_prep`` is the cleaned copy including the ``CD_*`` encoded columns and
        ``X`` holds the model features. If ``df`` has a ``'Spy'`` column, ``y`` is
        that column and the last four items are a stratified split of ``X``/``y``
        with 33% held out; otherwise ``y`` and the four split items are empty lists.
    """
    df_prep = df.copy()

    ## MISSING VALUES ##
    # Missing categorical values become their own 'Unknown' category.
    fill_unknown_cols = ['Area_Residence', 'Foreign_Citizenship', 'Frequent_Traveler',
                         'Occupation', 'Political_Participation', 'Military_Service',
                         'Social_Person']
    for col in fill_unknown_cols:
        df_prep[col] = df_prep[col].fillna('Unknown')

    ## MISSING VALUES + DATA TYPES (numeric) ##
    # TODO: check whether a -1 sentinel is a good solution for these columns.
    # The sentinel is numeric (not the string '-1') so the columns never pass
    # through object dtype before being cast to float64.
    numeric_cols = ['Household_Size', 'Satisfaction_Level', 'Household_Income']
    for col in numeric_cols:
        df_prep[col] = df_prep[col].fillna(-1).astype('float64')

    df_prep['ID_ORIGINAL'] = df_prep['ID_ORIGINAL'].astype('int')

    ## ENCODING ##
    # NOTE(review): encode_features fits a fresh encoder on every call, so the
    # integer codes of frames prepared in separate calls (e.g. train vs. the
    # submission set) only line up when both contain exactly the same category
    # values per column — verify before predicting on a separately prepared frame.
    enc_cols = ['Gender', 'Foreign_Citizenship', 'Frequent_Traveler', 'Cellphone_Usage',
                'Occupation', 'Political_Participation', 'Social_Person', 'Area_Residence',
                'Military_Service']
    for col in enc_cols:
        encode_features(df_prep, col)

    ## FILTER ##
    # NOTE(review): ID_ORIGINAL stays in the feature matrix because the
    # submission cell reads it back from X_val; as a model input it looks like a
    # plain identifier — confirm it should be used for training.
    feature_list = ['ID_ORIGINAL', 'Age', 'Household_Size', 'Satisfaction_Level',
                    'Household_Income', 'CD_Gender', 'CD_Foreign_Citizenship',
                    'CD_Frequent_Traveler', 'CD_Cellphone_Usage', 'CD_Occupation',
                    'CD_Political_Participation', 'CD_Social_Person', 'CD_Area_Residence',
                    'CD_Military_Service']

    X = df_prep[feature_list]

    if 'Spy' in df_prep.columns:
        y = df_prep['Spy']

        # Split the labelled data into train and hold-out sets, keeping the class balance
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=42, stratify=y)
    else:
        # Unlabelled data: nothing to split, return empty placeholders
        y, X_train, X_test, y_train, y_test = [], [], [], [], []

    return df_prep, X, y, X_train, X_test, y_train, y_test

In [250]:
# Labelled data: 'Spy' is present, so prepare_data also returns a stratified
# train / hold-out split (X_train, X_test, y_train, y_test).
df_train_prep, X, y, X_train, X_test, y_train, y_test = prepare_data(df_train)
# Submission data: no 'Spy' column, so only df_test_prep and X_val carry data;
# the underscore-suffixed names receive empty lists.
df_test_prep, X_val, y_, X_train_, X_test_, y_train_, y_test_ = prepare_data(df_test)

## Modelling

In [251]:
# Build class to store models and parameters
import xgboost as xgb
from sklearn.dummy import DummyClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from skopt.space import Real, Categorical, Integer

class classification_models:
    """Container holding one instance of every model-definition class.

    Each attribute is an instance of the module-level class of the same name
    (``baseline``, ``svc``, ``naive_bayes``, ``knn``, ``logistic_regression``,
    ``xgboost``), giving access to its ``model``, ``parameters`` and
    ``best_model`` members.
    """

    def __init__(self):
        # The definition classes are module-level names, not attributes of this
        # class, so they are referenced directly; looking them up through
        # ``self`` raised AttributeError as soon as the class was instantiated.
        self.baseline = baseline()
        self.svc = svc()
        self.naive_bayes = naive_bayes()
        self.knn = knn()
        self.logistic_regression = logistic_regression()
        self.xgboost = xgboost()


class baseline:
    """Random-guess reference model (``DummyClassifier``) and its search space.

    All members are class-level, so the class itself can be handed to the
    training code without being instantiated.
    """

    # Unfitted estimator used as the pipeline's 'model' step.
    model = DummyClassifier()

    # Search space; the 'model__' prefix addresses the pipeline step named 'model'.
    parameters = {
                   'model__strategy': ['uniform'],
                   'model__random_state': [8]
                 }

    @staticmethod
    def best_model(params):
        """Build a ``DummyClassifier`` from a parameter mapping.

        ``params`` may use plain estimator argument names or the
        'model__'-prefixed names used in ``parameters`` (the form a pipeline
        search reports); the prefix is stripped before the estimator is built,
        because the estimator itself rejects prefixed names.
        """
        prefix = 'model__'
        kwargs = {(key[len(prefix):] if key.startswith(prefix) else key): value
                  for key, value in params.items()}
        return DummyClassifier(**kwargs)


class svc:
    """Support vector classifier (``svm.SVC``) and its search space.

    All members are class-level, so the class itself can be handed to the
    training code without being instantiated.
    """

    # Unfitted estimator used as the pipeline's 'model' step.
    model = svm.SVC()

    # Search space; the 'model__' prefix addresses the pipeline step named 'model'.
    parameters = {'model__C': [0.1, 1, 10, 100],
                  'model__gamma': [1, 0.1, 0.01, 0.001],
                  'model__kernel': ['rbf', 'poly', 'sigmoid']}

    @staticmethod
    def best_model(params):
        """Build an ``svm.SVC`` from a parameter mapping.

        ``params`` may use plain estimator argument names or the
        'model__'-prefixed names used in ``parameters`` (the form a pipeline
        search reports); the prefix is stripped before the estimator is built,
        because the estimator itself rejects prefixed names.
        """
        prefix = 'model__'
        kwargs = {(key[len(prefix):] if key.startswith(prefix) else key): value
                  for key, value in params.items()}
        return svm.SVC(**kwargs)


class naive_bayes:
    """Gaussian naive Bayes (``GaussianNB``) and its search space.

    All members are class-level, so the class itself can be handed to the
    training code without being instantiated.
    """

    # Unfitted estimator used as the pipeline's 'model' step.
    model = GaussianNB()

    # Search space; the 'model__' prefix addresses the pipeline step named 'model'.
    parameters = {
                'model__var_smoothing': [1e-2, 1e-4, 1e-6,1e-8,1e-10,1e-12,1e-14,1e-16]
                 }

    @staticmethod
    def best_model(params):
        """Build a ``GaussianNB`` from a parameter mapping.

        ``params`` may use plain estimator argument names or the
        'model__'-prefixed names used in ``parameters`` (the form a pipeline
        search reports); the prefix is stripped before the estimator is built,
        because the estimator itself rejects prefixed names.
        """
        prefix = 'model__'
        kwargs = {(key[len(prefix):] if key.startswith(prefix) else key): value
                  for key, value in params.items()}
        return GaussianNB(**kwargs)

class knn:
    """k-nearest-neighbours classifier and its Bayesian search space.

    All members are class-level, so the class itself can be handed to the
    training code without being instantiated.
    """

    # Unfitted estimator used as the pipeline's 'model' step.
    model = KNeighborsClassifier()

    # skopt search dimensions; the 'model__' prefix addresses the pipeline step named 'model'.
    parameters = {'model__n_neighbors': Integer(3, 20),
                  'model__weights': Categorical(categories=['uniform', 'distance']),
                  'model__algorithm': Categorical(categories=['auto', 'ball_tree', 'kd_tree', 'brute']),
                  'model__leaf_size': Integer(5, 50)
                  }

    @staticmethod
    def best_model(params):
        """Build a ``KNeighborsClassifier`` from a parameter mapping.

        Previously ``params`` was ignored and a default classifier was always
        returned; the parameters are now applied, as in the sibling model
        classes. ``params`` may use plain estimator argument names or the
        'model__'-prefixed names used in ``parameters`` (the form a pipeline
        search reports); the prefix is stripped before the estimator is built.
        """
        prefix = 'model__'
        kwargs = {(key[len(prefix):] if key.startswith(prefix) else key): value
                  for key, value in params.items()}
        return KNeighborsClassifier(**kwargs)

class logistic_regression:
    """Logistic regression and its search space.

    All members are class-level, so the class itself can be handed to the
    training code without being instantiated.
    """

    # Unfitted estimator used as the pipeline's 'model' step.
    model = LogisticRegression()

    # Search space; the 'model__' prefix addresses the pipeline step named 'model'.
    # - The solver is pinned to 'saga' because the default solver does not
    #   support the 'l1' and 'elasticnet' penalties listed below.
    # - 'elasticnet' needs an explicit l1_ratio; it is ignored for the other penalties.
    # - The duplicated 0.01 entry (1e-2 and 0.01) was removed from the C grid.
    # NOTE(review): the string 'none' for "no penalty" is version-dependent in
    # scikit-learn — confirm it is accepted by the installed release.
    parameters = {
                   'model__solver': ['saga'],
                   'model__penalty': ['l1', 'l2', 'elasticnet', 'none'],
                   'model__l1_ratio': [0.5],
                   'model__C': [1e-4, 1e-2, 0.1, 0.3, 0.5, 0.8, 1],
                   'model__random_state': [8]
                 }

    @staticmethod
    def best_model(params):
        """Build a ``LogisticRegression`` from a parameter mapping.

        ``params`` may use plain estimator argument names or the
        'model__'-prefixed names used in ``parameters`` (the form a pipeline
        search reports); the prefix is stripped before the estimator is built,
        because the estimator itself rejects prefixed names.
        """
        prefix = 'model__'
        kwargs = {(key[len(prefix):] if key.startswith(prefix) else key): value
                  for key, value in params.items()}
        return LogisticRegression(**kwargs)

class xgboost:
    """Gradient-boosted trees (``xgb.XGBClassifier``) and its Bayesian search space.

    All members are class-level, so the class itself can be handed to the
    training code without being instantiated.
    """

    # Unfitted estimator used as the pipeline's 'model' step.
    model = xgb.XGBClassifier()

    # skopt search dimensions; the 'model__' prefix addresses the pipeline step named 'model'.
    # 'min_samples_leaf' and 'min_samples_split' were removed: they are
    # scikit-learn tree parameter names that XGBClassifier does not define, so
    # they only added dimensions to the search without influencing the model.
    # NOTE(review): XGBoost's closest controls appear to be 'min_child_weight'
    # and 'gamma' — add them here if that kind of regularisation was intended.
    parameters = {
                   'model__learning_rate': Real(0.1,1.0,'uniform'),
                   'model__max_depth': Integer(2, 100),
                   'model__subsample': Real(0.1,1.0,'uniform'),
                   'model__n_estimators': Integer(10, 100),
                   'model__random_state': [16],
                 }

    @staticmethod
    def best_model(params):
        """Build an ``xgb.XGBClassifier`` from a parameter mapping.

        ``params`` may use plain estimator argument names or the
        'model__'-prefixed names used in ``parameters`` (the form a pipeline
        search reports); the prefix is stripped before the estimator is built.
        """
        prefix = 'model__'
        kwargs = {(key[len(prefix):] if key.startswith(prefix) else key): value
                  for key, value in params.items()}
        return xgb.XGBClassifier(**kwargs)


In [252]:
# Function to build models
def build_model(X,y,niter,model_list=None,random_state=None):
    """Tune each model definition with a Bayesian hyper-parameter search.

    Every entry of ``model_list`` is wrapped in a ``MinMaxScaler`` -> model
    pipeline and tuned with ``BayesSearchCV`` (5-fold stratified CV, F1 scoring,
    refit on the full ``X``/``y`` with the best parameters).

    Parameters
    ----------
    X, y :
        Training features and target.
    niter : int
        Number of search iterations per model (``n_iter`` of ``BayesSearchCV``).
    model_list : iterable, optional
        Model definitions exposing ``model`` and ``parameters`` attributes.
        When omitted, ``[baseline, svc, naive_bayes, knn, logistic_regression,
        xgboost]`` is used. (The previous default was the ``list`` type itself,
        which failed as soon as it was iterated.)
    random_state : int or None, optional
        Forwarded to ``BayesSearchCV`` to make the search reproducible.
        ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    tuple of lists
        ``(gsearchs, best_models, scores, best_params, cv_results)`` with one
        entry per model, in the order of ``model_list``.
    """
    if model_list is None:
        model_list = [baseline, svc, naive_bayes, knn, logistic_regression, xgboost]

    gsearchs = []
    best_models = []
    scores = []
    best_params = []
    cv_results = []

    print('---------------- MODEL TRAINING ----------------')

    for definition in model_list:

        # Works for both the definition classes and instances of them
        model_name = getattr(definition, '__name__', type(definition).__name__)
        print('\nModel: ', model_name)

        # Scale features to [0, 1] before the estimator
        pipe = Pipeline([('scaler', MinMaxScaler()), ('model', definition.model)])

        # Stratified k-fold cross-validation
        skf = StratifiedKFold(n_splits=5)

        gsearch = BayesSearchCV(pipe, cv=skf, search_spaces=definition.parameters,
                                n_jobs=10, scoring='f1', verbose=True, refit=True,
                                n_iter=niter, random_state=random_state)

        gsearch.fit(X, y)

        gsearchs.append(gsearch)
        best_models.append(gsearch.best_estimator_)
        scores.append(gsearch.best_score_)
        best_params.append(gsearch.best_params_)
        cv_results.append(gsearch.cv_results_)

        # best_score_ is the mean CV value of the scoring metric, i.e. F1 (not accuracy)
        print('CV F1 score for model {0}: {1}'.format(model_name, gsearch.best_score_))

    return gsearchs,best_models,scores,best_params,cv_results

In [253]:
# Tune four of the model definitions on the training split only (baseline and
# logistic_regression are left out); 100 is the number of search iterations per model.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so the
# run recorded in this notebook did not finish.
gsearchs,best_models,scores,best_params,cv_results = build_model(X_train,y_train,100,model_list=[svc, naive_bayes, knn, xgboost])


---------------- MODEL TRAINING ----------------

Model:  svc
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


KeyboardInterrupt: 

## Performance Assessment

In [None]:
# Function to evaluate models
def evaluate_model(X_test, X_train,y_train, y_test, best_models):
    """Score fitted models on a hold-out set and pick the most accurate one.

    Parameters
    ----------
    X_test, y_test :
        Hold-out features and true labels used for every metric.
    X_train, y_train :
        Not used; kept so existing calls keep working.
    best_models : sequence
        Fitted estimators exposing ``predict``. Pipelines with a step named
        'model' are reported under that step's class name.

    Returns
    -------
    tuple
        ``(acc_results, f1_results, maxi_model)``: per-model accuracy, per-model
        array of per-class F1 scores, and the model with the highest accuracy
        (the first one on ties).

    Raises
    ------
    ValueError
        If ``best_models`` is empty.
    """

    def _display_name(estimator):
        # Report the final 'model' step of a pipeline, otherwise the estimator itself
        steps = getattr(estimator, 'named_steps', None)
        if steps is not None and 'model' in steps:
            estimator = steps['model']
        return type(estimator).__name__

    best_models = list(best_models)
    if not best_models:
        raise ValueError('best_models is empty: there is nothing to evaluate')

    acc_results = []
    f1_results = []

    print('---------------- MODEL EVALUATION ----------------')

    for model_counter, fitted in enumerate(best_models, start=1):

        print('\nModel #{0}: {1}'.format(model_counter, _display_name(fitted)))

        # Predict
        y_pred = fitted.predict(X_test)

        # Evaluation metrics (average=None -> one value per class)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        f1 = metrics.f1_score(y_test, y_pred, average=None)
        precision = precision_score(y_test, y_pred, average=None)
        recall = recall_score(y_test, y_pred, average=None)

        # ROC AUC from hard class predictions (no probabilities available for
        # every model), so it is a coarse estimate. Undefined cases, such as a
        # single class in y_test, are reported as NaN instead of a fake value.
        try:
            roc_auc = roc_auc_score(y_test, y_pred)
        except ValueError:
            roc_auc = float('nan')

        acc_results.append(accuracy)
        f1_results.append(f1)

        print('Accuracy: ',accuracy)
        print('Average F1 Score: ',f1.mean())
        print('F1 Score by Class: ',f1)
        print('Average Precision: ',precision.mean())
        print('Precision by Class: ',precision)
        print('Average Recall: ',recall.mean())
        print('Recall by Class: ',recall)
        print('ROC AUC score: ',roc_auc)
        print('\nConfusion Matrix:\n',confusion_matrix(y_test, y_pred))
        print('\nClassification Report:\n',classification_report(y_test, y_pred))

    # Get best model based on accuracy score (first one wins on ties)
    maxi_model_nb = acc_results.index(max(acc_results))
    maxi_model = best_models[maxi_model_nb]

    print('\nBest Model: #{0} {1}'.format(maxi_model_nb+1, _display_name(maxi_model)))

    return acc_results,f1_results, maxi_model

In [None]:
# Score the tuned models on the hold-out split and keep the one with the highest
# accuracy. X_train / y_train are passed because the signature asks for them;
# evaluate_model does not read them.
acc_results,f1_results,maxi_model = evaluate_model(X_test, X_train, y_train, y_test, best_models)

## Submission

In [None]:
# Save best model to disk
# maxi_model is the fitted pipeline (scaler + model) selected by evaluate_model;
# it can be restored with joblib.load(filename). Only load such files from a
# trusted source, since they are pickle-based.
filename = 'final_model.sav'
joblib.dump(maxi_model, filename)

In [None]:
# Predict on the submission features, aligned to X_val's row index
y_pred = maxi_model.predict(X_val)
df_pred = pd.DataFrame(y_pred, index=X_val.index, columns=['Spy'])

# One row per ID_ORIGINAL with its predicted 'Spy' label, written to CSV
df_submission = (
    X_val[['ID_ORIGINAL']]
    .merge(df_pred, how='inner', left_index=True, right_index=True)
    .set_index('ID_ORIGINAL')
)
df_submission.to_csv('submission.csv')
df_submission

In [None]:
# Score: 0.71162