In [25]:
import pandas as pd
import seaborn # this changes matplotlib defaults to make the graphs look cooler!
import pickle 
from sklearn.cross_validation import train_test_split
from sklearn import svm
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report, roc_curve, precision_recall_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, scale
import matplotlib.pyplot as plt
import numpy as np

# load contracts data
contracts = pd.read_csv('../Data/Procurements/awards_with_features.csv', index_col=0, low_memory=False)

# load network features
network_country = pd.read_csv('../Data/Procurements/Historic_and_Major_awards_network_features_Country2.csv', index_col=0, low_memory=False)
network_global = pd.read_csv('../Data/Procurements/Historic_and_Major_awards_network_features.csv', index_col=0, low_memory=False)

# kick out contracts with no supplier to match
contracts = contracts[contracts['canonical_name'].notnull()]

# load investigations
investigations = pd.read_csv('../Data/Investigations/investigations.csv', index_col=0, low_memory=False)
investigations['guilty_or_unknown'] = \
  np.logical_and(  investigations.outcome_of_overall_investigation_when_closed.notnull(), \
                investigations.outcome_of_overall_investigation_when_closed != 'Unfounded') 

# this should assign True = Investigated, False = Otherwise

# group by canonical_name and country to remove duplicates
def reduce_to_one(my_list):
    return my_list.unique()

aggregations = {
    'guilty_or_unknown':'sum',
    'unique_id': reduce_to_one
}
investigations = investigations.groupby(by=['canonical_name', 'country'], as_index=False).agg(aggregations)

# drop investigations that where outcome of overall investigation is Unfounded or missing
investigations = investigations[investigations['guilty_or_unknown'] > 0]

In [26]:
net_global_features = network_global.columns[78:].tolist()
net_global_features.extend(['unique_id'])
network_global = network_global[net_global_features]

net_country_features = network_country.columns[4:].tolist()
network_country = network_country[net_country_features]

In [44]:
# create full data set
df = pd.merge(left=contracts,
                   right=investigations,  
                   left_on=['canonical_name', 'buyer_country'], #, 'fiscal_year'],
                   right_on=['canonical_name', 'country'], #, 'fy_complaint_opened'],
                   how='left') # this makes sure that we keep all procuremenets, whether they have a matching investigation or not

df.rename(columns={'unique_id_x':'unique_id_contracts', 'unique_id_y':'unique_id_invests'}, inplace=True)

df = df.merge(right=network_global,
              left_on='unique_id_contracts',
              # left_on='unique_id',
              right_on='unique_id')

df = df.merge(right=network_country,
              left_on='unique_id_contracts',
              right_on='unique_id')
del df['unique_id_x']
del df['unique_id_y']
del df['country']
df['overlap'] = df['unique_id_invests'].notnull()

In [50]:
features = df.columns.tolist()

remove_list = [
    'buyer',
    #'buyer_country',
    #'project_id',
    #'unique_id_contracts',
    'unique_id_invests',
    #'major_sector_clean',
    #'canonical_name',
    'guilty_or_unknown',
    'Supplier_Average_Distance_Investigated_Suppliers_Contemporary_Global',
    'Project_Average_Distance_Investigated_Suppliers_Contemporary_Global',
    'Supplier_Average_Distance_Investigated_Suppliers_Cumulative_Global',
    'Project_Average_Distance_Investigated_Suppliers_Cumulative_Global',
    'Supplier_Average_Distance_Investigated_Projects_Contemporary_Global',
    'Project_Average_Distance_Investigated_Projects_Contemporary_Global',
    'Supplier_Average_Distance_Investigated_Projects_Cumulative_Global',
    'Project_Average_Distance_Investigated_Projects_Cumulative_Global',
    'Supplier_Average_Distance_Investigated_Suppliers_Contemporary_Country',
    'Project_Average_Distance_Investigated_Suppliers_Contemporary_Country',
    'Supplier_Average_Distance_Investigated_Suppliers_Cumulative_Country',
    'Project_Average_Distance_Investigated_Suppliers_Cumulative_Country',
    'Supplier_Average_Distance_Investigated_Projects_Contemporary_Country',
    'Project_Average_Distance_Investigated_Projects_Contemporary_Country',
    'Supplier_Average_Distance_Investigated_Projects_Cumulative_Country',
    'Project_Average_Distance_Investigated_Projects_Cumulative_Country',
    'Supplier_Degree_Centrality_Contemporary_Country',
    'Supplier_Degree_Centrality_Cumulative_Country',
    'Supplier_Degree_Centrality_Contemporary_Global',
    'Supplier_Degree_Centrality_Cumulative_Global'
]
for feature in remove_list:
    features.remove(feature)

In [51]:
#remove project-supplier and supplier-project features
remove_list = []
for col in features:
    if 'Project' in col and 'Supplier' in col:
       remove_list.append(col)

for feature in remove_list: 
    features.remove(feature)

#remove all "distance to investigated" features
remove_list = []
for col in features:
    if 'Investigated' in col:
       remove_list.append(col)

for feature in remove_list: 
    features.remove(feature)

df2 = df[features]


# Select features
features = df2.columns.tolist()
remove_list = [
    'buyer_country',
    'project_id',
    'unique_id_contracts',
    'major_sector_clean',
    'canonical_name'
]

for feature in remove_list: 
    features.remove(feature)

zero_nan_cols = [
    'Supplier_Neighbor_Intensity_Contemporary_Global',
    'Project_Neighbor_Intensity_Contemporary_Global',
    'Supplier_Neighbor_Intensity_Cumulative_Global',
    'Project_Neighbor_Intensity_Cumulative_Global',
    'Supplier_Neighbor_Intensity_Contemporary_Country',
    'Project_Neighbor_Intensity_Contemporary_Country',
    'Supplier_Neighbor_Intensity_Cumulative_Country',
    'Project_Neighbor_Intensity_Cumulative_Country'
]

df2[zero_nan_cols] = df2[zero_nan_cols].fillna(value=0)
print df2.columns

Index([u'award_amount_usd', u'buyer_country', u'competitive', u'fiscal_year', u'number_of_bids', u'project_id', u'unique_id_contracts', u'major_sector_clean', u'canonical_name', u'business_disclosure_index_nearest', u'firms_competing_against_informal_firms_perc_nearest', u'payments_to_public_officials_perc_nearest', u'do_not_report_all_sales_perc_nearest', u'legal_rights_index_nearest', u'time_to_enforce_contract_nearest', u'bribes_to_tax_officials_perc_nearest', u'property_rights_rule_governance_rating_nearest', u'transparency_accountability_corruption_rating_nearest', u'gdp_per_capita_nearest', u'primary_school_graduation_perc_nearest', u'gini_index_nearest', u'unemployment_perc_nearest', u'gdp_per_capita_perc_change_1', u'gdp_per_capita_perc_change_5', u'gdp_per_capita_mean', u'number_of_sectors_CY', u'number_of_suppliers_CY', u'perc_competitive_CY', u'number_of_buyers_CY', u'number_of_bids_CY', u'number_of_projects_CY', u'number_of_contracts_CY', u'award_amount_usd_CY', u'number_of

In [75]:
mean_nan_cols = df2[features].columns.tolist()
for col in mean_nan_cols:
    mean = df2[col].mean()
    df2[col] = df2[col].fillna(value=mean)
    
#for col in df2[features].columns:
#    df2[col] = df2[col].astype()

for col in df2[features].columns:
    df2[col][np.isinf(df2[col])] = -1
    df2[col].replace(-1, max(df2[col])*1.1, inplace=True)

NameError: name 'float64' is not defined

In [55]:
df=df2

In [73]:
from random import sample
random_inds = sample(df[~(df['overlap'].astype(bool))].index.values, 4000)
df_nonoverlap_random = df[features].ix[random_inds]
df_overlap = df[features][df['overlap'].astype(bool)]
full_data = df_overlap.append(df_nonoverlap_random, ignore_index=True)

# Sort by year
full_data = full_data.sort('fiscal_year')
del full_data['fiscal_year']
labels = full_data['overlap']
del full_data['overlap']

for col in full_data.columns:
    full_data[col][ np.isinf(full_data[col])] = -1
    full_data[col].replace(-1, max(full_data[col])*1.1, inplace=True)

# df_overlap.shape
X_train, X_test = full_data[:-2000].as_matrix(), full_data[-2000:].as_matrix()
y_train, y_test = labels[:-2000].as_matrix(), labels[-2000:].as_matrix()




In [74]:
# #Random Forest
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = [{
#     'max_features':[None, 'auto'],
#     'max_depth':[None, sqrt(len(features))],
    'n_estimators': [100] # of these, the more the better
}] 

scores = ['roc_auc']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print('---------------')
    rf_clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring=score) # cross-validation takes a while!!!
    rf_clf.fit(X_train, y_train)
    print("Best parameters set found on training set:")
    print(rf_clf.best_estimator_)
    print('---')
    
    print("Grid scores on development set:")
    for params, mean_score, scores in rf_clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
    print('---')

    print("Detailed classification report:")
    print(classification_report(y_test, rf_clf.predict(X_test)))


y_pred = rf_clf.predict(X_test)

pd.Series(y_pred).value_counts()


for ind in argsort(rf_clf.best_estimator_.feature_importances_)[::-1]:
    print '%0.2f' % rf_clf.best_estimator_.feature_importances_[ind], features[ind]


feature_importances = []
for ind in argsort(rf_clf.best_estimator_.feature_importances_)[::-1]:
    feature_importances.append([features[ind], rf_clf.best_estimator_.feature_importances_[ind]])
feature_importances = pd.DataFrame(feature_importances)
feature_importances.columns = ['feature','importance']

percent_missing = pd.DataFrame(percent_missing)
percent_missing.columns = ['feature', 'percent_missing']

feature_df = pd.merge(feature_importances, percent_missing, on='feature', how='left')


scatter(feature_df['importance'], feature_df['percent_missing'])


feature_importances.shape


# Generate ROC curve
# Compute ROC curve and area under the curve
rf_probs = rf_clf.best_estimator_.predict_proba(X_test)
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf_probs[:, 1])
rf_roc_auc = auc(rf_fpr, rf_tpr)

# Compute ROC curve and area the curve
# svm_probs = svm.best_estimator_.predict_proba(X_train)
# svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_train, svm_probs[:, 1])
# svm_roc_auc = auc(svm_fpr, svm_tpr)
# rf_probs = rf.best_estimator_.predict_proba(X_train)
# rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_train, rf_probs[:, 1])
# rf_roc_auc = auc(rf_fpr, rf_tpr)

# Plot ROC curve
plt.clf()
plt.plot(rf_fpr, rf_tpr, label='Random Forest (auc = %0.2f)' % rf_roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('% Uninvestigated Contracts Marked as "Should be investigated"')
plt.ylabel('% Investigated Contracts Caught by Model')
plt.title('True Positive Rate vs. False Positive Rate')
plt.legend(loc="lower right")
plt.show()


# Generate precision-recall curve
rf_probs = rf_clf.best_estimator_.predict_proba(X_test)
rf_precision, rf_recall, rf_thresholds = precision_recall_curve(y_test, rf_probs[:, 1])
#rf_PR_auc = auc(rf_precision, rf_recall, reorder=True)

# Compute ROC curve and area the curve
# svm_probs = svm.best_estimator_.predict_proba(X_train)
# svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_train, svm_probs[:, 1])
# svm_roc_auc = auc(svm_fpr, svm_tpr)
# rf_probs = rf.best_estimator_.predict_proba(X_train)
# rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_train, rf_probs[:, 1])
# rf_roc_auc = auc(rf_fpr, rf_tpr)

# Plot precision_recall curve
plt.clf()
plt.plot( rf_recall, rf_precision, label='Random Forest')
plt.plot([1, 0], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.1])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.show()



# Tuning hyper-parameters for roc_auc
---------------


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').