## Modeling Survival on the Titanic

In [832]:
import pandas as pd
import numpy as np
import re
import pickle

In [833]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.graph_objs import *

In [834]:
from sklearn import svm
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [835]:
# Define the color palette 
Viridis=[
"#440154", "#440558", "#450a5c", "#450e60", "#451465", "#461969",
"#461d6d", "#462372", "#472775", "#472c7a", "#46307c", "#45337d",
"#433880", "#423c81", "#404184", "#3f4686", "#3d4a88", "#3c4f8a",
"#3b518b", "#39558b", "#37598c", "#365c8c", "#34608c", "#33638d",
"#31678d", "#2f6b8d", "#2d6e8e", "#2c718e", "#2b748e", "#29788e",
"#287c8e", "#277f8e", "#25848d", "#24878d", "#238b8d", "#218f8d",
"#21918d", "#22958b", "#23988a", "#239b89", "#249f87", "#25a186",
"#25a584", "#26a883", "#27ab82", "#29ae80", "#2eb17d", "#35b479",
"#3cb875", "#42bb72", "#49be6e", "#4ec16b", "#55c467", "#5cc863",
"#61c960", "#6bcc5a", "#72ce55", "#7cd04f", "#85d349", "#8dd544",
"#97d73e", "#9ed93a", "#a8db34", "#b0dd31", "#b8de30", "#c3df2e",
"#cbe02d", "#d6e22b", "#e1e329", "#eae428", "#f5e626", "#fde725"]
# source: https://bhaskarvk.github.io/colormap/reference/colormap.html

### Read in the titanic dataset

In [836]:
df = pd.read_csv('resources/pax_20_02_2018_1_CSV.csv')
# df = pd.read_csv("https://raw.githubusercontent.com/austinlasseter/plotly_dash_tutorial/master/00%20resources/titanic.csv")
print(df.shape)
df.head()

(1518, 240)


Unnamed: 0,Con,Contp,Reg,AgtId,Agt,Dat,Status,Lgt,Agtp,Stage,...,TjRep,TjRSym,TjRMa,TjNR,ImUN,ImOth,ImRef,ImPK,ImE,ImSrc
0,Afghanistan,Government,Europe and Eurasia,864,Tokyo Declaration Partnership for Self-Relianc...,08/07/2012,Multiparty signed/agreed,14,InterIntra,Imp,...,0,0,0,1,0,0,0,0,1,1
1,Afghanistan,Government,Europe and Eurasia,848,Conclusions of the Conference on Afghanistan a...,05/12/2011,Multiparty signed/agreed,7,InterIntra,Pre,...,0,0,0,1,1,0,0,0,0,1
2,Afghanistan,Government,Europe and Eurasia,849,Istanbul Process on Regional Security and Coop...,02/11/2011,Multiparty signed/agreed,10,InterIntra,Imp,...,0,0,0,1,0,0,0,0,0,1
3,Afghanistan,Government,Europe and Eurasia,709,Renewed Commitment by the Afghan Government to...,22/07/2010,Multiparty signed/agreed,9,InterIntra,Imp,...,0,0,0,1,0,0,0,1,1,1
4,Afghanistan,Government,Europe and Eurasia,708,The Resolution Adopted at the Conclusion of th...,06/06/2010,Multiparty signed/agreed,4,Intra,Pre,...,0,0,0,1,0,0,0,0,0,1


### Features

In [837]:
# Make some dummies for Imp.
df['Imp']=df['Stage'].map({'Pre ':0,'SubPar':0,'Imp':1,'Cea':0,'SubComp':0,'Ren':0, 'Oth':0})


In [838]:
df.head()

Unnamed: 0,Con,Contp,Reg,AgtId,Agt,Dat,Status,Lgt,Agtp,Stage,...,TjRSym,TjRMa,TjNR,ImUN,ImOth,ImRef,ImPK,ImE,ImSrc,Imp
0,Afghanistan,Government,Europe and Eurasia,864,Tokyo Declaration Partnership for Self-Relianc...,08/07/2012,Multiparty signed/agreed,14,InterIntra,Imp,...,0,0,1,0,0,0,0,1,1,1.0
1,Afghanistan,Government,Europe and Eurasia,848,Conclusions of the Conference on Afghanistan a...,05/12/2011,Multiparty signed/agreed,7,InterIntra,Pre,...,0,0,1,1,0,0,0,0,1,
2,Afghanistan,Government,Europe and Eurasia,849,Istanbul Process on Regional Security and Coop...,02/11/2011,Multiparty signed/agreed,10,InterIntra,Imp,...,0,0,1,0,0,0,0,0,1,1.0
3,Afghanistan,Government,Europe and Eurasia,709,Renewed Commitment by the Afghan Government to...,22/07/2010,Multiparty signed/agreed,9,InterIntra,Imp,...,0,0,1,0,0,0,1,1,1,1.0
4,Afghanistan,Government,Europe and Eurasia,708,The Resolution Adopted at the Conclusion of th...,06/06/2010,Multiparty signed/agreed,4,Intra,Pre,...,0,0,1,0,0,0,0,0,1,


In [839]:
# Check for missing values as they will skew the regression
print(df.shape)

(1518, 241)


In [840]:
data = df[['Agt','Imp','TjAm','TjRSym','Ce']]

In [841]:
# What are the possible features?
df = pd.DataFrame(data, columns=['Agt','Imp','TjAm','TjRSym','Ce'])
#testdf=pd.DataFrame(X_scaled, columns=[1,2,3])
df

Unnamed: 0,Agt,Imp,TjAm,TjRSym,Ce
0,Tokyo Declaration Partnership for Self-Relianc...,1.0,0,0,0
1,Conclusions of the Conference on Afghanistan a...,,0,0,0
2,Istanbul Process on Regional Security and Coop...,1.0,0,0,0
3,Renewed Commitment by the Afghan Government to...,1.0,0,0,0
4,The Resolution Adopted at the Conclusion of th...,,2,0,0
5,Communiqué of the Conference on Afghan Leaders...,1.0,0,0,0
6,Statement of the International Conference on A...,0.0,0,0,0
7,Declaration of the Special Conference on Afgha...,1.0,0,0,0
8,Declaration of the International Conference in...,1.0,0,0,0
9,Rome Conference on Justice and Rule of Law in ...,1.0,0,0,0


In [842]:
df['Ce'].value_counts()

0    823
2    363
1    196
3    136
Name: Ce, dtype: int64

In [843]:
df = df.rename(columns={'Agt':'Agreement','Imp': 'Implementation', 'TjAm':'Amnesty','TjRSym':'SymbolicReparation','Ce':'Ceasefire'})
df.isnull().sum()
df.dropna(inplace=True)
df['Implementation']

0       1.0
2       1.0
3       1.0
5       1.0
6       0.0
7       1.0
8       1.0
9       1.0
10      0.0
11      1.0
12      0.0
13      1.0
14      0.0
16      0.0
17      0.0
18      0.0
19      0.0
20      1.0
21      0.0
22      0.0
25      0.0
26      1.0
27      1.0
30      1.0
31      0.0
32      0.0
33      0.0
34      0.0
35      0.0
36      0.0
       ... 
1483    0.0
1484    0.0
1485    0.0
1486    0.0
1487    0.0
1488    0.0
1489    0.0
1490    0.0
1491    0.0
1492    1.0
1493    0.0
1494    0.0
1496    1.0
1499    0.0
1500    1.0
1501    0.0
1503    0.0
1504    1.0
1505    1.0
1506    0.0
1507    1.0
1508    0.0
1509    0.0
1510    0.0
1511    0.0
1512    0.0
1513    0.0
1514    0.0
1515    0.0
1516    0.0
Name: Implementation, Length: 1051, dtype: float64

In [844]:
df['Implementation']=df['Implementation'].astype(np.int64)

In [845]:
# Select our features
X = df[['Amnesty', 'SymbolicReparation', 'Ceasefire']]
y = df['Implementation']

### Modeling

In [846]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=42)

In [847]:
X_train.columns

Index(['Amnesty', 'SymbolicReparation', 'Ceasefire'], dtype='object')

In [848]:
print(X_train['Amnesty'].value_counts())
print(X_train['Ceasefire'].value_counts())
print(X_train['SymbolicReparation'].value_counts())
print(y_train.value_counts())

0    691
2     89
3     31
1     29
Name: Amnesty, dtype: int64
0    446
2    187
3    105
1    102
Name: Ceasefire, dtype: int64
0    833
1      7
Name: SymbolicReparation, dtype: int64
0    611
1    229
Name: Implementation, dtype: int64


In [849]:
print(X_test['Amnesty'].value_counts())
print(X_test['Ceasefire'].value_counts())
print(X_test['SymbolicReparation'].value_counts())
print(y_test.value_counts())

0    176
2     22
1      8
3      5
Name: Amnesty, dtype: int64
0    106
2     56
3     26
1     23
Name: Ceasefire, dtype: int64
0    206
1      5
Name: SymbolicReparation, dtype: int64
0    152
1     59
Name: Implementation, dtype: int64


In [850]:
gnb = GaussianNB()
# Fit on the training data
gnb_model = gnb.fit(X_train, y_train)
# Predict on the testing data
predictions=gnb_model.predict(X_test)
probabilities = gnb_model.predict_proba(X_test)[:,1]
# Calculate the roc-auc score
auc_nb=metrics.roc_auc_score(y_test, predictions)
acc_nb = metrics.accuracy_score(y_test, predictions)
f1_nb = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_nb,4))
print('Accuracy', "%.4f" % round(acc_nb,4))
print('AUC Score', "%.4f" % round(auc_nb,4))

F1 Score 0.0317
Accuracy 0.7109
AUC Score 0.4986


In [851]:
knn = KNeighborsClassifier(n_neighbors=7)
# Fit on the training data
knn_model=knn.fit(X_train, y_train)
# Predict on the testing data
predictions=knn_model.predict(X_test)
probabilities = knn_model.predict_proba(X_test)[:,1]
# Calculate the roc-auc score
auc_knn=metrics.roc_auc_score(y_test, predictions)
acc_knn = metrics.accuracy_score(y_test, predictions)
f1_knn = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_knn,4))
print('Accuracy', "%.4f" % round(acc_knn,4))
print('AUC Score', "%.4f" % round(auc_knn,4))

F1 Score 0.0615
Accuracy 0.7109
AUC Score 0.5038


In [852]:
predictions[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [853]:
rf = RandomForestClassifier()
# Fit on the training data
rf_model=rf.fit(X_train, y_train)
# Predict on the testing data
predictions=rf_model.predict(X_test)
probabilities = rf_model.predict_proba(X_test)[:,1]
# Calculate the roc-auc score
auc_rf=metrics.roc_auc_score(y_test, predictions)
acc_rf = metrics.accuracy_score(y_test, predictions)
f1_rf = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_rf,4))
print('Accuracy', "%.4f" % round(acc_rf,4))
print('AUC Score', "%.4f" % round(auc_rf,4))

F1 Score 0.0323
Accuracy 0.7156
AUC Score 0.5019



The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.



In [854]:
logreg = LogisticRegression()
# Fit on the training data
log_model=logreg.fit(X_train, y_train)
# Predict on the testing data
predictions=log_model.predict(X_test)
probabilities = log_model.predict_proba(X_test)[:,1]
# Calculate the roc-auc score
auc_log=metrics.roc_auc_score(y_test, predictions)
acc_log = metrics.accuracy_score(y_test, predictions)
f1_log = metrics.f1_score(y_test, predictions)
# Display
print('F1 Score', "%.4f" % round(f1_log,4))
print('Accuracy', "%.4f" % round(acc_log,4))
print('AUC Score', "%.4f" % round(auc_log,4))

F1 Score 0.0000
Accuracy 0.7204
AUC Score 0.5000





F-score is ill-defined and being set to 0.0 due to no predicted samples.



In [855]:
predictions[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [856]:
list(y_test[:10])

[1, 0, 1, 0, 0, 0, 0, 1, 1, 1]

### Comparison of Four Models

In [857]:
# create lists from the metrics we produced.
f1=[f1_nb, f1_log, f1_knn, f1_rf]
acc=[acc_nb, acc_log, acc_knn, acc_rf]
auc=[auc_nb, auc_log, auc_knn, auc_rf]
# Define a function that will round our metrics.
def rounder(metric):
    scores_list=[]
    for score in metric:
        scores_list.append(round(float(score*100),1))
    return scores_list
# Apply it to each of the three lists.
f1_scores=rounder(f1)
acc_scores=rounder(acc)
auc_scores=rounder(auc)
score_types=['F1 score', 'Accuracy', 'AUC score']

In [858]:
# Comparison of model metrics
models=['naive bayes', 'logistic regression', 'k-nearest neighbors', 'random forest']
index=['F1 score', 'Accuracy', 'AUC score']
compare_models=pd.DataFrame([f1_scores, acc_scores, auc_scores], index=index, columns=models)
# save to pickle, for later use by plotly dash app.
compare_models.to_pickle('resources/compare_models.pkl')

In [859]:
# Let's display that with plotly.
mydata1 = go.Bar(
    x=compare_models.loc['F1 score'].index,
    y=compare_models.loc['F1 score'],
    name=compare_models.index[0],
    marker=dict(color=Viridis[50])
)
mydata2 = go.Bar(
    x=compare_models.loc['Accuracy'].index,
    y=compare_models.loc['Accuracy'],
    name=compare_models.index[1],
    marker=dict(color=Viridis[30])
)
mydata3 = go.Bar(
    x=compare_models.loc['AUC score'].index,
    y=compare_models.loc['AUC score'],
    name=compare_models.index[2],
    marker=dict(color=Viridis[10])
)
mylayout = go.Layout(
    title='Comparison of Possible Models',
    xaxis = dict(title = 'Predictive models'), # x-axis label
    yaxis = dict(title = 'Score'), # y-axis label
    
)
fig = go.Figure(data=[mydata1, mydata2, mydata3], layout=mylayout)
iplot(fig)

## Tuning the Logistic Classifier
Note: The gridsearch step is included here for completeness sake, as this is a smart inclusion in any iteration of possible models. But for the sake of speed (this notebook was run multiple times during development) I've kept my gridsearch to a bare-bones placeholder. A more complete project would use a broader grid.

In [860]:
# Create regularization penalty space (l1=ridge, l2=lasso)
penalty = ['l1', 'l2'] 

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# Create grid search using 5-fold cross validation
grid_lr = GridSearchCV(LogisticRegression(), hyperparameters, cv=5,  n_jobs = 1, verbose=0)
grid_lr.fit(X_train, y_train)















































































































































































































GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=1,
             param_grid={'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
       5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
       3.59381366e+03, 1.00000000e+04]),
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
      

In [861]:
print(grid_lr.best_params_)

log_model = grid_lr

{'C': 1.0, 'penalty': 'l1'}


In [862]:
# Predict on the testing data

predictions=log_model.predict(X_test)
probabilities = log_model.predict_proba(X_test)[:,1]

In [863]:
# Pickle the final model for use in the plotly dash app.
file = open('resources/final_logreg_model.pkl', 'wb')
pickle.dump(log_model, file)
file.close()

## Final Model Metrics

In [864]:
# Full list of metrics
def model_metrics(y_test, predictions):
    '''
    Calculate 5 standard model metrics
    Return a dictionary with the metrics
    '''
    f1 = metrics.f1_score(y_test, predictions)
    accuracy = metrics.accuracy_score(y_test, predictions)
    error = 1 - accuracy
    precision = metrics.precision_score(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    rocauc =  metrics.roc_auc_score(y_test, predictions)
    return {'precision': precision, 'recall': recall,'f1 score':f1, 'accuracy': accuracy, 'error rate': error,  'ROC-AUC': rocauc}

eval_scores=model_metrics(y_test, predictions)
eval_scores


F-score is ill-defined and being set to 0.0 due to no predicted samples.


Precision is ill-defined and being set to 0.0 due to no predicted samples.



{'precision': 0.0,
 'recall': 0.0,
 'f1 score': 0.0,
 'accuracy': 0.7203791469194313,
 'error rate': 0.2796208530805687,
 'ROC-AUC': 0.5}

In [865]:
# Round the y values.
y_vals=[]
for val in list(eval_scores.values()):
    y_vals.append(round(float(val*100),1))
y_vals    
# Write over the previous dictionary with the rounded values.
eval_scores=dict(zip(eval_scores.keys(), y_vals))
print(eval_scores)

{'precision': 0.0, 'recall': 0.0, 'f1 score': 0.0, 'accuracy': 72.0, 'error rate': 28.0, 'ROC-AUC': 50.0}


In [866]:
# Now save that dictionary to a pickle file, for later use in plotly dash app
file = open('resources/eval_scores.pkl', 'wb')
pickle.dump(eval_scores, file)
file.close()

In [867]:
# And here's a reminder of how to read that back in again, just in case this is unfamiliar:
file = open('resources/eval_scores.pkl', 'rb')
evals=pickle.load(file)
file.close()
evals

{'precision': 0.0,
 'recall': 0.0,
 'f1 score': 0.0,
 'accuracy': 72.0,
 'error rate': 28.0,
 'ROC-AUC': 50.0}

In [868]:
# Convert that into a visualization.
mydata = [go.Bar(
    x=list(evals.keys()),
    y=list(evals.values()),
    marker=dict(color=Viridis[::12])
)]

mylayout = go.Layout(
    title='Evaluation Metrics for Logistic Regression Model',
    xaxis = {'title': 'Metrics'},
    yaxis = {'title': 'Percent'}, 

)
fig = go.Figure(data=mydata, layout=mylayout)
iplot(fig)

In [869]:
FPR, TPR, _ = roc_curve(y_test, probabilities)
FPR

array([0.        , 0.34210526, 0.36842105, 0.43421053, 0.52631579,
       0.53947368, 0.55921053, 0.80921053, 0.81578947, 0.82236842,
       0.84868421, 0.96710526, 0.97368421, 0.99342105, 1.        ])

In [870]:
roc_score=round(100*roc_auc_score(y_test, predictions),1)
roc_score

50.0

In [871]:
# Pickle everything we need to reproduce the ROC-AUC figure in plotly dash.
file = open('resources/FPR.pkl', 'wb')
pickle.dump(FPR, file)
file.close()

file = open('resources/TPR.pkl', 'wb')
pickle.dump(TPR, file)
file.close()

file = open('resources/y_test.pkl', 'wb')
pickle.dump(y_test, file)
file.close()

file = open('resources/predictions.pkl', 'wb')
pickle.dump(predictions, file)
file.close()

In [872]:
# ROC-AUC figure

roc_score=round(100*roc_auc_score(y_test, predictions),1)
trace0=go.Scatter(
        x=FPR, 
        y=TPR,
        mode='lines',
        name=f'AUC: {roc_score}',
        marker=dict(color=Viridis[10])
        )
trace1=go.Scatter(
        x=[0,1], 
        y=[0,1],
        mode='lines',
        name='Baseline Area: 50.0',
    marker=dict(color=Viridis[50])
        )
layout=go.Layout(
    title='Receiver Operating Characteristic (ROC): Area Under Curve',
    xaxis={'title': 'False Positive Rate (100-Specificity)','scaleratio': 1,'scaleanchor': 'y'},
    yaxis={'title': 'True Positive Rate (Sensitivity)'}
    )
data=[trace0, trace1]
fig = dict(data=data, layout=layout)
iplot(fig)

In [873]:
len(y_test)

211

In [874]:
# A confusion matrix tells us our false positives and false negatives:
matrix=confusion_matrix(y_test, predictions)
print(matrix)
cm=pd.DataFrame(matrix, columns=['pred: Not Implemented', 'pred: Implemented'])
cm[f'n={len(y_test)}']=['actual: Not Implemented', 'actual: Implemented']
cm=cm[[f'n={len(y_test)}', 'pred: Not Implemented', 'pred: Implemented']]
cm

[[152   0]
 [ 59   0]]


Unnamed: 0,n=211,pred: Not Implemented,pred: Implemented
0,actual: Not Implemented,152,0
1,actual: Implemented,59,0


In [875]:
# Save cm dataframe to a pickle file, for later use in plotly dash app
cm.to_pickle('resources/confusion_matrix.pkl')

In [876]:
# Display the confusion matrix as a formatted table with Plotly
trace = go.Table(
    header=dict(values=cm.columns,
                line = dict(color='#7D7F80'),
                fill = dict(color=Viridis[55]),
                align = ['left'] * 5),
    cells=dict(values=[cm[f'n={len(y_test)}'], cm['pred: Not Implemented'], cm['pred: Implemented']],
               line = dict(color='#7D7F80'),
               fill = dict(color='white'),
               align = ['left'] * 5))

layout = go.Layout(
    title = f'Confusion Matrix: Logistic Regression Model (Testing Dataset)'
)

data = [trace]
fig = dict(data=data, layout=layout)
iplot(fig)

In [877]:
# Feature importance (Logistic Regression)
coeffs1=pd.DataFrame(list(zip(list(X_train.columns), logreg.coef_[0])), columns=['feature', 'coefficient'])
coeffs=coeffs1.sort_values(by='coefficient', ascending=False)

# Format the coefficients.
y_vals=[]
for val in list(coeffs['coefficient']):
    y_vals.append(round(float(val),2))
y_vals

coeffs['coefficient']=y_vals
coeffs

Unnamed: 0,feature,coefficient
1,SymbolicReparation,0.25
0,Amnesty,-0.28
2,Ceasefire,-0.62


In [878]:
# save the results to a csv file, for later use by plotly dash app.
coeffs.to_pickle('resources/coefficients.pkl')

In [879]:
# Let's display that with Plotly.
mydata = [go.Bar(
    x=coeffs['feature'],
    y=coeffs['coefficient'],
    marker=dict(color=Viridis[::-6])
)]

mylayout = go.Layout(
    title='Peaceagreement including amnesty and ceasefire had less odds of failure',
    xaxis = {'title': 'Passenger Features'},
    yaxis = {'title': 'Odds of Failure of Implementation'}, 

)
fig = go.Figure(data=mydata, layout=mylayout)
iplot(fig)

In [880]:
print(len(probabilities))
print(len(predictions))
print(len(y_test))
print(len(X_test))

211
211
211
211


In [881]:
X_test1=X_test1.reset_index(drop=True)
y_test=y_test.reset_index(drop=True)
probs=pd.DataFrame(probabilities, columns=['Failure_prob'])

In [882]:
# Merge back in the names
final=pd.concat([df['Agreement'],X_test1, y_test, probs], axis=1)

In [883]:
final.shape

(1110, 6)

In [884]:
final.head()

Unnamed: 0,Agreement,Amnesty,SymbolicReparation,Ceasefire,Implementation,Failure_prob
0,Tokyo Declaration Partnership for Self-Relianc...,0.0,0.0,0.0,1.0,0.159052
1,,0.0,0.0,0.0,0.0,0.159052
2,Istanbul Process on Regional Security and Coop...,0.0,0.0,0.0,1.0,0.260937
3,Renewed Commitment by the Afghan Government to...,0.0,0.0,0.0,0.0,0.397257
4,,0.0,0.0,0.0,0.0,0.397257


In [885]:
final.to_csv('resources/final_probs.csv', index=False)

In [886]:
list(final.columns)

['Agreement',
 'Amnesty',
 'SymbolicReparation',
 'Ceasefire',
 'Implementation',
 'Failure_prob']

In [887]:
#mydata=final.drop(['Survived', 'survival_prob'], axis=1)
#table=[go.Table(
       # header=dict(values=list(mydata.columns)),
      #  cells=dict(values=list(mydata.loc[5])))]
#iplot(table)

## Exploring individual predictions

In [888]:
value='Sharp, Mr. Percival James R'
Failure=final.loc[3, 'Failure_prob']
Failure

0.39725669547173537

In [889]:
names=df['Agreement'].values
names[:5]

array(['Tokyo Declaration Partnership for Self-Reliance in Afghanistan from Transition to Transformation (Tokyo Conference)',
       'Istanbul Process on Regional Security and Cooperation for a Secure and Stable Afghanistan',
       'Renewed Commitment by the Afghan Government to the Afghan People and the International Community to Afghanistan (Kabul Conference Communique)',
       'Communiqué of the Conference on Afghan Leadership, Regional Cooperation, International Partnership (London Conference Communique)',
       'Statement of the International Conference on Afghanistan (Hague Conference)'],
      dtype=object)

In [890]:
indexs=df['Agreement'].index.values
indexs[:5]

array([0, 2, 3, 5, 6])

In [891]:
names=df['Agreement'].values
index=df['Agreement'].index.values
nameslist = list(zip(indexs, names))
print(nameslist[5])
print(nameslist[5][0])
print(nameslist[5][1])

(7, 'Declaration of the Special Conference on Afghanistan Convened under the Auspices of the Shanghai Cooperation Organization (Moscow Declaration)')
7
Declaration of the Special Conference on Afghanistan Convened under the Auspices of the Shanghai Cooperation Organization (Moscow Declaration)


In [892]:
options=[{'label': k, 'value': i} for i,k in nameslist]
options[0]

{'label': 'Tokyo Declaration Partnership for Self-Reliance in Afghanistan from Transition to Transformation (Tokyo Conference)',
 'value': 0}

In [893]:
value=nameslist[0][0]
value

0

In [894]:
Failure=final.loc[value, 'Failure_prob']
round(Failure*100)

16.0

In [895]:
final.columns

Index(['Agreement', 'Amnesty', 'SymbolicReparation', 'Ceasefire',
       'Implementation', 'Failure_prob'],
      dtype='object')

## Predict on a single, individual row of data.

In [896]:
testset=final.drop(["Implementation", "Failure_prob", 'Agreement'], axis=1)
firstrow=testset.loc[0]
firstrow

Amnesty               0.0
SymbolicReparation    0.0
Ceasefire             0.0
Name: 0, dtype: float64

In [897]:
myarray=firstrow.values
myarray.shape

(3,)

In [898]:
thisarray=myarray.reshape((1, myarray.shape[0]))
thisarray.shape

(1, 3)

In [899]:
logreg.predict_proba(thisarray)

array([[0.60334306, 0.39665694]])

In [900]:
logreg.predict(thisarray)

array([0])

In [901]:
df.columns

Index(['Agreement', 'Implementation', 'Amnesty', 'SymbolicReparation',
       'Ceasefire'],
      dtype='object')

In [902]:
df['age2028']=np.where((df.age>=20)&(df.age<28))
df['age2838']=np.where((df.age>=28)&(df.age<38))
df['age3880']=np.where((df.age>=38)&(df.age<80))

AttributeError: 'DataFrame' object has no attribute 'age'