## Classification model for predicting funded status - SVM

In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from sklearn import linear_model, svm, metrics
from sklearn.utils.class_weight import compute_sample_weight

# Load the raw data and keep only the four predictor columns plus the
# 'Funded' target, in the order listed here.
keep_cols = ['CTOnum', 'FemNum', 'TeamSolo', 'dist', 'Funded']
df = pd.read_csv('filelocation')[keep_cols]

## Training & Test Set

In [5]:
# Random 60/40 train/test split.
percenttrain = 0.6
numitems = len(df)
numtrain = int(numitems * percenttrain)
numtest = numitems - numtrain

# Shuffle every row (frac=1 samples the whole frame without replacement) and
# renumber, so the positional cut below yields a random partition.
df = df.sample(frac=1).reset_index(drop=True)

# First `numtrain` shuffled rows train the model; the remainder is held out.
# Each part gets a fresh 0..n-1 index so later code can address rows by position.
traindf = df.iloc[:numtrain].reset_index(drop=True)
testdf = df.iloc[numtrain:].reset_index(drop=True)

## Setting weights

In [6]:
# Per-row weights for the training labels; 'balanced' weights each row
# inversely to how frequent its class is among the training labels.
# NOTE(review): these only influence the model if they are handed to the
# estimator's fit(..., sample_weight=weight).
y = traindf['Funded']
weight = compute_sample_weight('balanced', y)

## The Model - Support Vector Machine

In [14]:
# Fit a support vector classifier that predicts 'Funded' from the four
# predictor columns of the training set.
X = traindf[['CTOnum','FemNum', 'TeamSolo', 'dist']]
y = traindf['Funded']

# probability=True is required so that predict_proba is available, which lets
# a custom decision threshold be applied to the predictions afterwards.
clf = svm.SVC(probability=True)

# Pass the balanced per-row weights computed from the training labels;
# without sample_weight they are never seen by the model and the class
# re-balancing has no effect.
clf.fit(X, y, sample_weight=weight)

# Class probabilities for the training rows themselves (one column per class).
predictions = clf.predict_proba(X)

In [15]:
# Predictor columns fed to the classifier.
features = ['CTOnum', 'FemNum', 'TeamSolo', 'dist']

# Score the held-out rows with class probabilities rather than hard labels,
# so that the decision threshold can be tuned for recall afterwards.
test_X = testdf[features]
pred = clf.predict_proba(test_X)

In [16]:
# Turn the predicted probabilities into hard 0/1 labels with a custom cut-off
# instead of the default 0.5.
# A row is labelled 1 when the probability in column 0 is below 0.7, and 0
# otherwise. For a two-class model this is the same as "probability in
# column 1 is above 0.3", i.e. a lower bar for predicting 1, which favours recall.
# NOTE(review): assumes column 0 of predict_proba is the "not funded" class
# (columns follow clf.classes_) - confirm.
pred = pd.DataFrame(pred)
# Selecting with [[0]] keeps a one-column frame, so the result is an (n, 1) integer array.
predictions = pred[[0]].lt(0.7).astype('int').values

## Calculate Accuracy

In [None]:
# Compare the thresholded test-set predictions with the true 'Funded' labels
# and print the usual confusion-matrix based rates.
def _rate(numerator, denominator):
    """Return numerator / denominator as a float, or NaN when the denominator is 0."""
    return float(numerator) / float(denominator) if denominator else float('nan')


numtrain = len(traindf)
numtest = len(testdf)

# True labels and predicted labels as flat arrays; ravel() accepts both an
# (n, 1) column of predictions and a plain length-n vector.
actual = testdf['Funded'].values
predicted = np.asarray(predictions).ravel()
if len(predicted) != numtest:
    raise ValueError(
        'predictions has %d rows but the test set has %d' % (len(predicted), numtest)
    )

is_yes = actual == 1
is_no = actual == 0
said_yes = predicted == 1
said_no = predicted == 0

numyes = int(is_yes.sum())
numno = int(is_no.sum())
correct = int((predicted == actual).sum())
misclassification = int((predicted != actual).sum())
true_positive = int((said_yes & is_yes).sum())   # count behind sensitivity / recall
false_positive = int((said_yes & is_no).sum())   # type 1 errors
specificity = int((said_no & is_no).sum())       # true negatives: actual no, predicted no
false_negative = int((said_no & is_yes).sum())   # type 2 errors
predicted_yes = int(said_yes.sum())

# _rate yields NaN instead of raising ZeroDivisionError when a denominator is
# zero (e.g. no positives in the test set, or the model never predicts 1).
print('Accuracy:', _rate(correct, numtest))
print('Misclassification/Error Rate', _rate(misclassification, numtest))
print('True Positive Rate/Recall...most important', _rate(true_positive, numyes), true_positive, numyes)
print('False Positive Rate', _rate(false_positive, numno))
print('Specificity/True Negatives', _rate(specificity, numno))
print('Precision/correct when predict yes', _rate(true_positive, predicted_yes), true_positive, predicted_yes)
print('Prevalence', _rate(numyes, numtest))