In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.utils import shuffle
from sklearn import svm

In [3]:
data = pd.read_csv("data/target.csv")

In [4]:
data=data.drop(['Unnamed: 7'], axis=1)
data=data.drop(['customer_id'], axis=1)
data = data.dropna(how='any')

In [5]:
data['res_type']= data['res_type'].map({'CN':0, 'CO': 1, 'RE': 2, 'TO': 3,'SI': 4}).astype(int) 

In [6]:
data=pd.get_dummies(data=data,columns=['geo_group'])

In [7]:
X = data.drop(['res_type'], axis=1).values.astype('float64')

In [8]:
Y = data['res_type'].values.astype('int64')

In [9]:
X = StandardScaler().fit_transform(X)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33, random_state=None) 
Xtrain[:2]

array([[-0.76974145,  0.36200515, -0.08219273,  0.11550278, -0.89195783,
         1.67584816, -0.40086982, -0.4297587 ],
       [ 0.8753427 , -1.66903253, -0.08219273, -0.26128642,  1.12112923,
        -0.59671277, -0.40086982, -0.4297587 ]])

In [10]:
Ytrain[:2]

array([1, 3])

In [11]:
Y[:2]

array([0, 0])

In [None]:
C = 5.0
classifiers = {
               'RF5': RandomForestClassifier(n_estimators=5),
               'RF50': RandomForestClassifier(n_estimators=50),                
               'tree':DecisionTreeClassifier(criterion='gini',max_depth=5),
                'SVM':svm.SVC(C=5.0,random_state=0, kernel='rbf' ,probability=True)
              }

plt.figure(figsize=(8,8))
n_classifiers = len(classifiers)

In [None]:
for index, (name, clf) in enumerate(classifiers.items()):
    clf.fit(Xtrain, Ytrain)
    probs = clf.predict_proba(Xtest)
    fpr, tpr, thresholds = roc_curve(Ytest, probs[:, 1])
    roc_auc = auc(fpr, tpr)
    print ('For model', name, 'accuracy =', clf.score(Xtest,Ytest))
    plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (name, roc_auc))
    
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()