In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn import svm

In [2]:
all_df = pd.read_csv("data/weather.csv")

In [3]:
all_df=all_df.drop(['RISK_MM'], axis=1)

In [4]:
X=all_df.drop(['RainTomorrow'], axis=1).values.astype('float64')

In [5]:
Y = all_df['RainTomorrow'].values.astype('int64')

In [6]:
X = StandardScaler().fit_transform(X)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33, random_state=None) 
Xtrain[:2]

array([[-0.35909598, -0.48463997, -0.274179  ,  0.0662001 , -0.38241242,
         0.56686938, -0.79929041,  0.48975473,  0.80923437,  0.9612014 ,
        -0.09083448, -0.61156205, -0.53271769],
       [-0.5004385 , -0.74227781,  0.64725343,  1.40154183,  0.07232373,
        -0.47055898,  0.40306027,  1.06969745, -2.92397173, -2.82830689,
        -0.05982382, -1.34332908,  1.87716688]])

In [7]:
Ytrain[:2]

array([0, 1])

In [8]:
Y[:2]

array([0, 0])

In [None]:
C = 5.0
classifiers = {
               'RF5': RandomForestClassifier(n_estimators=5),
               'RF50': RandomForestClassifier(n_estimators=50),                
               'tree':DecisionTreeClassifier(criterion='gini',max_depth=5),
                'SVM':svm.SVC(C=5.0,random_state=0, kernel='rbf' ,probability=True)
              }

plt.figure(figsize=(8,8))
n_classifiers = len(classifiers)

In [None]:
for index, (name, clf) in enumerate(classifiers.items()):
    clf.fit(Xtrain, Ytrain)
    probs = clf.predict_proba(Xtest)
    fpr, tpr, thresholds = roc_curve(Ytest, probs[:, 1])
    roc_auc = auc(fpr, tpr)
    print ('For model', name, 'accuracy =', clf.score(Xtest,Ytest))
    plt.plot(fpr, tpr, label='%s (area = %0.2f)' % (name, roc_auc))
    
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

For model RF5 accuracy = 0.8358292157608007
For model tree accuracy = 0.8438109717471177
For model RF50 accuracy = 0.853338401114912


