### Gaussian Naive Bayes Classfier

In [1]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets
from sklearn.metrics import accuracy_score

In [4]:
iris = datasets.load_iris()

In [5]:
data = iris.data
target = iris.target

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(data,target,shuffle=True)

In [12]:
clf = GaussianNB()

In [13]:
clf.fit(X_train,Y_train) #Here we specify features as the first argument and labels as second argument

GaussianNB(priors=None)

In [15]:
pred = clf.predict(X_test)

In [17]:
accuracy = accuracy_score(Y_test,pred) * 100
print("Accuracy: ",accuracy)

Accuracy:  92.10526315789474


In accuracy_score we mention the true target values and predicted values

### Support Vector Machines

Breast cancer dataset with two class labels is used here from sklearn 

In [14]:
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

In [6]:
data = load_breast_cancer()
print(data.target_names)

['malignant' 'benign']


In [11]:
X = data.data
Y = data.target

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,shuffle=True)

In [16]:
clf = svm.SVC(kernel="linear")
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)

In [17]:
print(accuracy_score(Y_test,pred) * 100)

94.4055944055944


##### Trying out iris dataset here to compare Naive Bayes and SVM 

In [1]:
from sklearn import datasets
from sklearn.metrics import accuracy_score

In [2]:
data = datasets.load_iris()
X = data.data
Y = data.target

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,shuffle=True)

In [4]:
from sklearn import svm
clf = svm.SVC(kernel="linear")
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)

In [5]:
print(accuracy_score(Y_test,pred) * 100)

94.73684210526315


This proves that SVM is better than Gaussian Naive Bayes classifier

#### Trying out kernel and gamma functions

In [6]:
from sklearn import svm
clf = svm.SVC(kernel="linear",gamma=1.0)

In [7]:
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print(accuracy_score(Y_test,pred))

1.0


#### Trying out different values of C

In [10]:
clf = svm.SVC(C=100.0)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print(accuracy_score(Y_test,pred))

0.9473684210526315


In [11]:
clf = svm.SVC(C=10.0)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print(accuracy_score(Y_test,pred))

1.0


In [12]:
clf = svm.SVC(C=1000.0)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print(accuracy_score(Y_test,pred))

0.9210526315789473


Accuracy is reducing as the C value increases

### Decision trees

In [2]:
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score

  return f(*args, **kwds)


In [3]:
clf = tree.DecisionTreeClassifier()
data = load_wine()

In [6]:
from sklearn.model_selection import train_test_split
X = data.data
Y = data.target
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,shuffle=True)
clf.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [7]:
pred = clf.predict(X_test)
print(accuracy_score(Y_test,pred) * 100)

91.11111111111111


Now change min_samples_split parameter to reduce overfitting. Default is 2

In [11]:
clf1 = tree.DecisionTreeClassifier(min_samples_split=100)
clf1.fit(X_train, Y_train)
pred = clf1.predict(X_test)
print(accuracy_score(Y_test,pred) * 100)

60.0


Now change the criterion to entropy instead of gini which is the default

In [12]:
clf2 = tree.DecisionTreeClassifier(criterion="entropy")
clf2.fit(X_train, Y_train)
pred = clf2.predict(X_test)
print(accuracy_score(Y_test,pred) * 100)

93.33333333333333


### Regression

Boston housing price dataset is used here

In [2]:
from sklearn.datasets import load_boston
data = load_boston()

In [3]:
X = data.data
Y = data.target
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle=True)

In [20]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
clf.coef_

array([-1.15131510e-01,  6.27282384e-02, -2.95565820e-02,  3.30119016e+00,
       -1.88055547e+01,  3.49818932e+00,  7.11364009e-03, -1.77103096e+00,
        3.37105315e-01, -1.31437203e-02, -9.04062946e-01,  9.52361118e-03,
       -5.38593360e-01])

In [22]:
clf.intercept_

39.153357099698766

The above values  indicate the coefficients of different features as well as intercept value. The value below give R^2 value that gives the accuracy of the regression line. This is done using score function whose parameters are features and labels of the test set

In [23]:
pred = clf.predict(X_test)

In [24]:
clf.score(X_test, Y_test)

0.7168553881865195

In [25]:
from sklearn.metrics import r2_score, mean_squared_error
print(r2_score(Y_test,pred))
print(mean_squared_error(Y_test,pred))

0.7168553881865195
22.52142529595035


Changing the parameter to observe change in the model that we created

In [10]:
clf1 = LinearRegression(fit_intercept=False).fit(X_train,Y_train)
print(clf1.coef_, clf1.intercept_)

[-9.36607572e-02  6.44075520e-02 -4.04097134e-02  3.25133893e+00
 -3.45914909e+00  5.83455089e+00 -3.17625150e-03 -1.17439789e+00
  2.02008430e-01 -1.03152142e-02 -3.29431645e-01  1.58450681e-02
 -4.05186334e-01] 0.0


In [11]:
clf1.score(X_test, Y_test)

0.6951865174442696