### Gaussian Naive Bayes Classfier

In [1]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets
from sklearn.metrics import accuracy_score

In [4]:
iris = datasets.load_iris()

In [5]:
data = iris.data
target = iris.target

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(data,target,shuffle=True)

In [12]:
clf = GaussianNB()

In [13]:
clf.fit(X_train,Y_train) #Here we specify features as the first argument and labels as second argument

GaussianNB(priors=None)

In [15]:
pred = clf.predict(X_test)

In [17]:
accuracy = accuracy_score(Y_test,pred) * 100
print("Accuracy: ",accuracy)

Accuracy:  92.10526315789474


In accuracy_score we mention the true target values and predicted values

### Support Vector Machines

Breast cancer dataset with two class labels is used here from sklearn 

In [14]:
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

In [6]:
data = load_breast_cancer()
print(data.target_names)

['malignant' 'benign']


In [11]:
X = data.data
Y = data.target

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,shuffle=True)

In [16]:
clf = svm.SVC(kernel="linear")
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)

In [17]:
print(accuracy_score(Y_test,pred) * 100)

94.4055944055944


##### Trying out iris dataset here to compare Naive Bayes and SVM 

In [1]:
from sklearn import datasets
from sklearn.metrics import accuracy_score

In [2]:
data = datasets.load_iris()
X = data.data
Y = data.target

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,shuffle=True)

In [4]:
from sklearn import svm
clf = svm.SVC(kernel="linear")
clf.fit(X_train,Y_train)
pred = clf.predict(X_test)

In [5]:
print(accuracy_score(Y_test,pred) * 100)

94.73684210526315


This proves that SVM is better than Gaussian Naive Bayes classifier

#### Trying out kernel and gamma functions

In [6]:
from sklearn import svm
clf = svm.SVC(kernel="linear",gamma=1.0)

In [7]:
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print(accuracy_score(Y_test,pred))

1.0


#### Trying out different values of C

In [10]:
clf = svm.SVC(C=100.0)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print(accuracy_score(Y_test,pred))

0.9473684210526315


In [11]:
clf = svm.SVC(C=10.0)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print(accuracy_score(Y_test,pred))

1.0


In [12]:
clf = svm.SVC(C=1000.0)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print(accuracy_score(Y_test,pred))

0.9210526315789473


Accuracy is reducing as the C value increases

### Decision trees

In [2]:
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score

  return f(*args, **kwds)


In [3]:
clf = tree.DecisionTreeClassifier()
data = load_wine()

In [6]:
from sklearn.model_selection import train_test_split
X = data.data
Y = data.target
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,shuffle=True)
clf.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [7]:
pred = clf.predict(X_test)
print(accuracy_score(Y_test,pred) * 100)

91.11111111111111


Now change min_samples_split parameter to reduce overfitting. Default is 2

In [11]:
clf1 = tree.DecisionTreeClassifier(min_samples_split=100)
clf1.fit(X_train, Y_train)
pred = clf1.predict(X_test)
print(accuracy_score(Y_test,pred) * 100)

60.0


Now change the criterion to entropy instead of gini which is the default

In [12]:
clf2 = tree.DecisionTreeClassifier(criterion="entropy")
clf2.fit(X_train, Y_train)
pred = clf2.predict(X_test)
print(accuracy_score(Y_test,pred) * 100)

93.33333333333333


### Regression

Boston housing price dataset is used here

In [15]:
from sklearn.datasets import load_boston
data = load_boston()

In [16]:
X = data.data
Y = data.target
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle=True)

In [5]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X_train, Y_train)

  return f(*args, **kwds)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [6]:
clf.coef_

array([-1.08145708e-01,  4.75961473e-02,  3.45582117e-02,  1.85340177e+00,
       -1.91454181e+01,  4.08054438e+00,  8.35665587e-03, -1.39447535e+00,
        3.24422926e-01, -1.29041924e-02, -9.50119158e-01,  1.11923737e-02,
       -5.05584733e-01])

In [7]:
clf.intercept_

33.59326449328111

The above values  indicate the coefficients of different features as well as intercept value. The value below give R^2 value that gives the accuracy of the regression line. This is done using score function whose parameters are features and labels of the test set

In [8]:
pred = clf.predict(X_test)

In [9]:
clf.score(X_test, Y_test)

0.7371116930084971

In [10]:
from sklearn.metrics import r2_score, mean_squared_error
print(r2_score(Y_test,pred))
print(mean_squared_error(Y_test,pred))

0.7371116930084971
22.141219924847373


Changing the parameter to observe change in the model that we created

In [10]:
clf1 = LinearRegression(fit_intercept=False).fit(X_train,Y_train)
print(clf1.coef_, clf1.intercept_)

[-9.36607572e-02  6.44075520e-02 -4.04097134e-02  3.25133893e+00
 -3.45914909e+00  5.83455089e+00 -3.17625150e-03 -1.17439789e+00
  2.02008430e-01 -1.03152142e-02 -3.29431645e-01  1.58450681e-02
 -4.05186334e-01] 0.0


In [11]:
clf1.score(X_test, Y_test)

0.6951865174442696

#### Using Lasso regression

In [17]:
from sklearn.linear_model import Lasso
clf = Lasso(alpha=0.1)
clf.fit(X_train, Y_train)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [18]:
clf.coef_

array([-9.38223060e-02,  5.28050179e-02, -3.69405917e-03,  0.00000000e+00,
       -0.00000000e+00,  3.22836331e+00, -2.19981022e-03, -1.15956771e+00,
        2.43271577e-01, -1.40458579e-02, -7.69906186e-01,  9.00057256e-03,
       -6.17697321e-01])

In [None]:
clf.score

### Feature Scaling

Python code for scaling the features in the interval [0,1]

In [1]:
import numpy as np
arr = np.random.randint(0,100,size=20)

In [2]:
new_arr = []
x_min = min(arr)
x_max = max(arr)
print("Old array: ", arr)
for i in arr:
    new_val = (i - x_min)/(x_max - x_min)
    new_arr.append(new_val)
print("Re-scaled array: ",new_arr)

Old array:  [19 29 47 25 77 49 45 27 11 15 82  8 97 64  1 76  4 65 20 94]
Re-scaled array:  [0.1875, 0.2916666666666667, 0.4791666666666667, 0.25, 0.7916666666666666, 0.5, 0.4583333333333333, 0.2708333333333333, 0.10416666666666667, 0.14583333333333334, 0.84375, 0.07291666666666667, 1.0, 0.65625, 0.0, 0.78125, 0.03125, 0.6666666666666666, 0.19791666666666666, 0.96875]


In [3]:
from sklearn.preprocessing import MinMaxScaler
old_arr = arr.astype(float)
old_arr = np.reshape(old_arr,(-1, 1))
scaler = MinMaxScaler()
rescaled_arr = scaler.fit_transform(old_arr)
print(rescaled_arr)

[[0.1875    ]
 [0.29166667]
 [0.47916667]
 [0.25      ]
 [0.79166667]
 [0.5       ]
 [0.45833333]
 [0.27083333]
 [0.10416667]
 [0.14583333]
 [0.84375   ]
 [0.07291667]
 [1.        ]
 [0.65625   ]
 [0.        ]
 [0.78125   ]
 [0.03125   ]
 [0.66666667]
 [0.19791667]
 [0.96875   ]]


### Text Learning

20_newsgroups dataset is being used here

In [1]:
from sklearn.datasets import fetch_20newsgroups
train = fetch_20newsgroups(subset="train")
test = fetch_20newsgroups(subset="test")

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(train.data)

In [3]:
from nltk.corpus import stopwords
sw = stopwords.words("english") #The parameter is the langauge

In [4]:
len(sw)

179

In [5]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [6]:
stemmer.stem("happiness")

'happi'

In [7]:
stemmer.stem("happily")

'happili'

In [8]:
stemmer.stem("happy")

'happi'

In [9]:
stemmer.stem("unhappy")

'unhappi'

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.6,stop_words="english")
train_transformed = vectorizer.fit_transform(train.data)
test_transformed = vectorizer.transform(test.data)

selector = SelectPercentile(f_classif, percentile=10)
selector.fit(train_transformed,train.target)
train_selected = selector.transform(train_transformed).toarray()
test_selected = selector.transform(test_transformed).toarray()

In [11]:
train_transformed.shape

(11314, 129792)

In [12]:
train_selected.shape

(11314, 12980)

In [13]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(train_selected,train.target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix
predicted = model.predict(test_selected)
print("Accuracy: ",accuracy_score(test.target, predicted))
print(confusion_matrix(test.target,predicted))

Accuracy:  0.8139936271906533
[[221   2   0   1   0   0   0   0   0   2   1   5   1   5   4  61   4   8
    1   3]
 [  1 283  13  14   9  31   2   1   1   4   4  11   3   0   5   3   3   1
    0   0]
 [  1  22 289  43   4  11   1   0   1   3   0  10   0   0   5   3   1   0
    0   0]
 [  0  10  23 308  15   3   7   4   1   0   1   2  14   0   4   0   0   0
    0   0]
 [  0   4  13  33 307   1   8   0   0   2   1   3   8   0   3   0   2   0
    0   0]
 [  1  33  12  15   1 319   1   0   1   0   0   5   1   0   5   0   1   0
    0   0]
 [  0   3   2  27  12   0 318   9   3   0   3   1   6   4   1   1   0   0
    0   0]
 [  1   1   1   5   0   1   6 358   7   2   3   1   5   0   1   0   2   1
    1   0]
 [  0   0   0   1   0   0   4   9 377   0   1   2   1   2   0   1   0   0
    0   0]
 [  0   1   0   0   1   0   4   2   1 366  20   0   0   0   0   1   0   1
    0   0]
 [  0   1   0   0   0   0   0   0   0   5 391   0   0   0   1   1   0   0
    0   0]
 [  0   4   2   0   3   2   1   2  