In [3]:
# Python Code Snippet for selecting best model learning model

# import the dataset
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits
# import the variuos ML models from sklearn library
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
# import parameter search model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

digits = load_digits()

model_params = {'svm':{'model':svm.SVC(gamma='auto'),'params':{'C':[1,10,20],'kernel':['rbf','linear']}},
               'random_forest':{'model':RandomForestClassifier(), 'params':{'n_estimators':[1,5,10]}},
               'logistic_regression':{'model':LogisticRegression(solver='liblinear', multi_class ='auto'), 'params':{'C':[1,5,10]}},
               'GaussianNB':{'model':GaussianNB(), 'params':{'var_smoothing':[2e-9]}},
               'MultinomialNB':{'model':MultinomialNB(), 'params':{'alpha':[0.6,1,2]}},
               'DecisionTreeClassifier':{'model':DecisionTreeClassifier(), 'params':{'criterion':['gini','entropy']}}}

scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(digits.data, digits.target)
    scores.append({'model':model_name, 'best_score':clf.best_score_, 'best_params':clf.best_params_})
    
df = pd.DataFrame(scores, columns=['model', 'best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.892639,{'n_estimators': 10}
2,logistic_regression,0.922114,{'C': 1}
3,GaussianNB,0.814157,{'var_smoothing': 2e-09}
4,MultinomialNB,0.871464,{'alpha': 2}
5,DecisionTreeClassifier,0.813607,{'criterion': 'entropy'}


In [4]:
# Python Code Snippet for spam detection 

import pandas as pd
df = pd.read_csv('spam.csv')
df.groupby('Category').describe()
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)
#df.drop(df['Category'], axis=1)
print(df.head())

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(df.Message, df.spam, test_size=0.25)

from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
x_train_count = v.fit_transform(x_train.values)
x_train_count.toarray()[:3]

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(x_train_count, y_train)

x_test_count = v.transform(x_test)
model.score(x_test_count, y_test)

#using a pipeline to embed the text conversion and classsification
from sklearn.pipeline import Pipeline
clf = Pipeline([('vectorizer', CountVectorizer()), ('nb',MultinomialNB())])
clf.fit(x_train,y_train)
clf.score(x_test,y_test)

  Category                                            Message  spam
0      ham  Go until jurong point, crazy.. Available only ...     0
1      ham                      Ok lar... Joking wif u oni...     0
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...     1
3      ham  U dun say so early hor... U c already then say...     0
4      ham  Nah I don't think he goes to usf, he lives aro...     0


0.9834888729361091