In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [2]:
df=pd.read_csv("D:/d downloads/spam.csv",encoding="latin-1")[["v1","v2"]]
df.columns=["label","text"]
df["label"]=df["label"].map({"ham":0,"spam":1})
X=df["text"]
y=df["label"]
vec=TfidfVectorizer(lowercase=True,stop_words="english")
Xv=vec.fit_transform(X)
Xtr,Xte,ytr,yte=train_test_split(Xv,y,test_size=0.2,random_state=1,stratify=y)
print(y.value_counts())

label
0    4825
1     747
Name: count, dtype: int64


In [3]:
stump=DecisionTreeClassifier(max_depth=1)
stump.fit(Xtr,ytr)
p1=stump.predict(Xtr)
p2=stump.predict(Xte)
print(accuracy_score(ytr,p1))
print(accuracy_score(yte,p2))
print(confusion_matrix(yte,p2))

0.8923042405205295
0.885201793721973
[[961   5]
 [123  26]]


In [4]:
T=15
n=Xtr.shape[0]
w=np.ones(n)/n
alphas=[]
errors=[]
models=[]

In [5]:
for t in range(T):
    m=DecisionTreeClassifier(max_depth=1)
    m.fit(Xtr,ytr,sample_weight=w)
    pred=m.predict(Xtr)
    mis=(pred!=ytr).astype(int)
    err=np.dot(w,mis)/w.sum()
    alpha=0.5*np.log((1-err)/err)
    w=w*np.exp(alpha*mis)
    w=w/w.sum()
    alphas.append(alpha)
    errors.append(err)
    models.append(m)
    print(t,np.where(mis==1)[0],w[mis==1],alpha)

def predict_boost(models,alphas,X):
    s=np.zeros(X.shape[0])
    for a,m in zip(alphas,models):
        s+=a*(m.predict(X)*2-1)
    return (s>0).astype(int)

0 [   2    4   11   19   29   39   46   58   64   77   84  101  109  114
  123  125  128  140  147  152  157  158  159  161  172  182  183  187
  199  235  273  280  339  342  347  355  359  362  363  364  368  373
  404  411  414  415  419  423  433  443  452  464  467  468  469  475
  477  485  487  495  511  526  539  548  553  571  576  598  606  611
  630  633  639  641  644  652  653  675  681  693  700  729  756  761
  774  781  787  817  839  844  851  859  864  869  873  876  891  908
  913  917  936  942  952  954  956  960  966  967  970  992 1001 1004
 1012 1026 1031 1037 1056 1062 1066 1073 1094 1119 1128 1133 1142 1174
 1177 1198 1206 1208 1214 1226 1241 1256 1258 1260 1280 1290 1291 1294
 1296 1312 1320 1334 1373 1392 1408 1432 1465 1476 1482 1489 1503 1549
 1554 1561 1565 1568 1602 1610 1615 1618 1623 1624 1634 1637 1642 1643
 1653 1680 1683 1693 1719 1726 1729 1734 1741 1745 1749 1764 1766 1788
 1797 1817 1827 1830 1831 1838 1845 1854 1855 1858 1860 1862 1864 1866
 189

In [6]:
ptr=predict_boost(models,alphas,Xtr)
pte=predict_boost(models,alphas,Xte)
print(accuracy_score(ytr,ptr))
print(accuracy_score(yte,pte))
print(confusion_matrix(yte,pte))

0.8835539600628225
0.8798206278026905
[[966   0]
 [134  15]]


In [7]:
from sklearn.ensemble import AdaBoostClassifier
adb=AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=0.6)
adb.fit(Xtr,ytr)
p1=adb.predict(Xtr)
p2=adb.predict(Xte)
print(accuracy_score(ytr,p1))
print(accuracy_score(yte,p2))
print(confusion_matrix(yte,p2))



0.9860892977339017
0.9730941704035875
[[964   2]
 [ 28 121]]
