In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt


## Q1 SMS Spam Collection

In [None]:
# Candidate locations for the SMS Spam Collection, tried in order. The local
# file is listed last so that the "place spam.csv in working dir" hint printed
# below is actually honoured on a re-run.
urls=['https://media.geeksforgeeks.org/wp-content/uploads/20240515170038/spam.csv',
      'https://raw.githubusercontent.com/epfml/ML_course/master/labs/ex11_adaboost/spam.csv',
      'https://raw.githubusercontent.com/justmarkham/pydata-book/master/datasets/sms_spam/spam.csv',
      'spam.csv']
df=None
for u in urls:
    try:
        df=pd.read_csv(u, encoding='latin-1')
        break
    except Exception as exc:
        # Report why a source was skipped instead of swallowing everything
        # (a bare `except:` would also trap KeyboardInterrupt).
        print('could not load',u,'->',type(exc).__name__,exc)
if df is None:
    print('CSV not found; please place spam.csv in working dir')
else:
    # Some copies of the dataset use the raw column names v1/v2.
    if 'v1' in df.columns:
        df=df.rename(columns={'v1':'label','v2':'text'})
    # .copy() so the column assignment below writes to an independent frame
    # rather than a slice of the loaded one (avoids SettingWithCopyWarning).
    df=df[['label','text']].copy()
    df['y']=df['label'].map({'ham':0,'spam':1})
    X_text=df['text']
    # TF-IDF bag-of-words features; the result is a sparse matrix.
    tf=TfidfVectorizer(lowercase=True,stop_words='english')
    X=tf.fit_transform(X_text)
    # Stratified 80/20 split so both parts keep the ham/spam ratio.
    X_train,X_test,y_train,y_test=train_test_split(X,df['y'],test_size=0.2,random_state=42,stratify=df['y'])
    print('Loaded SMS dataset, shape',df.shape)


In [None]:
# Baseline for Q1: a single depth-1 decision tree (a "stump") on the TF-IDF
# features, scored on both splits.
stump=DecisionTreeClassifier(random_state=0, max_depth=1)
stump.fit(X_train,y_train)
stump_train_pred=stump.predict(X_train)
stump_test_pred=stump.predict(X_test)
print('train',accuracy_score(y_train,stump_train_pred))
print('test',accuracy_score(y_test,stump_test_pred))
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test,stump_test_pred))


### Manual AdaBoost

In [None]:
def manual_adaboost(X_train,y_train,X_test,T=15,y_test=None):
    """Train a hand-written AdaBoost ensemble of decision stumps and report it.

    Labels must be 0/1; predictions are mapped to -1/+1 for the weighted vote.

    Parameters
    ----------
    X_train : array-like or sparse matrix with one row per training sample.
    y_train : 0/1 labels for ``X_train`` (any array-like, e.g. a pandas Series).
    X_test : features of the held-out samples, same layout as ``X_train``.
    T : number of boosting rounds.
    y_test : 0/1 labels for ``X_test``. Passed explicitly so the function no
        longer depends on a notebook-level variable; when omitted, the test
        accuracy and confusion matrix are skipped.

    Side effects
    ------------
    Prints one line per round (weighted error, alpha, number of misclassified
    training samples), then the train/test accuracy and the test confusion
    matrix, and shows two plots (weighted error and alpha per round).
    Returns None.
    """
    # Work on plain arrays so the sample weights stay a NumPy vector instead of
    # turning into a pandas Series that carries the shuffled split index.
    y_arr=np.asarray(y_train)
    n=X_train.shape[0]
    w=np.full(n,1.0/n)
    learners=[]
    alphas=[]
    errs=[]
    eps=1e-10
    for t in range(T):
        clf=DecisionTreeClassifier(max_depth=1, random_state=42)
        clf.fit(X_train,y_arr,sample_weight=w)
        miss=(clf.predict(X_train)!=y_arr).astype(int)
        err=float(np.sum(w*miss)/np.sum(w))
        # Clip before taking the log: err==0 or err==1 would otherwise give a
        # division by zero / log(0) and poison the weights with inf or NaN.
        # A perfect stump therefore gets a large finite alpha.
        err_safe=min(max(err,eps),1-eps)
        alpha=0.5*np.log((1-err_safe)/err_safe)
        # miss*2-1 is +1 for misclassified and -1 for correct samples, so
        # mistakes are up-weighted and correct samples down-weighted.
        w=w*np.exp(alpha*(miss*2-1))
        w=w/w.sum()
        learners.append(clf)
        alphas.append(alpha)
        errs.append(err)
        print('iter',t+1,'err',round(err,4),'alpha',round(alpha,4),'mis_count',int(miss.sum()))
    def predict_ensemble(X):
        """Alpha-weighted vote of all stumps, returned as 0/1 labels."""
        agg=np.zeros(X.shape[0])
        for a,clf in zip(alphas,learners):
            agg+=a*(clf.predict(X)*2-1)
        return (agg>0).astype(int)
    print('train',accuracy_score(y_arr,predict_ensemble(X_train)))
    if y_test is not None:
        yte_pred=predict_ensemble(X_test)
        print('test',accuracy_score(y_test,yte_pred))
        print(confusion_matrix(y_test,yte_pred))
    rounds=range(1,len(errs)+1)
    plt.figure();plt.plot(rounds,errs);plt.xlabel('iter');plt.ylabel('weighted error');plt.show()
    plt.figure();plt.plot(rounds,alphas);plt.xlabel('iter');plt.ylabel('alpha');plt.show()
manual_adaboost(X_train,y_train,X_test,T=15,y_test=y_test)


### Sklearn AdaBoost

In [None]:
# The stump is passed positionally on purpose: the keyword for the weak learner
# was `base_estimator` before scikit-learn 1.2 and is `estimator` afterwards
# (the old name was removed in 1.4). It is the first parameter in both
# spellings, so this call works on old and new releases alike.
ab=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=100,learning_rate=0.6,random_state=0)
ab.fit(X_train,y_train)
ab_test_pred=ab.predict(X_test)
print('train',accuracy_score(y_train,ab.predict(X_train)))
print('test',accuracy_score(y_test,ab_test_pred))
print(confusion_matrix(y_test,ab_test_pred))


## Q2 Heart Disease

In [None]:
# Candidate locations for the heart-disease table, tried in order. The local
# file is listed last so the "place heart.csv in working dir" hint below works.
heart_urls=['https://raw.githubusercontent.com/dev-kudli/heart-disease-prediction/dev/heart.csv',
            'https://raw.githubusercontent.com/anshuldhingra/heart-disease-prediction/master/heart.csv',
            'https://raw.githubusercontent.com/auribises/Heart-Disease-Dataset/master/heart.csv',
            'heart.csv']
heart=None
for u in heart_urls:
    try:
        heart=pd.read_csv(u)
        break
    except Exception as exc:
        # Say which source failed and why instead of silently moving on.
        print('could not load',u,'->',type(exc).__name__,exc)
if heart is None:
    print('heart CSV not found; please place heart.csv in working dir')
else:
    # The copies of this dataset disagree on the name of the label column;
    # fall back to the last column when neither known name is present.
    if 'target' in heart.columns:
        target_col='target'
    elif 'HeartDisease' in heart.columns:
        target_col='HeartDisease'
    else:
        target_col=heart.columns[-1]
    y=heart[target_col]
    X=heart.drop(columns=[target_col])
    # One-hot encode any string/categorical feature columns so the tree can be
    # fitted; frames that are already fully numeric pass through unchanged.
    X=pd.get_dummies(X,dtype=float)
    X_train_h,X_test_h,y_train_h,y_test_h=train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)
    # Baseline: a single decision stump.
    stump_h=DecisionTreeClassifier(max_depth=1,random_state=0)
    stump_h.fit(X_train_h,y_train_h)
    stump_h_test_pred=stump_h.predict(X_test_h)
    print('stump train',accuracy_score(y_train_h,stump_h.predict(X_train_h)))
    print('stump test',accuracy_score(y_test_h,stump_h_test_pred))
    print(confusion_matrix(y_test_h,stump_h_test_pred))


In [None]:
# Grid of AdaBoost settings evaluated on the heart data.
n_list=[5,10,25,50,100]
lr_list=[0.1,0.5,1.0]
res={}
for lr in lr_list:
    accs=[]
    for n in n_list:
        # Stump passed positionally: the keyword is `base_estimator` in
        # scikit-learn < 1.2 and `estimator` from 1.2 on (old name removed in 1.4).
        model=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=n,learning_rate=lr,random_state=0)
        model.fit(X_train_h,y_train_h)
        accs.append(accuracy_score(y_test_h,model.predict(X_test_h)))
    res[lr]=accs
for lr,accs in res.items():
    plt.plot(n_list,accs,label=f'lr={lr}')
plt.xlabel('n_estimators');plt.ylabel('accuracy');plt.legend();plt.show()
# NOTE(review): the configuration is chosen by accuracy on the test split, so
# the 'test' score printed below is optimistically biased. Selecting on a
# validation split or with cross-validation on the training data would avoid
# this; left as-is to keep the reported numbers comparable.
# Start below any achievable accuracy so best_lr/best_n are always assigned.
best_lr,best_n,max_acc=None,None,-1.0
for lr,accs in res.items():
    top_acc=max(accs)
    # Strict '>' keeps the first configuration on ties.
    if top_acc>max_acc:
        max_acc=top_acc;best_lr=lr;best_n=n_list[accs.index(top_acc)]
print('best',best_lr,best_n,max_acc)
best=AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=best_n,learning_rate=best_lr,random_state=0)
best.fit(X_train_h,y_train_h)
print('test',accuracy_score(y_test_h,best.predict(X_test_h)))
# Unweighted training error of each individual weak learner. The fitted
# sub-estimators were trained on plain arrays, so feed them arrays as well;
# passing the DataFrame makes scikit-learn warn about feature names.
X_train_h_arr=np.asarray(X_train_h,dtype=float)
y_train_h_arr=np.asarray(y_train_h)
errs=[]
for est in best.estimators_:
    pred=est.predict(X_train_h_arr)
    errs.append((pred!=y_train_h_arr).mean())
plt.plot(range(1,len(errs)+1),errs);plt.xlabel('iter');plt.ylabel('weak error');plt.show()
importances=best.feature_importances_
inds=np.argsort(importances)[::-1][:5]
# Take the names from the training frame itself rather than the notebook-level
# `X`, which other cells also assign.
print('top5 features', list(X_train_h.columns[inds]))


## Q3 WISDM accelerometer

In [None]:
# Candidate locations for the raw WISDM accelerometer log, tried in order. The
# local file is listed last so the "place ... in working dir" hint below works.
wisdm_urls=['https://raw.githubusercontent.com/soham97/WISD_HAR_files/master/WISDM_ar_v1.1_raw.txt',
           'https://raw.githubusercontent.com/laxmimerit/WISDM_datasets/master/WISDM_ar_v1.1_raw.txt',
           'WISDM_ar_v1.1_raw.txt']
wisdm_cols=['user','activity','timestamp','x','y','z']
txt=None
for u in wisdm_urls:
    try:
        # Parse straight into the six named columns. Everything is read as text
        # (numeric conversion happens below) and rows with too many fields are
        # skipped instead of aborting the whole load.
        txt=pd.read_csv(u,header=None,names=wisdm_cols,dtype=str,encoding='latin-1',on_bad_lines='skip')
        break
    except Exception as exc:
        # Say which source failed and why instead of silently moving on.
        print('could not load',u,'->',type(exc).__name__,exc)
if txt is None:
    print('WISDM not found; place WISDM_ar_v1.1_raw.txt in working dir')
else:
    parts=txt.copy()
    # Strip the ';' record terminator, which ends up attached to the last field.
    parts['z']=parts['z'].str.replace(';','',regex=False)
    # Unparseable readings become NaN and are removed together with short rows.
    for axis in ['x','y','z']:
        parts[axis]=pd.to_numeric(parts[axis],errors='coerce')
    parts=parts.dropna()
    # Binary target: 1 for jogging/upstairs, 0 for everything else
    # (activities missing from the mapping also fall back to 0).
    mapping={'Jogging':1,'Upstairs':1,'Walking':0,'Sitting':0,'Standing':0,'Downstairs':0,'Jog':1,'Up':1}
    parts['label']=parts['activity'].map(mapping).fillna(0).astype(int)
    features=parts[['x','y','z']]
    X_train_w,X_test_w,y_train_w,y_test_w=train_test_split(features,parts['label'],test_size=0.3,random_state=42,stratify=parts['label'])
    # Baseline: a single decision stump on the three accelerometer axes.
    stump_w=DecisionTreeClassifier(max_depth=1,random_state=0)
    stump_w.fit(X_train_w,y_train_w)
    print('stump train',accuracy_score(y_train_w,stump_w.predict(X_train_w)))
    print('stump test',accuracy_score(y_test_w,stump_w.predict(X_test_w)))
    def manual_adaboost_tab(X_train,y_train,X_test,T=20,y_test=None):
        """Hand-written AdaBoost with decision stumps for dense tabular data.

        Labels must be 0/1. Prints the ensemble's train accuracy and, when
        ``y_test`` is given, its accuracy on ``X_test``. Returns None.

        Parameters
        ----------
        X_train, X_test : array-likes (e.g. DataFrames) with matching columns.
        y_train : 0/1 labels for ``X_train``.
        T : number of boosting rounds.
        y_test : 0/1 labels for ``X_test``; the test score is skipped if omitted.
        """
        # Plain arrays keep the weight arithmetic positional, independent of
        # whatever index the inputs carry after the shuffle.
        Xn=np.asarray(X_train)
        Xt=np.asarray(X_test)
        y_arr=np.asarray(y_train)
        n=Xn.shape[0]
        w=np.full(n,1.0/n)
        learners=[]
        alphas=[]
        errs=[]
        eps=1e-10
        for t in range(T):
            # Seeded so tie-breaking between equally good splits is repeatable.
            clf=DecisionTreeClassifier(max_depth=1,random_state=42)
            clf.fit(Xn,y_arr,sample_weight=w)
            miss=(clf.predict(Xn)!=y_arr).astype(int)
            err=float(np.sum(w*miss)/np.sum(w))
            # Clip so err==0 or err==1 cannot produce log(0) / division by zero.
            err_safe=min(max(err,eps),1-eps)
            alpha=0.5*np.log((1-err_safe)/err_safe)
            # +1 for misclassified, -1 for correct: mistakes gain weight.
            w=w*np.exp(alpha*(miss*2-1))
            w=w/w.sum()
            learners.append(clf);alphas.append(alpha);errs.append(err)
        def pred_ens(X):
            """Alpha-weighted vote of all stumps, returned as 0/1 labels."""
            agg=np.zeros(X.shape[0])
            for a,clf in zip(alphas,learners):
                agg+=a*(clf.predict(X)*2-1)
            return (agg>0).astype(int)
        print('train',accuracy_score(y_arr,pred_ens(Xn)))
        # Score the data that was passed in, not the notebook-level split.
        if y_test is not None:
            print('test',accuracy_score(y_test,pred_ens(Xt)))
    manual_adaboost_tab(X_train_w,y_train_w,X_test_w,T=20,y_test=y_test_w)
