# **DATA PREPARATION**

In [0]:
from sklearn.datasets import load_files
data = load_files('/content/bbc',shuffle=False,encoding='latin')

In [0]:
import pandas as pd
bbc_news_data = list(zip(data.data,data.target))
bbc_news_dataframe = pd.DataFrame(data = bbc_news_data, columns=['TEXT','LABEL'])
bbc_news_dataframe.head()

Unnamed: 0,TEXT,LABEL
0,Ad sales boost Time Warner profit\n\nQuarterly...,0
1,Dollar gains on Greenspan speech\n\nThe dollar...,0
2,Yukos unit buyer faces loan claim\n\nThe owner...,0
3,High fuel prices hit BA's profits\n\nBritish A...,0
4,Pernod takeover talk lifts Domecq\n\nShares in...,0


In [0]:
bbc_news_dataframe.to_csv('/content/drive/My Drive/Information_Retrieval/bbc_news_dataframe.csv')

In [0]:
import pandas as pd

bbc_news_dataframe = pd.read_csv('/content/drive/My Drive/Information_Retrieval/bbc_news_dataframe.csv',index_col=False)
bbc_news_dataframe.head()

Unnamed: 0.1,Unnamed: 0,TEXT,LABEL
0,0,Ad sales boost Time Warner profit\n\nQuarterly...,0
1,1,Dollar gains on Greenspan speech\n\nThe dollar...,0
2,2,Yukos unit buyer faces loan claim\n\nThe owner...,0
3,3,High fuel prices hit BA's profits\n\nBritish A...,0
4,4,Pernod takeover talk lifts Domecq\n\nShares in...,0


In [0]:
from sklearn.model_selection import train_test_split
bbc_news_train, bbc_news_test, bbc_news_train_labels, bbc_news_test_labels = train_test_split(bbc_news_dataframe['TEXT'], bbc_news_dataframe['LABEL'],
                                                                                              stratify=bbc_news_dataframe['LABEL'], test_size=0.3,random_state = 10)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfIdfVectorier = TfidfVectorizer(stop_words = 'english', min_df = 0)
bbc_news_train_vectors = tfIdfVectorier.fit_transform(bbc_news_train)
bbc_news_test_vectors = tfIdfVectorier.transform(bbc_news_test)

In [0]:
bbc_news_train_vectors.shape,bbc_news_test_vectors.shape

((1557, 25108), (668, 25108))

# **VOTING CLASSIFIER**

In [0]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

clf1 = LogisticRegression(C = 10, random_state=1)
clf2 = RandomForestClassifier(n_estimators=100,random_state=1)
clf3 = MultinomialNB(alpha = 0.1)

eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('mnb', clf3)], voting='hard')

In [58]:
eclf1.fit(bbc_news_train_vectors, bbc_news_train_labels)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=10, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=1, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                 

In [59]:
from sklearn.metrics import  classification_report
from sklearn.metrics import accuracy_score
bbc_news_pred_labels = eclf1.predict(bbc_news_test_vectors)
accuracy_score(bbc_news_test_labels, bbc_news_pred_labels) 

0.9850299401197605

# **STACKING CLASSIFIER**

In [67]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

clf1 = LogisticRegression(C = 10, random_state=1)
clf2 = RandomForestClassifier(n_estimators=100,random_state=1)
clf3 = MultinomialNB(alpha = 0.1)

clf = StackingClassifier(estimators = [('lr',clf1),('rfc',clf2),('mnb',clf3)] , final_estimator = RandomForestClassifier(n_estimators=100,random_state=1) )
clf.fit(bbc_news_train_vectors.toarray(), bbc_news_train_labels)

StackingClassifier(cv=None,
                   estimators=[('lr',
                                LogisticRegression(C=10, class_weight=None,
                                                   dual=False,
                                                   fit_intercept=True,
                                                   intercept_scaling=1,
                                                   l1_ratio=None, max_iter=100,
                                                   multi_class='auto',
                                                   n_jobs=None, penalty='l2',
                                                   random_state=1,
                                                   solver='lbfgs', tol=0.0001,
                                                   verbose=0,
                                                   warm_start=False)),
                               ('rfc',
                                RandomForestClassifier(bootstrap=True,
                                  

In [68]:
from sklearn.metrics import  classification_report
from sklearn.metrics import accuracy_score
bbc_news_pred_labels = clf.predict(bbc_news_test_vectors)
accuracy_score(bbc_news_test_labels, bbc_news_pred_labels) 

0.9865269461077845