# **DATA PREPARATION**

In [0]:
from sklearn.datasets import load_files
data = load_files('/content/bbc',shuffle=False,encoding='latin')

In [8]:
import pandas as pd
bbc_news_data = list(zip(data.data,data.target))
bbc_news_dataframe = pd.DataFrame(data = bbc_news_data, columns=['TEXT','LABEL'])
bbc_news_dataframe.head()

Unnamed: 0,TEXT,LABEL
0,Ad sales boost Time Warner profit\n\nQuarterly...,0
1,Dollar gains on Greenspan speech\n\nThe dollar...,0
2,Yukos unit buyer faces loan claim\n\nThe owner...,0
3,High fuel prices hit BA's profits\n\nBritish A...,0
4,Pernod takeover talk lifts Domecq\n\nShares in...,0


In [0]:
bbc_news_dataframe.to_csv('/content/drive/My Drive/Information_Retrieval/bbc_news_dataframe.csv')

In [1]:
import pandas as pd

bbc_news_dataframe = pd.read_csv('/content/drive/My Drive/Information_Retrieval/bbc_news_dataframe.csv',index_col=False)
bbc_news_dataframe.head()

Unnamed: 0.1,Unnamed: 0,TEXT,LABEL
0,0,Ad sales boost Time Warner profit\n\nQuarterly...,0
1,1,Dollar gains on Greenspan speech\n\nThe dollar...,0
2,2,Yukos unit buyer faces loan claim\n\nThe owner...,0
3,3,High fuel prices hit BA's profits\n\nBritish A...,0
4,4,Pernod takeover talk lifts Domecq\n\nShares in...,0


In [0]:
target_names = ['business', 'entertainment', 'politics', 'sport', 'tech']

In [0]:
from sklearn.model_selection import train_test_split
bbc_news_train, bbc_news_test, bbc_news_train_labels, bbc_news_test_labels = train_test_split(bbc_news_dataframe['TEXT'], bbc_news_dataframe['LABEL'],
                                                                                              stratify=bbc_news_dataframe['LABEL'], test_size=0.3,random_state = 10)

# **TF-IDF**

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfIdfVectorier = TfidfVectorizer(stop_words = 'english', min_df = 0)
bbc_news_train_vectors = tfIdfVectorier.fit_transform(bbc_news_train)
bbc_news_test_vectors = tfIdfVectorier.transform(bbc_news_test)

In [13]:
bbc_news_train_vectors.shape,bbc_news_test_vectors.shape

((1557, 25108), (668, 25108))

In [14]:
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import pandas as pd

svm = SVC(kernel = 'linear',random_state = 0)
param_grid = {'C':[10**-6,10**(-5),10**(-4),10**(-3),10**(-2),10**(-1),1,10,100,1000,10000]}
gridSearch = GridSearchCV(svm, param_grid,scoring = 'f1_micro',cv = 5)
gridSearch.return_train_score = True
gridSearch.fit(bbc_news_train_vectors,bbc_news_train_labels)

dataframe = pd.DataFrame(gridSearch.cv_results_)
dataframe[['param_C','mean_test_score','mean_train_score']]

Unnamed: 0,param_C,mean_test_score,mean_train_score
0,1e-06,0.273019,0.275355
1,1e-05,0.273019,0.275355
2,0.0001,0.273019,0.275355
3,0.001,0.273019,0.275355
4,0.01,0.273019,0.275355
5,0.1,0.782884,0.841684
6,1.0,0.974307,0.999197
7,10.0,0.974307,1.0
8,100.0,0.974307,1.0
9,1000.0,0.974307,1.0


In [0]:
bbc_news_test_labels = [ target_names[i]  for i in bbc_news_test_labels]

In [16]:
from sklearn.metrics import  classification_report

bbc_news_pred_labels = gridSearch.best_estimator_.predict(bbc_news_test_vectors)
bbc_news_pred_labels = [ target_names[i]  for i in bbc_news_pred_labels]

class_report = classification_report(bbc_news_test_labels, bbc_news_pred_labels, target_names=target_names, output_dict=True)
report = pd.DataFrame(data = class_report)
report

Unnamed: 0,business,entertainment,politics,sport,tech,accuracy,macro avg,weighted avg
precision,0.973856,0.974359,0.96748,1.0,0.975207,0.979042,0.97818,0.97902
recall,0.973856,0.982759,0.952,1.0,0.983333,0.979042,0.97839,0.979042
f1-score,0.973856,0.978541,0.959677,1.0,0.979253,0.979042,0.978266,0.979013
support,153.0,116.0,125.0,154.0,120.0,0.979042,668.0,668.0


In [17]:
from sklearn.metrics import precision_score

precision_score(bbc_news_test_labels, bbc_news_pred_labels, average='micro')

0.9790419161676647

In [18]:
precision_score(bbc_news_test_labels, bbc_news_pred_labels, average='macro')

0.9781802939752595

#**WORD2VEC**

In [4]:
import gensim.downloader as api
skipGramModel = api.load('word2vec-google-news-300')



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [5]:
from gensim.utils import simple_preprocess
import numpy as np

bbc_news_train_vectors = [] 
for text in bbc_news_train:
  words = simple_preprocess(text)
  vector = np.zeros((300,))
  count = 0
  for word in words:
    try:
      vector+=skipGramModel.wv.get_vector(word)
      count+=1
    except:
      pass
  bbc_news_train_vectors.append(vector/count)

bbc_news_test_vectors = [] 
for text in bbc_news_test:
  words = simple_preprocess(text)
  vector = np.zeros((300,))
  count = 0
  for word in words:
    try:
      vector+=skipGramModel.wv.get_vector(word)
      count+=1
    except:
      pass
  bbc_news_test_vectors.append(vector/count)

  # This is added back by InteractiveShellApp.init_path()


In [6]:
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import pandas as pd

svm = SVC(kernel = 'linear',random_state = 0)
param_grid = {'C':[10**-12,10**-10,10**-6,10**(-5),10**(-4),10**(-3),10**(-2),10**(-1),1,10,100,1000,10000,10**10,10**12]}
gridSearch = GridSearchCV(svm, param_grid,scoring = 'f1_micro',cv = 5)
gridSearch.return_train_score = True
gridSearch.fit(bbc_news_train_vectors,bbc_news_train_labels)

dataframe = pd.DataFrame(gridSearch.cv_results_)
dataframe[['param_C','mean_test_score','mean_train_score']]

Unnamed: 0,param_C,mean_test_score,mean_train_score
0,1e-12,0.273662,0.274552
1,1e-10,0.273662,0.274552
2,1e-06,0.273662,0.274552
3,1e-05,0.273662,0.274552
4,0.0001,0.273662,0.274552
5,0.001,0.273662,0.274552
6,0.01,0.273662,0.274552
7,0.1,0.911363,0.913937
8,1.0,0.951185,0.963392
9,10.0,0.965327,0.991169


In [0]:
bbc_news_test_labels = [ target_names[i]  for i in bbc_news_test_labels]

In [8]:
from sklearn.metrics import  classification_report

bbc_news_pred_labels = gridSearch.best_estimator_.predict(bbc_news_test_vectors)
bbc_news_pred_labels = [ target_names[i]  for i in bbc_news_pred_labels]
class_report = classification_report(bbc_news_test_labels, bbc_news_pred_labels, target_names=target_names, output_dict=True)
report = pd.DataFrame(data = class_report)
report

Unnamed: 0,business,entertainment,politics,sport,tech,accuracy,macro avg,weighted avg
precision,0.97973,0.982456,0.929688,1.0,0.951613,0.97006,0.968697,0.970461
recall,0.947712,0.965517,0.952,1.0,0.983333,0.97006,0.969713,0.97006
f1-score,0.963455,0.973913,0.940711,1.0,0.967213,0.97006,0.969059,0.970115
support,153.0,116.0,125.0,154.0,120.0,0.97006,668.0,668.0


In [10]:
from sklearn.metrics import precision_score
#Micro F1 Score
precision_score(bbc_news_test_labels, bbc_news_pred_labels, average='micro')

0.9700598802395209

In [11]:
#Macro F1 Score
precision_score(bbc_news_test_labels, bbc_news_pred_labels, average='macro')

0.9686972546612826