In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from math import log

In [None]:
#with google colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#read data with Pandas
df=pd.read_csv("/content/drive/MyDrive/Fall'22/BBM409/deneme/English Dataset.csv")

In [None]:
df

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [None]:
from sklearn.utils import shuffle
df = shuffle(df, random_state = 45)

In [None]:
df

Unnamed: 0,ArticleId,Text,Category
374,2168,no re-draft for eu patent law a proposed euro...,tech
104,684,celebrities get to stay in jungle all four con...,entertainment
833,2188,jones files conte lawsuit marion jones has fil...,sport
339,739,us duo in first spam conviction a brother and ...,tech
1152,1911,hospital suspends no welsh plan an english h...,politics
...,...,...,...
580,1926,warning over us pensions deficit taxpayers may...,business
163,1769,curbishley delight for johansson charlton mana...,sport
607,1498,lib dems unveil election slogan the liberal de...,politics
414,1836,henman overcomes rival rusedski tim henman sav...,sport


In [None]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()
df["Category"] = le.fit_transform(df["Category"])


In [None]:
df["Category"]

374     4
104     1
833     3
339     4
1152    2
       ..
580     0
163     3
607     2
414     3
971     0
Name: Category, Length: 1490, dtype: int64

In [None]:
X = df["Text"]
y = df["Category"]

In [None]:
class NaiveBayes:
    def __init__(self):
      self.prior_prob = []
      self.num_text = []
      self.Unigram_Bow = []
      self.Bigram_Bow = []
      self.categories = []

    def train(self, X_train, y_train, stop_words=[]):
      self.X_train = X_train
      self.y_train = y_train
      self.train=pd.concat([X_train, y_train], axis=1)
      self.Unigram_Bow = self.calculate_Bow(self.train,1,stop_words)
      self.Bigram_Bow = self.calculate_Bow(self.train,2,stop_words)
      
      self.prior_prob = [0 for i in range(len(self.num_text))]                  # calculate prior probabilities of categories 
      for i in range(len(self.num_text)): 
        self.prior_prob[i] = log(self.num_text[i]/sum(self.num_text))           # category count/ all categories

    def train_TF_IDF(self, X_train, y_train,stop_words=[]):
      self.X_train = X_train
      self.y_train = y_train
      self.train=pd.concat([X_train, y_train], axis=1)
      self.Unigram_Bow = self.tf_idf(self.train,1,stop_words)
      self.Bigram_Bow = self.tf_idf(self.train,2,stop_words)
      
      self.prior_prob = [0 for i in range(len(self.num_text))]                  # calculate prior probabilities of categories 
      for i in range(len(self.num_text)):
        self.prior_prob[i] = log(self.num_text[i]/sum(self.num_text))           # category count/ all categories

    
    
    def predict(self, X_test, n_gram):
        y_pred=[]
        for x in X_test:
            y_pred.append(self.make_predict(x, n_gram))
        return y_pred

    # Creates BoW using CountVectorizer
    def calculate_Bow(self, train, n_gram,stop_words):
      self.categories = train["Category"].unique()
      self.num_text = [0 for i in range(len(self.categories))]
      bow = ["" for i in range(len(self.categories))]                           

      for category in range(len(self.categories)):                              # BoW will be an array of all strings in the category when the loop is over
        df_bow=train[(train["Category"]==category)] 
        bow[category] = df_bow["Text"]
        self.num_text[category] += len(df_bow)

      for i in range(len(bow)):
        vectorizer = CountVectorizer(lowercase = True, ngram_range= (n_gram,n_gram),stop_words=stop_words)    # create Vectorizer object
        count_matrix = vectorizer.fit_transform(bow[i])
        count_array = count_matrix.toarray()        
        bow_df = pd.DataFrame(data=count_array,columns = vectorizer.get_feature_names_out())  
        bow[i] = bow_df.sum(axis=0).to_dict()                                   # create dictionary with words
        
      self.unique_texts = {}

      for d in [bow[0], bow[1], bow[2], bow[3], bow[4]]:
        self.unique_texts.update(d)                       # we will use at log calculations

      return bow
    
    def tf_idf(self, train, n_gram,stop_words):
      self.categories = train["Category"].unique()
      self.num_text = [0 for i in range(len(self.categories))]
      bow = ["" for i in range(len(self.categories))]
      for category in range(len(self.categories)):
        df_bow=train[(train["Category"]==category)]
        bow[category] = df_bow["Text"]
        self.num_text[category] += len(df_bow)

      for i in range(len(bow)):
        # Create a Vectorizer Object
        vectorizer = TfidfVectorizer(lowercase=True,ngram_range=(n_gram,n_gram),stop_words=stop_words) # create Vectorizer object
        count_matrix = vectorizer.fit_transform(bow[i])
        count_array = count_matrix.toarray()
        bow_df = pd.DataFrame(data=count_array,columns = vectorizer.get_feature_names_out())
        bow[i] = bow_df.sum(axis=0).to_dict()          # create dictionary with words

        

      self.unique_texts = {}
      for d in [bow[0], bow[1], bow[2], bow[3], bow[4]]:    # we will use at log calculations
        self.unique_texts.update(d)
      return bow


    def make_predict(self, x, n_gram):
      category_score = [1 for i in range(len(self.categories))]
      if n_gram == 2:
        words = x.split()
        words = list(map(' '.join, zip(words[:-1], words[1:])))
        BoW = self.Bigram_Bow
      else:
        words = x.split(" ")
        BoW = self.Unigram_Bow
      for word in words:
        for i in range(len(category_score)):
          if word in BoW[i].keys():
            category_score[i] += log((BoW[i][word] + 1) /( sum(BoW[i].values()) + len(self.unique_texts.keys())))
          else:
            category_score[i] += log(1 / (sum(BoW[i].values()) + len(self.unique_texts.keys())))

      for i in range(len(category_score)):
        category_score[i] += self.prior_prob[i]
      return np.argmax(category_score)

    def get_most_presence_n_words(self,ngram,nWord):
      for i in range(len(self.Unigram_Bow)):
        print(le.inverse_transform([i])[0],":")
        if(ngram==1):
          print(sorted(self.Unigram_Bow[i].items(), key=lambda item: item[1] , reverse=True)[:nWord])
        else:
          print(sorted(self.Bigram_Bow[i].items(), key=lambda item: item[1] , reverse=True)[:nWord])

    def get_most_absence_n_words(self,ngram,nWord):
      for i in range(len(self.Unigram_Bow)):
        print(le.inverse_transform([i])[0],":")
        if(ngram==1):
          print(sorted(self.Unigram_Bow[i].items(), key=lambda item: item[1])[:nWord])
        else:
          print(sorted(self.Bigram_Bow[i].items(), key=lambda item: item[1])[:nWord])

    def get_list_of_words(self,words):
      for i in range(len(self.Unigram_Bow)):
        print(le.inverse_transform([i])[0],":")
        for key in words:
          print(key,"->",self.Unigram_Bow[i][key],"   ",end="")
        print()
      

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) 

# **Understanding Data**

*Here is some word with their occurence in all data can show as their categories.* 

We can see the occurrences of several words `"mr", "new", "year"`. If we want to make a comment, `"mr"` appears more in politics, that is, the word `"mr"` in political texts affects the classification. Since `"mr"` is used the least in sports, the word `"mr"` has little effect in sports text. For `"mr"` we can say that it is a word that has an effect on classification. However, the occurrences of the words `"new"` and `"year"` are almost the same for each category. We think that these words do not have a huge impact on categorization.

In [None]:
clf = NaiveBayes()
clf.train(X, y)
clf.get_list_of_words(["mr","new","year"])

business :
mr -> 393    new -> 273    year -> 456    
entertainment :
mr -> 151    new -> 234    year -> 315    
politics :
mr -> 1073    new -> 280    year -> 175    
sport :
mr -> 8    new -> 202    year -> 331    
tech :
mr -> 349    new -> 349    year -> 251    


# A) **Analyzing effect of the words on prediction**

**List the 10 words whose presence most strongly predicts that the article belongs to specific category for each five categories.**

In below we see 10 words whose presence most strongly predicts in Unigram for all categories.

All of them have some common words in presence most strongly predicts 10 words. These are;
`"the","to","of","in","and","for"`. However all of these are stopwords. We will estimate without stopwords next part.

In [None]:
clf = NaiveBayes()
clf.train_TF_IDF(X_train, y_train)
clf.get_most_presence_n_words(1,10)

business :
[('the', 70.95764876462664), ('to', 32.817908709006126), ('of', 28.381825584034942), ('in', 28.341438544205033), ('and', 21.49897057026608), ('said', 12.296416780507798), ('it', 11.707978624765063), ('that', 11.550008739780894), ('is', 11.539364304631231), ('for', 11.346785316842237)]
entertainment :
[('the', 57.419369042051514), ('and', 20.560072226707508), ('to', 20.442022091704242), ('in', 20.237870447845136), ('of', 19.55640512893384), ('for', 11.883231662140942), ('film', 10.201944728398576), ('was', 9.612219243218215), ('on', 9.368725763545408), ('he', 8.588226115086652)]
politics :
[('the', 75.04298296233469), ('to', 36.480312482345994), ('of', 26.479894877984233), ('and', 24.020639227563237), ('in', 20.167782173741294), ('he', 15.765589064105303), ('said', 15.05334039825951), ('mr', 13.216505626266827), ('for', 12.068284324934092), ('on', 12.0083662771412)]
sport :
[('the', 67.17049428380793), ('to', 33.000835379545556), ('in', 27.01756569922483), ('and', 26.03458002

In below we see 10 words whose presence most strongly predicts in Bigram for all categories.

All of them have some common words in absence most strongly predicts 10 words. These are;
`"of the","in the"`. However all of these are stopwords. We will estimate without stopwords next part. 

At **business** we can see `"the us","the company" `, at **entertainment** we can see `"the film"`, at politics we can see `"mr blair", "mr brown", "the government", "mr brown` in the most occurence 10 words. We know that this words related with their categories.

In [None]:
clf.get_most_presence_n_words(2,10)

business :
[('of the', 7.053731445048548), ('in the', 6.9666798859466175), ('the us', 4.333071282212917), ('for the', 3.9179350102273984), ('to the', 3.8436881223119026), ('on the', 3.7008120290487305), ('that the', 3.435768602930095), ('the company', 3.1611125893129244), ('said the', 3.0991054592390554), ('said it', 2.9581942291642505)]
entertainment :
[('of the', 6.011593587306114), ('in the', 5.814181828536041), ('at the', 3.724031523881116), ('for the', 3.2146830666348114), ('to the', 2.9649334339480546), ('will be', 2.9263098955654336), ('to be', 2.8682747685786842), ('on the', 2.866522527060242), ('the film', 2.8505923458559153), ('and the', 2.7566326619692525)]
politics :
[('of the', 6.468304102484044), ('in the', 5.392737166423687), ('to the', 4.472174297497518), ('mr blair', 4.1535517520662175), ('he said', 4.002772737377743), ('mr brown', 3.779606115821695), ('said the', 3.7641143338975933), ('the government', 3.6712174504812483), ('to be', 3.6304921911696693), ('on the', 3.4

**List the 10 words whose absence most strongly predicts that the article belongs to specific category for each five categories.**

We can see that this words so unrelated with their categories. In **sport** we clearly see these words are absence because these words corresponding year. Presumably, when we try to predict a text, it is very unlikely that the new text will include these years.

In [None]:
clf = NaiveBayes()
clf.train_TF_IDF(X_train, y_train)
clf.get_most_absence_n_words(1,10)

business :
[('accessing', 0.03171291265321466), ('adequate', 0.03171291265321466), ('boundary', 0.03171291265321466), ('bracing', 0.03171291265321466), ('brighten', 0.03171291265321466), ('calculate', 0.03171291265321466), ('carefully', 0.03171291265321466), ('chairs', 0.03171291265321466), ('checked', 0.03171291265321466), ('circuits', 0.03171291265321466)]
entertainment :
[('betting', 0.023885660921501378), ('certainty', 0.023885660921501378), ('comprised', 0.023885660921501378), ('darabont', 0.023885660921501378), ('flip', 0.023885660921501378), ('kane', 0.023885660921501378), ('outsiders', 0.023885660921501378), ('overwhelmingly', 0.023885660921501378), ('paper', 0.023885660921501378), ('points', 0.023885660921501378)]
politics :
[('20p', 0.010963905404179271), ('3rds', 0.010963905404179271), ('3x', 0.010963905404179271), ('5000', 0.010963905404179271), ('50pc', 0.010963905404179271), ('75p', 0.010963905404179271), ('80s', 0.010963905404179271), ('absorb', 0.010963905404179271), ('

-------------------------------------------------------------------------------------------

We can see that this pairs are so unrelated. Like `"000 years" `, `"12 more" `, `"and 68"`.

In [None]:
clf.get_most_absence_n_words(2,10)

business :
[('about your', 0.03449869478745337), ('accessing the', 0.03449869478745337), ('additional bracing', 0.03449869478745337), ('additional wireless', 0.03449869478745337), ('adequate ideally', 0.03449869478745337), ('advance as', 0.03449869478745337), ('advise on', 0.03449869478745337), ('allow good', 0.03449869478745337), ('allow you', 0.03449869478745337), ('allows you', 0.03449869478745337)]
entertainment :
[('1994 frank', 0.022496920764156634), ('2005 presenter', 0.022496920764156634), ('52 of', 0.022496920764156634), ('68 of', 0.022496920764156634), ('actress alongside', 0.022496920764156634), ('all expect', 0.022496920764156634), ('alongside kate', 0.022496920764156634), ('already seen', 0.022496920764156634), ('also revealed', 0.022496920764156634), ('and 68', 0.022496920764156634)]
politics :
[('00 mark', 0.01347228697352125), ('000 our', 0.01347228697352125), ('05 gives', 0.01347228697352125), ('05 is', 0.01347228697352125), ('05 the', 0.01347228697352125), ('10 for', 

# B) **Stop Words**

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
english_stopwords=list(ENGLISH_STOP_WORDS)
english_stopwords.append('said')
print(english_stopwords)

['show', 'becomes', 'four', 'noone', 'whereby', 'hereby', 'indeed', 'thence', 'still', 'own', 'anyhow', 're', 'through', 'before', 'there', 'due', 'whither', 'eight', 'hundred', 'fill', 'this', 'has', 'first', 'also', 'give', 'none', 'eg', 'empty', 'least', 'name', 'must', 'itself', 'hereafter', 'sometimes', 'cannot', 'or', 'few', 'please', 'moreover', 'further', 'we', 'eleven', 'either', 'found', 'via', 'un', 'and', 'the', 'not', 'made', 'beyond', 'those', 'when', 'rather', 'can', 'themselves', 'a', 'many', 'toward', 'etc', 'may', 'against', 'whether', 'becoming', 'both', 'each', 'elsewhere', 'other', 'hereupon', 'ten', 'are', 'became', 'whatever', 'by', 'throughout', 'whereas', 'amoungst', 'ever', 'whole', 'during', 'all', 'latterly', 'would', 'yours', 'ltd', 'how', 'three', 'together', 'no', 'more', 'whence', 'co', 'such', 'which', 'even', 'most', 'con', 'whose', 'every', 'meanwhile', 'its', 'others', 'above', 'mill', 'what', 'de', 'less', 'much', 'ie', 'system', 'for', 'same', 'thi

**List the 10 words whose presence most strongly predicts that the article belongs to specific category for each five categories.** 

**Without stop words!!!**

In below we see the most presence 10 words without stop words in Unigram for all categories.

We see business, politics and tech categories includes `"mr"` and all of them includes `"new"`. These words like stop words' behaviour. The other words related with their categories.

In [None]:
clf = NaiveBayes()
clf.train_TF_IDF(X_train, y_train,english_stopwords)
clf.get_most_presence_n_words(1,10)

business :
[('year', 7.885523855358925), ('mr', 7.524265899440198), ('sales', 6.604490735213493), ('growth', 6.482519702778017), ('economy', 6.30511773311268), ('oil', 5.857026406753185), ('bank', 5.82927728165229), ('market', 5.823054761944759), ('firm', 5.719463036419863), ('new', 5.558714949453762)]
entertainment :
[('film', 11.516880650933699), ('best', 9.209039605076942), ('year', 5.825435631126013), ('music', 5.388407158514384), ('band', 5.355127232993832), ('number', 5.276699275502582), ('awards', 4.81662088258036), ('award', 4.571490416508048), ('new', 4.460998858284408), ('actor', 4.279137954590733)]
politics :
[('mr', 15.610656703727146), ('labour', 10.314072500816552), ('election', 9.445650924996293), ('blair', 9.366086438709702), ('brown', 8.137938428556842), ('party', 8.012039320929732), ('government', 7.3125045282339185), ('people', 6.2103314542569334), ('tax', 6.136255378599537), ('howard', 5.798747874037895)]
sport :
[('england', 9.310548900628532), ('game', 7.499451211

In below we see the most presence 10 words without stop words in Bigram for all categories.

In here the words so correlated with their categories. (We think the special name `mr brown` or `mr ebbers` may be important name in their categories.)

In [None]:
clf.get_most_presence_n_words(2,10)

business :
[('chief executive', 2.239593040310824), ('economic growth', 1.8588226628662257), ('mr ebbers', 1.6257621463849823), ('deutsche boerse', 1.5743386640953765), ('oil prices', 1.5742343878153524), ('new york', 1.4914961773957542), ('mr glazer', 1.4502461528286679), ('stock market', 1.3808746765179782), ('fourth quarter', 1.2742919214848885), ('sri lanka', 1.1686755446278534)]
entertainment :
[('box office', 2.448913141675062), ('new york', 1.7008386102244137), ('year old', 1.5921039437910196), ('won best', 1.509565482699943), ('named best', 1.5004966885625757), ('vera drake', 1.4501183673485736), ('los angeles', 1.4159331142791591), ('million dollar', 1.400896114129235), ('dollar baby', 1.362335039819239), ('best actress', 1.3551563790385242)]
politics :
[('mr blair', 5.26336884540814), ('mr brown', 4.755965642136332), ('prime minister', 4.138005799647052), ('mr howard', 2.970734057762314), ('general election', 2.8292065477700152), ('tony blair', 2.734082412586757), ('kilroy si

-------------------------------------------------------------------------------------------

**List the 10 words whose absence most strongly predicts that the article belongs to specific category for each five categories.**

**Without stop words!!!**

We can see that this words so unrelated with their categories. We can give an example same as with stop words part. 

If we want to give another unrelated some words , these are the numbers such as `"118"`, `"925p"`, `"50pc"`. 

In [None]:
clf = NaiveBayes()
clf.train_TF_IDF(X_train, y_train,english_stopwords)
clf.get_most_absence_n_words(1,10)

business :
[('118', 0.039385010089065996), ('44bn', 0.039385010089065996), ('925p', 0.039385010089065996), ('accretive', 0.039385010089065996), ('branch', 0.039385010089065996), ('caught', 0.039385010089065996), ('comfortable', 0.039385010089065996), ('conservatively', 0.039385010089065996), ('crippled', 0.039385010089065996), ('davies', 0.039385010089065996)]
entertainment :
[('betting', 0.025140694296576355), ('certainty', 0.025140694296576355), ('comprised', 0.025140694296576355), ('darabont', 0.025140694296576355), ('flip', 0.025140694296576355), ('kane', 0.025140694296576355), ('outsiders', 0.025140694296576355), ('overwhelmingly', 0.025140694296576355), ('paper', 0.025140694296576355), ('points', 0.025140694296576355)]
politics :
[('20p', 0.013798751654267033), ('3rds', 0.013798751654267033), ('3x', 0.013798751654267033), ('5000', 0.013798751654267033), ('50pc', 0.013798751654267033), ('75p', 0.013798751654267033), ('80s', 0.013798751654267033), ('absorb', 0.013798751654267033), 

Non-stop words absence most of pairs included numbers in all categories.

In [None]:
clf.get_most_absence_n_words(2,10)

business :
[('100 decline', 0.040262963262219044), ('100 kfb', 0.040262963262219044), ('118 million', 0.040262963262219044), ('16 future', 0.040262963262219044), ('1997 asia', 0.040262963262219044), ('1999 kfb', 0.040262963262219044), ('2006 standard', 0.040262963262219044), ('22 group', 0.040262963262219044), ('28 pence', 0.040262963262219044), ('40 economy', 0.040262963262219044)]
entertainment :
[('1994 frank', 0.024006727991581312), ('2005 presenter', 0.024006727991581312), ('52 online', 0.024006727991581312), ('68 text', 0.024006727991581312), ('ability flip', 0.024006727991581312), ('according uk', 0.024006727991581312), ('actor actress', 0.024006727991581312), ('actress alongside', 0.024006727991581312), ('actress ray', 0.024006727991581312), ('alongside kate', 0.024006727991581312)]
politics :
[('00 mark', 0.017935720871240974), ('000 losses', 0.017935720871240974), ('000 minimum', 0.017935720871240974), ('000 raising', 0.017935720871240974), ('000 unions', 0.017935720871240974

**Analyzing effect of the stopwords: Why might it make sense to remove stop words when interpreting the model? Why might it make sense to keep stop words?**

In unigram remove stop-words is so effective. Because most of text include stop-words for predict we mostly need unique words. 

However in bigram it can effect bad. Because if we want to give text with stop-word in bigram, the model can't recognize some pairs like "won best" because this pair give in model like -> "won the" and "the best".

# **Accuracy**

*With stop words*

In [None]:
clf = NaiveBayes()
clf.train(X_train, y_train)
y_pred = clf.predict(X_test, 1)
accuracy = accuracy_score(y_test, y_pred)
print("Unigram accuracy:",accuracy)

Unigram accuracy: 0.9194630872483222


In [None]:
y_pred = clf.predict(X_test, 2)
accuracy = accuracy_score(y_test, y_pred)
print("Bigram accuracy:",accuracy)

Bigram accuracy: 0.9463087248322147


*With stop words TF-IDF*

In [None]:
clf = NaiveBayes()
clf.train_TF_IDF(X_train, y_train)
y_pred = clf.predict(X_test, 1)
accuracy = accuracy_score(y_test, y_pred)
print("TF-IDF unigram accuracy",accuracy)

TF-IDF unigram accuracy 0.8322147651006712


In [None]:
y_pred = clf.predict(X_test, 2)
accuracy = accuracy_score(y_test, y_pred)
print("TF-IDF bigram accuracy",accuracy)

TF-IDF bigram accuracy 0.8993288590604027


*No stop words*

In [None]:
clf = NaiveBayes()
clf.train(X_train, y_train,english_stopwords)
y_pred = clf.predict(X_test, 1)
accuracy = accuracy_score(y_test, y_pred)
print("Non-stopword unigram accuracy:",accuracy)

Non-stopword unigram accuracy: 0.959731543624161


In [None]:
y_pred = clf.predict(X_test, 2)
accuracy = accuracy_score(y_test, y_pred)
print("Non-stopword bigram accuracy:",accuracy)

Non-stopword bigram accuracy: 0.7516778523489933


*No stop words TF-IDF*

In [None]:
clf = NaiveBayes()
clf.train_TF_IDF(X_train, y_train,english_stopwords)
y_pred = clf.predict(X_test, 1)
accuracy = accuracy_score(y_test, y_pred)
print("TF-IDF unigram accuracy",accuracy)

TF-IDF unigram accuracy 0.9463087248322147


In [None]:
y_pred = clf.predict(X_test, 2)
accuracy = accuracy_score(y_test, y_pred)
print("TF-IDF bigram accuracy",accuracy)

TF-IDF bigram accuracy 0.9530201342281879


**With Stop Words**

Unigram accuracy: 0.9194630872483222

Bigram accuracy: 0.9463087248322147

**With Stop Words and With tf-idf**

TF-IDF unigram accuracy 0.8322147651006712

TF-IDF bigram accuracy 0.8993288590604027

**Without Stop Words**

Non-stopword unigram accuracy: 0.959731543624161

Non-stopword bigram accuracy: 0.7516778523489933

**Without Stop Words and With tf-idf**

TF-IDF unigram accuracy 0.9463087248322147

TF-IDF bigram accuracy 0.9530201342281879


**1) The least accuracy :** 

***Non-stopword bigram accuracy: 0.7516778523489933***

We see that the least accuracy in without stopword bigram, if we try to examine it we see that;


entertainment :
('won best', 37), ('named best', 36)

politics :
('told bbc', 86)

sport :
('told bbc', 50), ('australian open', 48)

tech :
('told bbc', 54) ('news website', 47)

These are unrelated pairs.

Another reason is that we are not removing stopwords from the test data. This will be resolved if we remove the stopwords from the test data and send them.

**2) The best accuracy :**

***Non-stopword unigram accuracy: 0.959731543624161 and non-stopword TF-IDF bigram accuracy 0.9530201342281879***

Without stopwords we can reach most related words in unigram so the accuracy is highest. 


For non-stopword TF-IDF bigram accuracy;

**business :**

('chief executive'), ('economic growth'), ('mr ebbers'), ('deutsche boerse'), ('oil prices'), ('new york'), ('mr glazer'), ('stock market'), ('fourth quarter'), ('sri lanka')

**entertainment :**

('box office'), ('new york), ('vera drake'), ('los angeles'), ('million dollar'), ('dollar baby'), ('best actress')

**politics :**

('mr blair'), ('mr brown'), ('prime minister'), ('mr howard'), ('general election'), ('tony blair'), ('kilroy silk'), ('mr kennedy'), ('lib dems')

**sport :**

('champions league'), ('davis cup'), ('grand slam'), ('new zealand'), ('world cup'), ('cross country'), ('manchester united)

**tech :**

('mobile phone'), ('mobile phones'), ('anti virus'), ('high definition'), ('bbc news'), ('ask jeeves'), ('consumer electronics'), ('wi fi')


These words are very related and their ratios are usually close to each other when compared to the size of the ratios in the with stop word tf-idf. And they don't need any stopword in between pairs so we don't need to drop the stopwords in test texts.

***3) Why doesn't TF-IDF improve accuracy?***

We think that it cause of the tf-idf ratio. In this example ;

business :

[('the', 70.95764876462664), ('to', 32.817908709006126), ('of', 28.381825584034942), ('in', 28.341438544205033), ('and', 21.49897057026608), ('said', 12.296416780507798), ('it', 11.707978624765063), ('that', 11.550008739780894), ('is', 11.539364304631231), ('for', 11.346785316842237)]

"the" has very large ratio when we compare with "for". Then the more related words can't effective in predict. However without stopwords unigram the ratio more close each other;

business :

[('year', 7.885523855358925), ('mr', 7.524265899440198), ('sales', 6.604490735213493), ('growth', 6.482519702778017), ('economy', 6.30511773311268), ('oil', 5.857026406753185), ('bank', 5.82927728165229), ('market', 5.823054761944759), ('firm', 5.719463036419863), ('new', 5.558714949453762)]








[0]