In [49]:
import pandas as pd
import math
import numpy as np
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import WhitespaceTokenizer
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

In [2]:
data = pd.read_csv("BBC News Train.csv")
test_data = pd.read_csv("BBC News Test.csv")
ground_truth = pd.read_csv("BBC News Sample Solution.csv")

In [3]:
data

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [4]:
def pre_process(data):
    corpus = []
    for i in range(len(data)):
        text = data.iloc[i]["Text"]
        # Lowercasing text
        text = text.lower()
        
        # Tokenizing text
        token = WhitespaceTokenizer()
        tokenized_text = token.tokenize(text)
        
        lemmatizer = WordNetLemmatizer()
        #removing stop words (and punctuation)
        stop_words = stopwords.words('english')
        puncs = string.punctuation
        for word in puncs:
            stop_words.append(word)
        
        no_stop_text = []
        for word in tokenized_text:
            if(word not in stop_words):
                # appending the lemmatized non-stop word
                no_stop_text.append(lemmatizer.lemmatize(word))
        
        corpus.append(no_stop_text)
    return corpus

In [5]:
def split_data(X,Y , train_split_size = 70):
    length = len(X)
    
    # Shuffling both arrays in same permutation
    arr1 = np.array(X , dtype = object)
    arr2 = np.array(Y)
    shuffler = np.random.permutation(len(X))
    X = arr1[shuffler]
    Y = arr2[shuffler]
    
    # Calculating split size
    train_size = int(length * train_split_size / 100)
    
    # Splitting dataset wrt asked size
    X_train = X[:train_size]
    Y_train = Y[:train_size]
    
    X_test = X[train_size:]
    Y_test = Y[train_size:]
    
    return X_train , Y_train , X_test , Y_test

In [6]:
# Pre-processing data and seperating labels

del data["ArticleId"]
labels = data["Category"]
pre_processed_data = pre_process(data)

In [7]:
X_train , Y_train , X_test , Y_test = split_data(pre_processed_data , labels , 70)

In [8]:
len(X_train) , len(Y_train) , len(X_test) , len(Y_test)

(1043, 1043, 447, 447)

In [9]:
# We will create the vocabulary of the corpus, that is list of all the unique words appearing in the sentences

def make_vocabulary(X_train):
    vocabulary = []
    dic = {}

    for sent in X_train:
        for word in sent:
            if word not in dic.keys():
                vocabulary.append(word)
                dic[word] = 1
    return vocabulary

In [10]:
vocabulary = make_vocabulary(pre_processed_data)

In [11]:
len(vocabulary)

33051

In [12]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
int_labels = encoder.fit_transform(labels)

In [14]:
# We make seperate vocabs for 5 different classes

datas = []
vocabs = []
dict_vocabs = []
for i in range(5):
    datas.append([])
for i in range(len(pre_processed_data)):
    datas[int_labels[i]].append(pre_processed_data[i])
for data in datas:
    vocabs.append(make_vocabulary(data))
    
for vocab in vocabs:
    dic = {}
    for word in vocab:
        dic[word] = 500
    dict_vocabs.append(dic)

In [37]:
def ICF_calc(vocabulary , dict_vocabs):
    icf_dict = {}
    for word in vocabulary:
        count = 0
        for dicti in dict_vocabs:
            if word in list(dicti.keys()):
                count += 1
        icf_dict[word] = math.log10(5/count)
    return icf_dict

In [38]:
icf_ = ICF_calc(vocabulary , dict_vocabs)

In [40]:
tf_ = []
for i in range(5):
    data = datas[i]
    dictionary = {}
    for word in vocabulary:
        count = 0
        
        # calculate count
        for sent in data:
            for token in sent:
                if(token == word):
                    count += 1
                    
        # Update Term frequency in this class
        dictionary[word] = count
    tf_.append(dictionary)

In [42]:
final_vectors = []
for i in range(len(pre_processed_data)):
    lab = int_labels[i]
    sentence = pre_processed_data[i]
    
    # Now I will make make a dictionary out of this sentence
    dict_temp = {}
    for word in sentence:
        dict_temp[word] = 1
    
    # Now we have to make a vector of size of length of vocabulary
    vec = [0]*len(vocabulary)
    
    for j in range(len(vocabulary)):
        word = vocabulary[j]
        if (word not in list(dict_temp.keys())):
            continue
        vec[j] = tf_[lab][word] * icf_[word]
    final_vectors.append(vec)

In [45]:
len(final_vectors[10])

33051

In [48]:
X_train ,  X_test , Y_train , Y_test = train_test_split(final_vectors,labels)

In [50]:
gnb = GaussianNB()
gnb.fit(X_train , Y_train)
Y_pred_ = gnb.predict(X_test)
accuracy_ = accuracy_score(Y_test , Y_pred_)

In [51]:
accuracy_

1.0

In [52]:
accuracies_tficf = []
test_sizes = [0.1,0.2,0.3,0.4,0.5]
for test_size in test_sizes:

    X_train ,  X_test , Y_train , Y_test = train_test_split(final_vectors,labels,test_size = test_size)

    # Training a Naive Bayes Algorithm
    gnb = GaussianNB()
    gnb.fit(X_train , Y_train)
    Y_pred_NB = gnb.predict(X_test)
    accuracy_nb = accuracy_score(Y_test , Y_pred_NB)
    accuracies_tficf.append(accuracy_nb)
accuracies_tficf

[1.0, 1.0, 0.9977628635346756, 0.9949664429530202, 0.9946308724832215]

In [None]:
Y_train = Y_train.tolist()

Now we will calculate the probability of each category based on the frequency of documents
in the training set that belong to that category.

In [58]:
len(pre_processed_data)

1490

In [59]:
freq = [0,0,0,0,0]
encoder = LabelEncoder()
int_lab = encoder.fit_transform(Y_train)
for ele in int_lab:
    freq[ele] += 1

In [61]:
prob = []
for fre in freq:
    prob.append(fre/len(Y_train))

In [62]:
prob

[0.22013422818791947,
 0.15570469798657718,
 0.1906040268456376,
 0.2442953020134228,
 0.18926174496644296]

Now we will calculate the probability of each feature given each category based on the TF-ICF values of that feature in documents belonging to that category.

In [63]:
tf_icf_overall = []
for word in vocabulary:
    val = 0
    for tf in tf_:
        val += tf[word]
    val *= icf_[word]
    tf_icf_overall.append(val)

In [66]:
prob_tficf = []
summ = sum(tf_icf_overall)
for val in tf_icf_overall:
    prob_tficf.append(val/summ)

In [68]:
prob_tficf

[0.0006420356934675213,
 2.6205538508878417e-05,
 0.0,
 0.0,
 0.0,
 4.178313692044098e-05,
 0.0,
 0.0,
 0.00010482215403551367,
 0.000563419077940886,
 4.3599795047416676e-05,
 0.00034517523710439463,
 0.0,
 0.0,
 0.0,
 1.3102769254439208e-05,
 0.0,
 5.2411077017756834e-05,
 1.4919427381414901e-05,
 2.906653003161112e-05,
 1.3102769254439208e-05,
 0.0,
 0.0002834691202468831,
 1.6634951185753958e-05,
 0.0,
 2.2379141072122354e-05,
 0.0,
 0.0,
 0.00037298568453537255,
 0.0,
 0.0,
 0.0,
 0.0,
 9.149223152164677e-05,
 0.0,
 0.0,
 1.9983239396732643e-05,
 0.00013102769254439206,
 1.3102769254439208e-05,
 3.088318815858681e-05,
 0.0,
 4.3599795047416676e-05,
 0.0,
 2.2379141072122354e-05,
 0.0,
 9.809953885668753e-05,
 1.3102769254439208e-05,
 0.0,
 3.269984628556251e-05,
 1.4919427381414901e-05,
 0.0,
 2.5433213777659727e-05,
 0.0,
 6.551384627219603e-05,
 0.0,
 0.0,
 5.813306006322224e-05,
 0.0,
 0.0002453655299898709,
 3.7298568453537254e-05,
 0.0002815820096812327,
 0.0,
 4.158737796438

Now we will try different methods to improve our classifier

In [106]:
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [103]:
corpus = []
for sent in pre_processed_data:
    combined = ""
    for word in sent:
        combined += word
        combined += " "
    corpus.append(combined)
corpus

 'german business confidence slide german business confidence fell february knocking hope speedy recovery europe largest economy. munich-based research institute ifo said confidence index fell 95.5 february 97.5 january first decline three months. study found outlook manufacturing retail sector worsened. observer hoping confident business sector would signal economic activity picking up. surprised ifo index taken knock said dz bank economist bernd weidensteiner. main reason probably domestic economy still weak particularly retail trade. economy labour minister wolfgang clement called dip february ifo confidence figure mild decline said despite retreat index remained relatively high level expected modest economic upswing continue. germany economy grew 1.6% last year shrinking 2003. however economy contracted 0.2% last three month 2004 mainly due reluctance consumer spend. latest indication growth still proving elusive ifo president hans-werner sinn said improvement german domestic deman

In [111]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus)
y = data['Category']
X = X.todense()

accuracies_tfidf = []
test_sizes = [0.1,0.2,0.3,0.4,0.5]
for test_size in test_sizes:

    X_train ,  X_test , Y_train , Y_test = train_test_split(X,y,test_size = test_size)

    # Training a Naive Bayes Algorithm
    gnb = GaussianNB()
    gnb.fit(X_train , Y_train)
    Y_pred_NB = gnb.predict(X_test)
    accuracy_nb = accuracy_score(Y_test , Y_pred_NB)
    accuracies_tfidf.append(accuracy_nb)
accuracies_tfidf

[0.9261744966442953,
 0.9060402684563759,
 0.901565995525727,
 0.9161073825503355,
 0.9006711409395973]

In [112]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
y = data['Category']
X = X.todense()

accuracies_ngrams = []
test_sizes = [0.1,0.2,0.3,0.4,0.5]
for test_size in test_sizes:
    X_train ,  X_test , Y_train , Y_test = train_test_split(X,y,test_size=test_size)

    # Training a Naive Bayes Algorithm
    gnb = GaussianNB()
    gnb.fit(X_train , Y_train)
    Y_pred_NB = gnb.predict(X_test)
    accuracy_n = accuracy_score(Y_test , Y_pred_NB)
    accuracies_ngrams.append(accuracy_n)
accuracies_ngrams

[0.9194630872483222,
 0.9530201342281879,
 0.9194630872483222,
 0.9295302013422819,
 0.9181208053691275]

In [99]:
pre_processed_data

[['worldcom',
  'ex-boss',
  'launch',
  'defence',
  'lawyer',
  'defending',
  'former',
  'worldcom',
  'chief',
  'bernie',
  'ebbers',
  'battery',
  'fraud',
  'charge',
  'called',
  'company',
  'whistleblower',
  'first',
  'witness.',
  'cynthia',
  'cooper',
  'worldcom',
  'ex-head',
  'internal',
  'accounting',
  'alerted',
  'director',
  'irregular',
  'accounting',
  'practice',
  'u',
  'telecom',
  'giant',
  '2002.',
  'led',
  'collapse',
  'firm',
  'following',
  'discovery',
  '$11bn',
  '(£5.7bn)',
  'accounting',
  'fraud.',
  'mr',
  'ebbers',
  'pleaded',
  'guilty',
  'charge',
  'fraud',
  'conspiracy.',
  'prosecution',
  'lawyer',
  'argued',
  'mr',
  'ebbers',
  'orchestrated',
  'series',
  'accounting',
  'trick',
  'worldcom',
  'ordering',
  'employee',
  'hide',
  'expense',
  'inflate',
  'revenue',
  'meet',
  'wall',
  'street',
  'earnings',
  'estimates.',
  'm',
  'cooper',
  'run',
  'consulting',
  'business',
  'told',
  'jury',
  'new',


In [69]:
X[0]

<1x24608 sparse matrix of type '<class 'numpy.float64'>'
	with 137 stored elements in Compressed Sparse Row format>

In [70]:
data.describe()

Unnamed: 0,Text,Category
count,1490,1490
unique,1440,5
top,ray dvd beats box office takings oscar-nominat...,sport
freq,2,346


In [71]:
data["Category"].nunique()

5