<a href="https://colab.research.google.com/github/antonismaitis/textMining/blob/master/Assignment1_Part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import glob
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn import model_selection,metrics
from sklearn.naive_bayes import MultinomialNB,BernoulliNB,GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.pipeline import make_pipeline 
from nltk.stem import PorterStemmer,WordNetLemmatizer



# Instantiate NLTK's Porter stemmer and WordNet lemmatizer.
# Neither object is referenced by the preprocessing shown in this notebook.
ps, lemmatizer = PorterStemmer(), WordNetLemmatizer()


# NLTK stopwords.
# The corpus reader is imported under an alias because this cell rebinds the
# name `stopwords` to a plain set. Going through the alias keeps the cell
# re-runnable: a second run would otherwise call .words() on that set.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words("english"))

# Corpus-specific stopwords: the list was manually split to one word per line.
# The `with` block guarantees the file handle is closed.
with open('SentenceCorpus/word_lists/stopwords.txt', "r") as stopwordsstring:
    # Strip the line terminator from every line, including a final line that
    # has no trailing newline (previously such a line was silently dropped).
    stopwords1 = [word.rstrip('\n') for word in stopwordsstring]

# Drop blank entries however many there are, instead of assuming exactly two
# (list.remove raises ValueError when the value is absent).
stopwords1 = [word for word in stopwords1 if word]

# Union of the given stopwords with the NLTK stopwords.
fstopwords = stopwords.union(stopwords1)

#print(fstopwords)


# Collect every labeled article (*.txt) in the corpus directory.
# One glob call is sufficient: the pattern has no placeholder, so re-running
# it once per directory entry only rebuilt the same list over and over.
# The paths are sorted so the order of sentences -- and therefore the
# random_state-seeded train/test split below -- does not depend on the
# order in which the filesystem happens to list the files.
textfiles = sorted(glob.glob('SentenceCorpus/labeled_articles/*.txt'))

# Fail early with a clear message instead of failing later on an empty dataset.
if not textfiles:
    raise FileNotFoundError(
        "No labeled articles (*.txt) found in SentenceCorpus/labeled_articles"
    )

# Parallel lists: labels[k] is the category of sentences[k].
labels = []
sentences = []
filtered = []  # not used in the cells shown here; kept so the name stays defined

for file in textfiles:
    # The context manager closes each article once it has been read.
    with open(file, "r") as g:
        for text in g:
            # Lines starting with '#' are skipped, as are blank lines (a blank
            # line would otherwise be recorded with the bogus label "\n").
            if text.startswith('#') or not text.strip():
                continue
            if "\t" in text:
                # Category and sentence are separated by a TAB.
                fields = text.split("\t")
                label, sentence = fields[0], fields[1]
            else:
                # Category and sentence are separated by the first space.
                label, _, sentence = text.partition(" ")
            labels.append(label)
            sentences.append(sentence.lower())

#Stemming and lemmatization decreased the accuracy scores, while removing stopwords increased them by about 10%
            
def _strip_stopwords(sentence):
    """Return `sentence` without stopword tokens, joined by single spaces."""
    # One sentence per line: drop the trailing newline if there is one.
    if sentence.endswith("\n"):
        sentence = sentence[:-1]
    kept_tokens = [token for token in word_tokenize(sentence) if token not in fstopwords]
    return re.sub(' +', ' ', " ".join(kept_tokens))


# Clean every sentence, updating the existing list in place.
sentences[:] = [_strip_stopwords(sentence) for sentence in sentences]

# Stratified split with a fixed seed; the test size is left at the library default.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    sentences,
    labels,
    random_state=22,
    stratify=labels,
)




In [0]:
# Author's tuning note: larger alpha gave lower scores, smaller alpha gave higher scores.

# MultinomialNB on CountVectorizer features (bigrams only, binary=True).

# Smoothing parameter for Laplace/Lidstone smoothing.
alpha = 0.1

bigram_counts = CountVectorizer(ngram_range=(2, 2), binary=True)
model = make_pipeline(bigram_counts, MultinomialNB(alpha=alpha))

# Fit on the training split, then predict labels for the test split.
y_predicted = model.fit(x_train, y_train).predict(x_test)

# Macro-averaged scores on the test split.
recall = metrics.recall_score(y_test, y_predicted, average='macro')
precision = metrics.precision_score(y_test, y_predicted, average='macro')
f1 = metrics.f1_score(y_test, y_predicted, average='macro')

print("Recall: %f" % recall)
print("Precision: %f" % precision)
print("F1: %f" % f1)

In [0]:
# MultinomialNB on TfidfVectorizer features (trigrams only, binary=True).

# Smoothing parameter passed to the classifier.
alpha = 0.1

trigram_tfidf = TfidfVectorizer(ngram_range=(3, 3), binary=True)
model = make_pipeline(trigram_tfidf, MultinomialNB(alpha=alpha))

# Fit on the training split, then predict labels for the test split.
y_predicted = model.fit(x_train, y_train).predict(x_test)

# Macro-averaged scores on the test split.
recall = metrics.recall_score(y_test, y_predicted, average='macro')
precision = metrics.precision_score(y_test, y_predicted, average='macro')
f1 = metrics.f1_score(y_test, y_predicted, average='macro')

print("Recall: %f" % recall)
print("Precision: %f" % precision)
print("F1: %f" % f1)



In [0]:
# BernoulliNB on TfidfVectorizer features (unigrams only, binary=False).
# NOTE(review): BernoulliNB presumably binarizes its input by default, which
# would reduce the tf-idf weights to presence/absence -- confirm in the docs.

# Smoothing parameter passed to the classifier.
alpha = 0.1

unigram_tfidf = TfidfVectorizer(ngram_range=(1, 1), binary=False)
model = make_pipeline(unigram_tfidf, BernoulliNB(alpha=alpha))

# Fit on the training split, then predict labels for the test split.
y_predicted = model.fit(x_train, y_train).predict(x_test)

# Macro-averaged scores on the test split.
recall = metrics.recall_score(y_test, y_predicted, average='macro')
precision = metrics.precision_score(y_test, y_predicted, average='macro')
f1 = metrics.f1_score(y_test, y_predicted, average='macro')

print("Recall: %f" % recall)
print("Precision: %f" % precision)
print("F1: %f" % f1)

In [0]:
# BernoulliNB on CountVectorizer features (unigrams only, binary=False).

# Smoothing parameter passed to the classifier.
alpha = 0.1

unigram_counts = CountVectorizer(ngram_range=(1, 1), binary=False)
model = make_pipeline(unigram_counts, BernoulliNB(alpha=alpha))

# Fit on the training split, then predict labels for the test split.
y_predicted = model.fit(x_train, y_train).predict(x_test)

# Macro-averaged scores on the test split.
recall = metrics.recall_score(y_test, y_predicted, average='macro')
precision = metrics.precision_score(y_test, y_predicted, average='macro')
f1 = metrics.f1_score(y_test, y_predicted, average='macro')

print("Recall: %f" % recall)
print("Precision: %f" % precision)
print("F1: %f" % f1)