In [56]:
import pickle
import re
import string

import nltk
import pandas as pd
import sklearn
from sklearn.base import clone
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


In [57]:
# Load the shuffled 'train' and 'test' subsets of the 20 newsgroups corpus.
training_data, testing_data = (
    fetch_20newsgroups(subset=split_name, shuffle=True)
    for split_name in ('train', 'test')
)

In [58]:
# Wrap each subset in a two-column frame: the raw document text and its
# integer target label.
training, testing = (
    pd.DataFrame({'Text': bunch.data, 'Label': bunch.target})
    for bunch in (training_data, testing_data)
)

In [59]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of rebuilding a regex on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise a raw document into a single lower-case line.

    Steps, in order:
      1. delete every character in ``string.punctuation`` (no replacement, so
         ``a@b.edu`` becomes ``abedu``);
      2. lower-case the result;
      3. replace each newline with a single space.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text. An empty string is returned unchanged.
    """
    without_punctuation = text.translate(_PUNCTUATION_TABLE)
    # Replacing '\n' with ' ' is equivalent to splitting on '\n' and joining
    # the pieces with ' ', without building an intermediate list.
    return without_punctuation.lower().replace('\n', ' ')

In [60]:
# Store a cleaned copy of every document alongside the raw text in both frames.
for frame in (training, testing):
    frame['cleaned_text'] = frame['Text'].apply(clean_text)


In [61]:
# Preview the first rows to compare the raw 'Text' with its 'cleaned_text'.
training.head()

Unnamed: 0,Text,Label,cleaned_text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,from lerxstwamumdedu wheres my thing subject w...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,from guykuocarsonuwashingtonedu guy kuo subjec...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,from twillisececnpurdueedu thomas e willis sub...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,from jgreenamber joe green subject re weitek p...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,from jcmheadcfaharvardedu jonathan mcdowell su...


In [62]:
# Keep only the columns the models consume: the cleaned text and its label.
model_columns = ['cleaned_text', 'Label']
training = training[model_columns]
testing = testing[model_columns]

In [63]:
# Separate each frame into its feature column and its target column.
x_train = training['cleaned_text']
y_train = training['Label']
x_test = testing['cleaned_text']
y_test = testing['Label']

In [64]:
# Both vectorizers use identical settings: English stop words are dropped and
# unigrams plus bigrams are extracted.
vectorizer_settings = {'stop_words': 'english', 'ngram_range': (1, 2)}

count = CountVectorizer(**vectorizer_settings)
tfidf = TfidfVectorizer(**vectorizer_settings)

# Classifiers are left at their library defaults.
logistic = LogisticRegression()
mnb = MultinomialNB()


In [65]:
# pickle.dump(logistic_count_pipe,open('logistic_count.pickle','wb'))
# # joblib.dump(logistic_count_pipe,'logistic_count.sav')

In [66]:
def accuracy_scoring(model, x=None, y=None):
    """Return the accuracy of a fitted model on a labelled evaluation set.

    Parameters
    ----------
    model : object
        A fitted estimator or pipeline exposing ``predict``.
    x : optional
        Inputs to predict on. Defaults to the notebook-level ``x_test``.
    y : optional
        True labels for ``x``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    The value returned by ``accuracy_score`` for the predictions on ``x``.
    """
    features = x_test if x is None else x
    true_labels = y_test if y is None else y
    predicted_labels = model.predict(features)
    # accuracy_score is documented as (y_true, y_pred); pass them in that order.
    return accuracy_score(true_labels, predicted_labels)
    


In [67]:
# Pipeline.fit fits its steps in place, so pipelines that share a vectorizer or
# classifier instance would overwrite each other's fitted state (e.g. fitting
# the TF-IDF logistic pipeline would refit the classifier inside the count
# logistic pipeline). clone() gives every pipeline its own unfitted copy with
# the same parameters as the template estimator.
logistic_count_pipe = Pipeline([('vectorizer', clone(count)), ('classifier', clone(logistic))])
bayes_count_pipe = Pipeline([('vectorizer', clone(count)), ('classifier', clone(mnb))])
logistic_tfidf_pipe = Pipeline([('vectorizer', clone(tfidf)), ('classifier', clone(logistic))])
bayes_tfidf_pipe = Pipeline([('vectorizer', clone(tfidf)), ('classifier', clone(mnb))])

In [76]:
# Names double as the pickle file stem and as the key in accuracy_scores.
model_names = ['logistic(1,2)_with_countVect','bayes(1,2)_with_countVect', 'logistic(1,2)_with_TFIDF','bayes(1,2)_with_TFIDF']
models = [logistic_count_pipe,bayes_count_pipe,logistic_tfidf_pipe,bayes_tfidf_pipe]
accuracy_scores = {}

# Fit each pipeline, persist it to '<name>.pickle' in the working directory,
# and record its accuracy while it is freshly fitted.
for name, model in zip(model_names, models):
    model.fit(x_train, y_train)
    # The context manager guarantees the file is flushed and closed even if
    # pickling raises; a bare open() passed to pickle.dump leaks the handle.
    with open(f'{name}.pickle', 'wb') as pickle_file:
        pickle.dump(model, pickle_file)
    accuracy_scores[name] = accuracy_scoring(model)
    # NOTE: deleting the loop variable would not release memory here, because
    # every pipeline is still referenced by the `models` list.

    
    
    



In [77]:
# Show the accuracy recorded for each fitted pipeline, keyed by model name.
accuracy_scores

{'logistic(1,2)_with_countVect': 0.8244822092405736,
 'bayes(1,2)_with_countVect': 0.8323154540626659,
 'logistic(1,2)_with_TFIDF': 0.8347052575677111,
 'bayes(1,2)_with_TFIDF': 0.830323951141795}

In [70]:
# # pickle.dump(bayes_count_pipe,open('bayes_count.pickle','wb'))
# joblib.dump(bayes_count_pipe,'bayes_count.sav')

In [71]:
# # pickle.dump(logistic_tfidf_pipe,open('logistic_tfidf.pickle','wb'))
# joblib.dump(logistic_tfidf_pipe,'logisitic_tfidf.sav')

In [72]:
# # pickle.dump(bayes_tfidf_pipe,open('bayes_tfidf.pickle','wb'))
# joblib.dump(bayes_tfidf_pipe,'bayes_tfidf.sav')

In [73]:
# def compare_classifiers(models,model_names,x_test=x_test,y_test=y_test):
    
#     accuracies = []
#     precisions = []
#     recalls  = []
#     for mods in models:
#         preds = mods.predict(x_test)
#         accuracies.append(accuracy_score(preds,y_test))
#         precisions.append (precision_score(preds, y_test))
#         recalls.append(recall_score(preds,y_test))
#     return pd.DataFrame([accuracies,precisions,recalls],
#                         columns = ['accuracy','precision','recall'],
#                         index=model_names)
        
# compare_classifiers(models,model_names)

In [74]:
# stopwords? punctuation? abbreviations?

In [75]:
# # count = CountVectorizer(stop_words='english')
# # tfidf = TfidfVectorizer(stop_words='english')
# # logistic = LogisticRegression()
# # mnb = MultinomialNB()


# ngrams_list =[(1,1)]
# def vect_test(vectorizer,
#               classifier,
#               ngrams_list,
#               x_train=x_train,
#               y_train=y_train,
#               x_test=x_test,
#               y_test=y_test):
#     scores = {}
#     for n in ngrams_list:
#         vect = vectorizer.set_params(stop_words='english',ngram_range=n)
#         pipe = Pipeline([('vectorizer',vect),('classifier',classifier)])
#         mod = pipe.fit(x_train,y_train)
#         preds = mod.predict(x_test)
#         score = accuracy_score(preds,y_test)
#         scores[f'vectorizer{vectorizer}:ngrams{n}']=score
#     return scores
        
# vect_test(CountVectorizer(),LogisticRegression(),ngrams_list)
        