# Text Classification

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources this notebook relies on (a no-op when already cached).
nltk.download('stopwords')
nltk.download('wordnet')
# nltk.word_tokenize (used by customtokenize) needs the Punkt tokenizer models;
# without them a fresh environment fails at the first tokenization.
nltk.download('punkt')
# NOTE(review): recent NLTK releases look the models up as 'punkt_tab' instead;
# on older releases this call only reports that the package is unknown.
nltk.download('punkt_tab')
lemmatizer = WordNetLemmatizer()

# Each file is read as a list of lines; presumably line i of A.txt is the
# description whose label sits on line i of D.txt (they are paired by index
# further down) -- confirm against the data files.
with open("data/A.txt", 'r') as fh:
    descriptions = fh.read().splitlines()
with open("data/D.txt", 'r') as fh:
    classifications = fh.read().splitlines()

# Fail early with a clear message instead of deep inside scikit-learn.
assert len(descriptions) == len(classifications), (
    "data/A.txt and data/D.txt must contain the same number of lines"
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\allen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\allen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Built once: calling stopwords.words('english') per token rebuilds the whole
# stop-word list each time, and a list makes every membership test a linear scan.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def customtokenize(str):
    """Tokenize a document, drop English stop words and lemmatize the rest.

    Parameters
    ----------
    str : str
        Raw text of one document. The name shadows the builtin but is kept
        so existing callers are unaffected.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order, stop words removed.
    """
    tokens = nltk.word_tokenize(str)
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    ]

# Turn the corpus into a TF-IDF matrix, one row per description, using
# customtokenize to split each document into terms.
# NOTE(review): the vectorizer is fit on every description, including the ones
# that train_test_split later holds out for testing -- confirm that is intended.
vectorizer = TfidfVectorizer(tokenizer=customtokenize)
tfidf = vectorizer.fit_transform(raw_documents=descriptions)

matrix_shape = tfidf.shape
print("\nSize of TFIDF matrix : ", matrix_shape)


Size of TFIDF matrix :  (20, 240)


In [3]:
# Map the string labels onto integer ids (fit() returns the encoder itself).
le = preprocessing.LabelEncoder().fit(classifications)
int_classes = le.transform(classifications)

print("Classes found : ", le.classes_)
print("Classes converted to integers :", int_classes)

# Hold out part of the corpus for evaluation; random_state pins the shuffle
# so the split is the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    tfidf,
    int_classes,
    random_state=0,
)

# Train a multinomial Naive Bayes model on the training portion only.
classifier = MultinomialNB()
classifier.fit(xtrain, ytrain)

Classes found :  ['Cloud-Computing' 'Data-Science' 'Programming']
Classes converted to integers : [1 2 2 0 1 2 1 2 0 1 1 2 2 0 2 0 0 0 2 2]


In [4]:
# Score the trained classifier twice with the same reporting steps: first on
# the held-out split, then on the whole corpus (which includes the training rows).
evaluation_runs = [
    ("Testing with Test Data \n----------------------", xtest, ytest),
    ("\nTesting with Full Corpus \n------------------------", tfidf, int_classes),
]

for banner, features, labels in evaluation_runs:
    print(banner)
    predictions = classifier.predict(features)
    print("Prediction Accuracy : ", metrics.accuracy_score(labels, predictions))
    print("Confusion Matrix : \n", metrics.confusion_matrix(labels, predictions))

Testing with Test Data 
----------------------
Prediction Accuracy :  0.6
Confusion Matrix : 
 [[1 0 0]
 [0 0 1]
 [1 0 2]]

Testing with Full Corpus 
------------------------
Prediction Accuracy :  0.9
Confusion Matrix : 
 [[6 0 0]
 [0 4 1]
 [1 0 8]]
