# Training Classifier Details

In [1]:
import pandas as pd
import random
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# NOTE(review): `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
# removed in 0.23; on newer environments this line fails and `import joblib` is
# needed instead -- confirm the scikit-learn version this notebook targets.
from sklearn.externals import joblib
from PIL import Image
from wordcloud import WordCloud, STOPWORDS
# Seeds Python's built-in `random` module only. NumPy's `np.random` generator keeps
# separate state and is NOT affected by this call.
random.seed(1)

In [2]:
# load training data
# The CSV is ';'-separated; its two columns are renamed to the tweet text and the
# spam/ham label.
data = pd.read_csv('../7-Data/twitter_data_sentiment_trainer_program_for_spam.csv', sep=";", encoding='utf-8')

data.columns = ['Text','SpamOrHam']
# NOTE(review): fillna() applies to every column, so a missing label becomes "spam"
# but a missing Text also becomes the literal string "spam" -- confirm this is intended.
data = data.fillna("spam")

# Random ~80/20 train/test split via a boolean row mask.
# The mask is drawn from NumPy, whose generator is independent of Python's
# `random.seed()`, so use a dedicated, explicitly seeded RandomState: the split
# (and therefore the reported accuracy) is then identical on every run, and
# re-running this cell alone reproduces the same split.
split_rng = np.random.RandomState(1)
msk_data = split_rng.rand(len(data)) < 0.8
train = data[msk_data]
test = data[~msk_data]

# train classifier
# Three stages fitted on the training split only:
# token counts -> TF-IDF weighting -> multinomial Naive Bayes.
print('Training classifier...')

# Stage 1: learn the vocabulary from the training texts and build the count matrix.
# (CountVectorizer is unsupervised, so no labels are passed here.)
print('Vectorization...')
token_vectorizer = CountVectorizer()
vectorized_data = token_vectorizer.fit_transform(train['Text'])
print('Vectorization done...')

# Stage 2: learn IDF weights from the training counts and re-weight them.
print('Transformer...')
transformer = TfidfTransformer()
transformed_data = transformer.fit_transform(vectorized_data)
print('Transformation done...')

# Stage 3: fit the classifier on the TF-IDF features and the training labels.
print('Classification...')
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(transformed_data, train['SpamOrHam'])
# fit() returns the estimator itself, so `classified_data` is the trained model
# (the same object as `naive_bayes_classifier`), not a set of predictions.
classified_data = naive_bayes_classifier
print('Classification done...')
print('Training done...')

# Evaluate on the held-out split. The vectorizer and TF-IDF transformer are only
# applied (transform), never re-fitted, so test features use the vocabulary and
# IDF weights learned from the training data.
print('Testing...')

print('Vectorization...')
test_vectorized_data = token_vectorizer.transform(test['Text'])
print('Vectorization done...')

print('Transformer...')
test_transformed_data = transformer.transform(test_vectorized_data)
print('Transformation done...')

# score() reports mean accuracy of the fitted model against the true test labels.
test_accuracy = classified_data.score(test_transformed_data, test['SpamOrHam'])
print(test_accuracy)

Training classifier...
Vectorization...
Vectorization done...
Transformer...
Transformation done...
Classification...
Classification done...
Training done...
Testing...
Vectorization...
Vectorization done...
Transformer...
Transformation done...
0.925383420209
