# Text classification
with numpy, pandas and sklearn

In [None]:
import numpy as np
import pandas as pd 

In [None]:
# load the tab-separated SMS collection into a pandas DataFrame
sms_path = '../../pythongyak/UPDATED_NLP_COURSE/TextFiles/smsspamcollection.tsv'
df = pd.read_csv(sms_path, sep='\t')

In [None]:
# preview the first five rows to confirm the file parsed as expected
df.head(n=5)

In [None]:
# count missing values per column:
# any count greater than 0 means that column has gaps
df.isna().sum()

In [None]:
# number of rows in the DataFrame
df.shape[0]

In [None]:
# select a single column (all rows) and preview it
df.loc[:, 'label'].head()

In [None]:
# distinct values that occur in the label column
pd.unique(df['label'])

In [None]:
# tally how many rows carry each distinct label
# (shows how balanced the 'ham' and 'spam' classes are)
df['label'].value_counts()

In [None]:
# summary statistics for the 'length' column
# (presumably the character length of each message -- confirm against the dataset)
df['length'].describe()

In [None]:
# visualise data inline
import matplotlib.pyplot as plt
%matplotlib inline

plt.xscale('log')
bins = 1.15**(np.arange(0,50))
plt.hist(df[df['label']=='ham']['length'], bins=bins, alpha=0.8)
plt.hist(df[df['label']=='spam']['length'], bins=bins, alpha=0.8)
plt.legend(('ham', 'spam'))
plt.show()

In [None]:
# summary statistics for the 'punct' column
# (presumably the punctuation count per message -- confirm against the dataset)
df['punct'].describe()

In [None]:
# log-spaced bin edges (1.5**0 ... 1.5**14) to match the log-scaled x-axis;
# values below the first edge (1), e.g. 0, fall outside the bins and are not drawn
bins = 1.5**(np.arange(0,15))

fig, ax = plt.subplots()
ax.set_xscale('log')
# overlay both classes, semi-transparent so the overlap stays visible;
# explicit labels tie each legend entry to its histogram
ax.hist(df[df['label']=='ham']['punct'], bins=bins, alpha=0.8, label='ham')
ax.hist(df[df['label']=='spam']['punct'], bins=bins, alpha=0.8, label='spam')
ax.set_xlabel('punct (log scale)')
ax.set_ylabel('number of messages')
ax.set_title('Distribution of punct by label')
ax.legend()
plt.show()

# Split the data

In [None]:
# general form: from sklearn.'model' import 'Model'
from sklearn.model_selection import train_test_split

In [None]:
# features: the two numeric columns; target: the ham/spam label
feature_columns = ['length', 'punct']
X = df[feature_columns]
y = df['label']

# (shift-tab on train_test_split shows its docstring example)
# hold out 30% of the rows for testing; a fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# (rows, columns) of the training feature table
X_train.shape

In [None]:
# (rows, columns) of the held-out test feature table
X_test.shape

In [None]:
# preview the first test rows ('length' and 'punct' columns)
X_test.head()

In [None]:
# number of test labels (should match the number of test rows)
y_test.shape

In [None]:
# preview the first test labels
y_test.head()

# Steps of training a model

In [None]:
# import a model
from sklearn.linear_model import LogisticRegression

In [None]:
# create the model instance (adjust constructor parameters when needed);
# here the 'lbfgs' solver is selected explicitly
lr_model = LogisticRegression(solver='lbfgs')

In [None]:
# train the model on the training split; the cell displays whatever fit()
# returns (in scikit-learn that is the fitted estimator itself)
lr_model.fit(X=X_train, y=y_train)

# Test accuracy

In [None]:
from sklearn import metrics

In [None]:
# predict a label for every row of the held-out test features
predictions = lr_model.predict(X_test)

In [None]:
# labels predicted by the model for the test set
predictions

In [None]:
# true labels of the test set, for comparison with the predictions
y_test

In [None]:
# confusion matrix: true labels against predicted labels
metrics.confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
# wrap the confusion matrix in a labelled DataFrame for readability.
# It gets its own name so the SMS DataFrame `df` is not overwritten
# (earlier cells stay re-runnable), and `labels` pins the row/column order
# to the index/column names used below.
lr_confusion_df = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions, labels=['ham', 'spam']),
    index=['actual ham', 'actual spam'],
    columns=['predicted ham', 'predicted spam'],
)
lr_confusion_df
# results are not very good :(

In [None]:
# per-class precision, recall and f1-score for the current predictions
lr_report = metrics.classification_report(y_true=y_test, y_pred=predictions)
print(lr_report)

In [None]:
# fraction of test rows whose predicted label matches the true label
metrics.accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# maybe try other models?

In [None]:
# same recipe for any model: (import, instantiate,) fit, predict, evaluate
from sklearn.naive_bayes import MultinomialNB

# fit() hands back the estimator, so construction and training can be chained
nb_model = MultinomialNB().fit(X_train, y_train)
predictions = nb_model.predict(X_test)
metrics.confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
# labelled confusion matrix for the current predictions.
# It gets its own name so the SMS DataFrame `df` is not overwritten,
# and `labels` pins the row/column order to the names used below.
nb_confusion_df = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions, labels=['ham', 'spam']),
    index=['actual ham', 'actual spam'],
    columns=['predicted ham', 'predicted spam'],
)
nb_confusion_df

In [None]:
# per-class precision, recall and f1-score for the current predictions
nb_report = metrics.classification_report(y_test, predictions)
print(nb_report)

In [None]:
# third candidate: a support vector classifier on the same training split
from sklearn.svm import SVC

# fit() hands back the estimator, so construction and training can be chained
svc_model = SVC(gamma='auto').fit(X_train, y_train)
predictions = svc_model.predict(X_test)
metrics.confusion_matrix(y_true=y_test, y_pred=predictions)

In [None]:
# a bit better
svc_report = metrics.classification_report(y_test, predictions)
print(svc_report)

# Feature extraction from text

*From scratch: build a corpus of documents, create a vocabulary from the texts, and extract features with the bag-of-words technique.*

In [None]:
# reload the tab-separated SMS collection and preview the first rows
sms_text_path = '../../pythongyak/UPDATED_NLP_COURSE/TextFiles/smsspamcollection.tsv'
df = pd.read_csv(sms_text_path, sep='\t')
df.head(n=5)

In [None]:
# check for missing values: one warning line per column that has any
missing_counts = df.isnull().sum()
for n_missing in missing_counts[missing_counts > 0]:
    print("Oh no, something's missing.")

In [None]:
# class balance of the reloaded data: rows per distinct label
df['label'].value_counts()

In [None]:
# features are the raw message texts, the target is the label column
X, y = df['message'], df['label']

# hold out 33% of the messages for testing (seeded for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# CountVectorizer handles text preprocessing and tokenising, builds a
# dictionary of features (the vocabulary) and transforms documents into
# feature vectors of counts.
# NOTE: stop-word filtering is optional (`stop_words` parameter) and is not
# requested here, because the vectoriser is created with default arguments.
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()

In [None]:
# the full feature Series X still holds the raw message text
X

In [None]:
# fit the vectoriser on the training text (build the vocabulary, count the
# words, ...) and transform that text into count vectors.
# two-step version, kept for reference:
#count_vect.fit(X_train)
#X_train_counts = count_vect.transform(X_train)

# equivalent one-step version:
X_train_counts = count_vect.fit_transform(X_train)

In [None]:
# the vectorised training text is a sparse matrix now, not a Series
X_train_counts

In [None]:
# shape of the raw training text (a 1-D Series of messages)
X_train.shape

In [None]:
# shape of the count matrix: one row per training message,
# one column per vocabulary entry
X_train_counts.shape

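Next, transform the raw counts into tf-idf weights:

     tf-idf(t,d,D) = tf(t,d) * idf(t,D),  where idf(t,D) = log(|D| / df(t,D))

     tfidf = tf*idf

(scikit-learn's default uses a smoothed variant: idf(t) = ln((1 + n) / (1 + df(t))) + 1, with n the number of documents.)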

In [None]:
# re-weight the raw counts as tf-idf values
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()

# learn the idf weights from the counts, then apply them to the same counts
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)

In [None]:
# same shape as the count matrix, but the entries are now tf-idf weights
# rather than raw counts
X_train_tfidf.shape

!!! 

TfidfVectorizer combines the steps of
CountVectorizer and TfidfTransformer

!!!

In [None]:
# one estimator that performs count vectorisation and tf-idf weighting together
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# learn the vocabulary and idf weights from the raw training text,
# then return that text as a tf-idf matrix
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)

In [None]:
# train a linear support vector classifier on the tf-idf features
from sklearn.svm import LinearSVC

clf = LinearSVC()

clf.fit(X=X_train_tfidf, y=y_train)

# OR:

The same vectorisation would also have to be applied to the test data before the classifier could predict on it. Instead of repeating the process by hand...

Create a pipeline that combines the vectorization and classification

In [None]:
from sklearn.pipeline import Pipeline

# each step is a ('name', estimator instance) tuple; more steps can be added
# to a pipeline, e.g. tokenisation, feature extraction, lemmatisation
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(steps=pipeline_steps)

In [None]:
# the pipeline takes the raw text and the labels directly;
# running the cell displays the fitted pipeline and its steps
text_clf.fit(X=X_train, y=y_train)

In [None]:
# pass the raw test text; the pipeline vectorises and classifies it in one call
predictions = text_clf.predict(X_test)

In [None]:
# evaluate the pipeline's predictions on the held-out messages
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(y_test, predictions)
report = classification_report(y_test, predictions)

# heading and result go out on consecutive lines via sep='\n'
print('Confusion matrix:', cm, sep='\n')
print('\n')
print('Classification report:', report, sep='\n')

In [None]:
# accuracy: share of test messages whose predicted label matches the true one
metrics.accuracy_score(y_true=y_test, y_pred=predictions)

In [None]:
# predict on a new message (the pipeline expects an iterable of raw texts)
new_message = ['Hi, how are you today?']
text_clf.predict(new_message)

In [None]:
# predict on a second new message, this one worded like a prize offer
prize_message = ['You have been selected as a winner to win the great prize of 5 million dollars, just open the link below']
text_clf.predict(prize_message)