# Text Classification example 

Vamos a clasificar el texto contenido en el dataset tomado de kaggle (https://www.kaggle.com/uciml/sms-spam-collection-dataset/downloads/spam.csv/1) y que se encuentra en la carpeta data.

In [0]:
import pandas as pd 
import numpy as np

In [5]:
df = pd.read_csv("spam.csv", encoding="latin-1")
df.head(4)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,


In [6]:
df = df[["v1", "v2"]]
df.head(4)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...


In [7]:
df["v1"].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
df["preprocessed"] = df["v2"].apply(lambda x: x.split(" "))

In [11]:
df.head(4)

Unnamed: 0,v1,v2,preprocessed
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, until, jurong, point,, crazy.., Available..."
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar..., Joking, wif, u, oni...]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, so, early, hor..., U, c, already..."


In [0]:
bow_extractor = CountVectorizer(strip_accents='ascii', stop_words="english")

In [0]:
X = bow_extractor.fit_transform(df["v2"])

In [20]:
X = X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [0]:
df["labels"] = df["v1"].apply(lambda x: 0 if x=="ham" else 1)

In [22]:
df.head(4)

Unnamed: 0,v1,v2,preprocessed,labels
0,ham,"Go until jurong point, crazy.. Available only ...","[Go, until, jurong, point,, crazy.., Available...",0
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar..., Joking, wif, u, oni...]",0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[Free, entry, in, 2, a, wkly, comp, to, win, F...",1
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, so, early, hor..., U, c, already...",0


In [0]:
y = df["labels"].values

In [0]:
from sklearn.naive_bayes import GaussianNB

In [0]:
clf = GaussianNB()

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=12, stratify=y)

In [30]:
clf.fit(X_tr.toarray(), y_tr)

GaussianNB(priors=None, var_smoothing=1e-09)

In [31]:
clf.score(X_te.toarray(), y_te)

0.9019138755980861

In [0]:
from sklearn.metrics import classification_report, confusion_matrix

In [0]:
y_pred = clf.predict(X_te.toarray())

In [34]:
print(classification_report(y_te, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.90      0.94      1448
           1       0.58      0.94      0.72       224

    accuracy                           0.90      1672
   macro avg       0.79      0.92      0.83      1672
weighted avg       0.94      0.90      0.91      1672



In [35]:
confusion_matrix(y_te, y_pred)

array([[1297,  151],
       [  13,  211]])

In [0]:
clf_2 = GaussianNB(priors=np.array([0.2, 0.8]))

In [42]:
clf_2.fit(X_tr.toarray(), y_tr)

GaussianNB(priors=array([0.2, 0.8]), var_smoothing=1e-09)

In [43]:
clf_2.score(X_te.toarray(), y_te)

0.9019138755980861

In [0]:
y_pred = clf_2.predict(X_te.toarray())

In [45]:
print(classification_report(y_te, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.90      0.94      1448
           1       0.58      0.94      0.72       224

    accuracy                           0.90      1672
   macro avg       0.79      0.92      0.83      1672
weighted avg       0.94      0.90      0.91      1672



In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
clf_3 = RandomForestClassifier()

In [49]:
clf_3.fit(X_tr.toarray(), y_tr)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [50]:
clf_3.score(X_te.toarray(), y_te)

0.9677033492822966

In [0]:
y_pred = clf_3.predict(X_te)

In [52]:
print(classification_report(y_te, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1448
           1       0.99      0.76      0.86       224

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.92      1672
weighted avg       0.97      0.97      0.97      1672

