# CLASIFICACION DE TEXTO
** En este notebook se realizarán algunas pruebas de clasificación de texto utilizando el dataset de Spooky Authors**

Los modelos utilizados serán:

   ** -Logistic Regression**
   
   ** -Naive Bayes**
   
   ** -Singular Value Decomposition**

In [22]:
import base64
import numpy as np
import pandas as pd
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from collections import Counter
from scipy.misc import imread
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt
%matplotlib inline
#import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize
import gensim 
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [23]:
#Agrego lemmatization
from nltk.stem import WordNetLemmatizer
lemm = WordNetLemmatizer()
class LemmaTFidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(LemmaTFidfVectorizer, self).build_analyzer()
        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))


**Implementamos una métrica llamada logloss multiclase para evaluar las predicciones de los modelos a utilizar** 

In [24]:
#Defino una version de la metrica de Logarithmic Loss, que funcione para múltiples clases

def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [25]:
# Cargo datos 
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample = pd.read_csv('sample_submission.csv')



In [26]:
# Label encoding de los valores de autor
lbl_enc = preprocessing.LabelEncoder()

print(train.author.values[:10])

y = lbl_enc.fit_transform(train.author.values)

print(y[:10])

['EAP' 'HPL' 'EAP' 'MWS' 'HPL' 'MWS' 'EAP' 'EAP' 'EAP' 'MWS']
[0 1 0 2 1 2 0 0 0 2]


In [27]:
#Separacion de los datos en entrenamiento y validacion
# Stratify implica que la proporcion de valores que va a haber al dividir los datos, va a ser igual 
#Acá le paso y, asi que si hay 30% valor 0, 20 %valor 1 y 50% valor 2, la misma proporcion va a estar en los datos de entr y test
#Random state es la seed para el random number generator
xtrain, xvalid, ytrain, yvalid = train_test_split(train.text.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)
print(xtrain[:2])
print(ytrain[:2])

['Her hair was the brightest living gold, and despite the poverty of her clothing, seemed to set a crown of distinction on her head.'
 '"No," he said, "oh, no a member of my family my niece, and a most accomplished woman."']
[2 0]


In [28]:
print ("Datos de entrenamiento:",len(xtrain))
print ("\nDatos de validación:",len(xvalid))


Datos de entrenamiento: 17621

Datos de validación: 1958


In [29]:
#TF-IDF

#
#tfvec = TfidfVectorizer(min_df=5,  max_features=None, 
#            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
#            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
#            stop_words = 'english')

tf_vectorizer = TfidfVectorizer(max_df=0.85, 
                                     min_df=5,
                                     stop_words='english',
                                     decode_error='ignore')

# Fitting TF-IDF to both training and test sets
tf_vectorizer.fit(list(xtrain) + list(xvalid)) 
#tf_vectorizer.fit(list(xtrain))
xtrain_tfvec =  tf_vectorizer.transform(xtrain) 
xvalid_tfvec = tf_vectorizer.transform(xvalid)
print(xtrain_tfvec.shape)
print("\n 17621 frases y vocabulario de 8881 palabras unicas \n")
print(xtrain_tfvec[0])


(17621, 8089)

 17621 frases y vocabulario de 8881 palabras unicas 

  (0, 6341)	0.2469424385443994
  (0, 5395)	0.31534173991151543
  (0, 4206)	0.2621357570531394
  (0, 3359)	0.21913481148346806
  (0, 3286)	0.2699803359499066
  (0, 3163)	0.28210670387497144
  (0, 2094)	0.33928338965309934
  (0, 1927)	0.2913532806410929
  (0, 1657)	0.3345950093082599
  (0, 1267)	0.3474674493892856
  (0, 913)	0.3710563608683541


In [34]:
# Logistic Regression on TFIDF
clf1 = LogisticRegression(C=1.0)
clf1.fit(xtrain_tfvec, ytrain)
predictions = clf1.predict_proba(xvalid_tfvec)
print("Accuracy: %0.3f" %clf1.score(xvalid_tfvec,yvalid)) #Accuracy 
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))





Accuracy: 0.791
logloss: 0.622 


In [35]:
#Count Vectorizer
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)


In [36]:
# Fitting a simple Logistic Regression on Counts
clf2 = LogisticRegression(C=1.0)
clf2.fit(xtrain_ctv, ytrain)
#print(xtrain_ctv)
#yvalid
print("Accuracy: %0.3f" %clf2.score(xvalid_ctv,yvalid)) #Accuracy 
predictions = clf2.predict_proba(xvalid_ctv)
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))


Accuracy: 0.781
logloss: 0.528 


In [39]:
# simple Naive Bayes on Counts
clf3 = MultinomialNB()
clf3.fit(xtrain_ctv, ytrain)
print("score: %0.3f"% clf3.score(xvalid_ctv,yvalid))
predictions = clf3.predict_proba(xvalid_ctv)
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

score: 0.832
logloss: 0.485 


In [None]:
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(xtrain_tfvec)
xtrain_svd = svd.transform(xtrain_tfvec)
xvalid_svd = svd.transform(xvalid_tfvec)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [2]:
# Fitting a simple SVM
clf4 = SVC(C=1.0, probability=True) # since we need probabilities
clf4.fit(xtrain_svd_scl, ytrain)
predictions = clf4.predict_proba(xvalid_svd_scl)
#print("score: %0.3f"% clf4.score(xvalid_ctv,yvalid))
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

NameError: name 'SVC' is not defined