# Sentiment Analysis con KNN
## Clasificador en C++ 💪💪
Vamos a probar a nuestro bichito


Definir los paths al ejecutable de Python 3.6 y a sus librerías,
de acuerdo al virtualenv que estén usando.

In [None]:
!cd .. && git submodule init
!cd .. && git submodule update
!cd .. && mkdir build
!cd ../build/ && rm -rf *
!cd ../build && cmake \
  -DPYTHON_EXECUTABLE="$(which python)" \
  -DCMAKE_BUILD_TYPE=Release ..
!cd ../build && make install

In [None]:
# Check that the installation worked: if the import below does not fail, it is OK.
# The working directory and interpreter version are printed first for reference.
!pwd
!python --version
import sentiment


In [None]:
# Enable IPython's autoreload extension (mode 2) for this session
%load_ext autoreload
%autoreload 2
import pandas as pd

# Unpack the .tgz archive(s) found in ../data
!cd ../data && tar -xvf *.tgz

# Load the reviews; the first CSV column is used as the DataFrame index
df = pd.read_csv("../data/imdb_small.csv", index_col=0)

# Report how many rows (documents) were loaded
print("Cantidad de documentos: {}".format(df.shape[0]))

In [None]:
# Display pandas' summary statistics for the loaded DataFrame
df.describe()

In [None]:
# Split the DataFrame into train and test partitions using its 'type' column,
# keeping the review text and the label of each partition separately.
train_rows = df[df.type == 'train']
test_rows = df[df.type == 'test']

text_train, label_train = train_rows["review"], train_rows["label"]
text_test, label_test = test_rows["review"], test_rows["label"]

# Uncomment the lines below to work with a smaller dataset than the 6,000 total
#text_train = text_train[:1000]
#label_train = label_train[:1000]

#text_test = text_test[:1000]
#label_test = label_test[:1000]

for message, partition in (
    ("Cantidad de instancias de entrenamiento = {}", text_train),
    ("Cantidad de instancias de test = {}", text_test),
):
    print(message.format(len(partition)))


In [None]:
from __future__ import division

# Fraction of the training labels that fall in each class
n_train = label_train.shape[0]
pos_share = (label_train == 'pos').sum() / n_train
neg_share = (label_train == 'neg').sum() / n_train
print("Class balance : {} pos {} neg".format(pos_share, neg_share))


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is learned from the training reviews
# only, then both partitions are encoded with that same vocabulary.
vectorizer = CountVectorizer(max_df=0.90, min_df=0.01, max_features=5000)

# --- Author's tuning notes (translated) ---
# max_df changes very little: very few words appear in more than, say, 80% of
# the comments. Even above 20% of the comments there are not many; maybe it
# still makes sense, to drop the annoying ones?
# A low max_df such as 0.20 is surely not wanted, because it removes the best
# words for comparing.
# To try: check whether max_df=0.1, for example, ruins PCA.
#
# min_df=0.003 is the key point where the vocabulary gets close to 5000, i.e.
# there are about 4800 words that appear in 0.3% of the comments.
# With min_df=0.01% there are already about 43,000 words. Do all words appear
# in at least 0.01% of the comments? That looked odd... but 0.01% is less than
# one comment. A reasonable maximum to try for min_df could be 0.2%, which
# means appearing in roughly 12 comments -- arguably enough to start drawing
# conclusions. That already lowers the word count to about 6500, fairly close
# to the 5000 maximum we wanted. Experiment around there.
#
# WARNING: CountVectorizer keeps only the ~43,000 words present in text_train
# and reduces text_test to those words, so words that occur in test comments
# but never in train are not counted at all.
#
# With the default values provided by the course staff (0.9 / 0.01): train
# comments end up with 102 words on average and test comments with 98. Not
# terrible, but clearly several words were removed (the original note says
# "from the train ones"); they were surely the least frequent ones anyway,
# since they never appeared in train.

vectorizer.fit(text_train)

X_train = vectorizer.transform(text_train)
y_train = (label_train == 'pos').values
X_test = vectorizer.transform(text_test)
y_test = (label_test == 'pos').values

# The second dimension shows how many words survived the CountVectorizer
for encoded in (X_train, X_test):
    print(encoded.shape)

In [None]:
import sentiment
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import calendar;
import time;

def f1score(prec, rec):
    """Return the F1 score, the harmonic mean of precision and recall.

    Args:
        prec: precision value.
        rec: recall value.

    Returns:
        ``2 * prec * rec / (prec + rec)``, or ``0.0`` when ``prec + rec`` is
        zero, where the formula is undefined and would otherwise raise
        ``ZeroDivisionError`` (plain numbers) or produce NaN (numpy floats).
    """
    total = prec + rec
    if total == 0:
        return 0.0
    return 2 * (prec * rec) / total

def getMetrics(predictions, actualValues):
    """Compute accuracy, precision and recall of binary predictions.

    Entries equal to 1 count as positive and entries equal to 0 as negative,
    in both ``predictions`` and ``actualValues``.

    Returns:
        A tuple ``(accuracy, precision, recall)``. Precision is reported as 1
        when nothing was predicted positive (no positive prediction was
        wrong), and recall as 1 when there were no actual positives (none
        were missed).
    """
    said_pos, said_neg = predictions == 1, predictions == 0
    is_pos, is_neg = actualValues == 1, actualValues == 0

    hits_pos = np.sum(np.logical_and(said_pos, is_pos))
    hits_neg = np.sum(np.logical_and(said_neg, is_neg))
    false_alarms = np.sum(np.logical_and(said_pos, is_neg))
    misses = np.sum(np.logical_and(said_neg, is_pos))

    flagged = hits_pos + false_alarms
    relevant = hits_pos + misses

    accuracy = (hits_pos + hits_neg) / (hits_pos + hits_neg + false_alarms + misses)
    precision = hits_pos / flagged if flagged != 0 else 1
    recall = hits_pos / relevant if relevant != 0 else 1
    return accuracy, precision, recall

def saveData(filename, data):
    """Write ``data`` as comma-separated text to ``<timestamp>_<filename>.csv``.

    The timestamp is the current UTC time in whole seconds since the epoch;
    the file path is relative to the current working directory.
    """
    stamp = calendar.timegm(time.gmtime())
    target = "{}_{}.csv".format(stamp, filename)
    np.savetxt(target, data, delimiter=",")
    
print("Done! You're a good boy 🐶")

In [None]:
%%time

# This cell only needs to be run when PCA is going to be tried.
# Builds the project's PCA with argument 50 and fits it on the training matrix.
pca = sentiment.PCA(50)
pca.fit(X_train)

In [None]:

# Same as the previous cell: only needed when PCA is going to be tried.
# Transforms both partitions with the PCA object fitted on X_train.

pca_x_train = pca.transform(X_train)
pca_x_test = pca.transform(X_test)

In [None]:
%%time

# Uncomment the desired implementation:
# Uncomment this for KNN only (PCA disabled):
# NOTE(review): accuracy_score is not imported anywhere in the visible cells;
# it would need an import before this variant can run.
# clf = sentiment.KNNClassifier(100)
# clf.fit(X_train, y_train)

# y_pred = clf.predict(X_test[:500])
# acc = accuracy_score(y_test[:500], y_pred[:500])

# Uncomment this for PCA enabled:
# (requires pca_x_train / pca_x_test from the PCA transform cell)
clf = sentiment.KNNClassifier(550)
clf.fit(pca_x_train, y_train)

y_pred = clf.predict(pca_x_test)

# Evaluate on the whole test partition
acc, prec, rec = getMetrics(y_pred, y_test)
f1 = f1score(prec, rec)

print(acc, prec, rec, f1)

In [None]:
## ==================== Experiment 1 ========================
## Quality of the results as a function of k and alpha

## Metrics as a function of the swept parameter.
## NOTE(review): the values in K_vals are passed to sentiment.PCA while the
## KNN neighbour count stays fixed at 25; the plot title says "Alpha" but the
## x label says "K" -- confirm which of the two was intended.

K_vals = np.arange(0,500,25)
K_vals[0] = 1  # replace the leading 0 of the range with 1
# One row per swept value; four columns: accuracy, precision, recall, F1.
# (Three columns cannot hold the four values stored per row below, nor the
# mets[:,3] column that is plotted.)
mets = np.zeros((len(K_vals), 4))

for i,a in enumerate(K_vals):
    # Only the first 1000 rows of each partition are used in this experiment
    pca = sentiment.PCA(a)
    pca.fit(X_train[:1000])
    Xtc_train = pca.transform(X_train[:1000])
    Xtc_test = pca.transform(X_test[:1000])

    clf = sentiment.KNNClassifier(25)
    clf.fit(Xtc_train, y_train[:1000])
    preds = clf.predict(Xtc_test)
    acc, prec, rec = getMetrics(preds[:1000], y_test[:1000])
    f1 = f1score(prec, rec)
    mets[i] = [acc, prec, rec, f1]
    print("Finished {}".format(a))

# One curve per metric, in the same order as the legend below
plt.plot(K_vals,mets[:,0], 'b.-')
plt.plot(K_vals,mets[:,1], 'r.-')
plt.plot(K_vals,mets[:,2], 'g.-')
plt.plot(K_vals,mets[:,3], 'k.-')
plt.title("Metricas en funcion de Alpha")
plt.xlabel("K")
plt.ylabel("%")
plt.axis([1, K_vals[-1], 0.4, 1])
plt.gca().legend(('Acc','Prec','Recall','F1'))

In [None]:
## Metrics as a function of alpha, for fixed values of K

alphas = np.arange(0,500,50)
alphas[0] = 1  # replace the leading 0 of the range with 1
K_vals_fixes = np.array([50,550,1100])

# One table per fixed K: a row per alpha and four columns
# (accuracy, precision, recall, F1). Three columns cannot hold the four
# metrics that the sweep stores in each row.
dmets = { k : np.zeros((len(alphas), 4)) for k in K_vals_fixes}
dmets

In [None]:
print("Beginning alpha metric calculations")

# (Re)allocate one table per fixed k: a row per alpha, four columns
# (accuracy, precision, recall, F1). Done in this cell so that every row can
# hold all four values and a re-run starts from zeros.
dmets = {k: np.zeros((len(alphas), 4)) for k in K_vals_fixes}

for i,a in enumerate(alphas):

    # The PCA is fitted once per alpha and shared by every k below
    pca = sentiment.PCA(a)
    pca.fit(X_train)
    Xtc_train = pca.transform(X_train)
    Xtc_test = pca.transform(X_test)
    print("Finished training PCA {}".format(a))

    for k in K_vals_fixes:
        clf = sentiment.KNNClassifier(k)
        clf.fit(Xtc_train, y_train)
        preds = clf.predict(Xtc_test)
        acc, prec, rec = getMetrics(preds, y_test)
        f1 = f1score(prec,rec)
        dmets[k][i] = [acc, prec, rec, f1]
        print("Finished {}, alpha = {}".format(k, a))


## Metrics for PCA: one stacked subplot per metric, one curve per fixed k

line_styles = ['b.-', 'r.-', 'g.-', 'k.-', 'y.-', 'm.-']
# Subplot labels, in the same order as the columns of each metrics table
panel_labels = ("Acc", "Prec", "Rec", "F1")
for j,k in enumerate(K_vals_fixes):
    mets = dmets[k]

    for col, label in enumerate(panel_labels):
        plt.subplot(4, 1, col + 1)
        # Column `col` matches the panel label; the F1 panel previously
        # re-plotted the recall column.
        plt.plot(alphas, mets[:, col], line_styles[j])
        plt.ylabel(label)
        plt.xticks([])


# These calls act on the current (last, F1) subplot
plt.xlabel("alpha")
plt.xticks(alphas)
plt.axis([1, alphas[-1], 0.3, 0.8])
plt.gca().legend([str(i) for i in K_vals_fixes])

In [None]:
# Scratch cell: draws a synthetic surface with pcolormesh; none of the
# experiment results are plotted here.
# NOTE(review): `a` is only referenced by the commented-out imshow call below.
a = np.random.random((1200, 500))

# generate 2 2d grids for the x & y bounds
y, x = np.meshgrid(np.linspace(-1, 1200), np.linspace(1, 500))

z = (1 - x / 2. + x ** 5 + y ** 3) * np.exp(-x ** 2 - y ** 2)
# NOTE(review): the original comment here said that, because x and y are
# bounds, the last value should be removed from the z array -- but no
# statement in this cell does that; x, y and z keep identical shapes.
# Symmetric colour limits around zero, taken from the largest |z|
z_min, z_max = -np.abs(z).max(), np.abs(z).max()

fig, ax = plt.subplots()

c = ax.pcolormesh(x, y, z, cmap='RdBu', vmin=z_min, vmax=z_max)
ax.set_title('pcolormesh')
# set the limits of the plot to the limits of the data
ax.axis([x.min(), x.max(), y.min(), y.max()])
fig.colorbar(c, ax=ax)

# plt.show()

# plt.imshow(a, cmap='hot', interpolation='nearest')
# plt.show()
# Bare expressions for inspection; presumably only the last one (y) is echoed
# by the notebook.
dmets
y

In [None]:
# Scratch loop (marked "Delete me" by the author): for training sizes i from
# 15 down to 1, fit a PCA on the first i training rows and transform the data.
for i in range(15, 0, -1): # <--Delete me
    print(i)
    # PCA argument capped at the number of training rows in use
    pca = sentiment.PCA(min(i, 50))
    pca.fit(X_train[:i])
    Xtc_train = pca.transform(X_train[:i])
    Ytc_train = y_train[:i]
    Xtc_test = pca.transform(X_test)
    
    for k in range(min(i, 50), 0, -1): # <--Delete me
        # The classifier is only constructed here; it is never fitted or used
        clf = sentiment.KNNClassifier(k)

In [None]:
# Sweep the training-set size and the KNN neighbour count k, storing accuracy,
# precision, recall and F1 in one 2-D grid per metric, then save the grids to
# CSV and draw each one as a heatmap.
trainSizeRange = range(X_train.shape[0], 0, -120)
trainSizeRangeLength = len(trainSizeRange)
knnRange = range(2500, 0, -50)
knnRangeLength = len(knnRange)

print("Will have {} trainSize instances".format(trainSizeRangeLength))
print("Will have {} knn instances".format(knnRangeLength))

# Rows index the training size and columns index k. Both ranges are iterated
# in descending order, so indices are mirrored when writing: row/column 0
# holds the smallest value.
gridShape = (trainSizeRangeLength, knnRangeLength)
accColorMap = np.zeros(shape=gridShape)
precColorMap = np.zeros(shape=gridShape)
recColorMap = np.zeros(shape=gridShape)
# This grid is written, saved and plotted below, so it must be allocated too
f1ColorMap = np.zeros(shape=gridShape)

for i, trainSize in enumerate(trainSizeRange):
    # Cap the PCA argument by the number of training rows in use. The loop
    # index `i` was used here before, which requested PCA(0) on the first
    # iteration and tied the setting to the iteration count.
    pca = sentiment.PCA(min(trainSize, 50))
    pca.fit(X_train[:trainSize])
    Xtc_train = pca.transform(X_train[:trainSize])
    Ytc_train = y_train[:trainSize]
    Xtc_test = pca.transform(X_test)
    print("Finished training PCA train_size = {}".format(trainSize))

    row = trainSizeRangeLength - i - 1
    for j, k in enumerate(knnRange):
        if k > trainSize:
            # More neighbours than training rows: the cell keeps its initial 0
            continue
        col = knnRangeLength - j - 1
        clf = sentiment.KNNClassifier(k)
        clf.fit(Xtc_train, Ytc_train)
        preds = clf.predict(Xtc_test)
        acc, prec, rec = getMetrics(preds, y_test)
        f1 = f1score(prec,rec)
        accColorMap[row, col] = acc
        precColorMap[row, col] = prec
        recColorMap[row, col] = rec
        f1ColorMap[row, col] = f1
        print("Finished k = {}, train_size = {}".format(k, trainSize))

saveData("accColorMap", accColorMap)
saveData("precColorMap", precColorMap)
saveData("recColorMap", recColorMap)
saveData("f1ColorMap", f1ColorMap)


def _plotRawHeatmap(grid, title):
    """Draw `grid` with pcolormesh on a new figure; return (fig, ax, mesh)."""
    figure, axes = plt.subplots()
    mesh = axes.pcolormesh(grid, cmap='hot')
    axes.set_title(title)
    # set the limits of the plot to the limits of the data
    axes.axis([0, grid.shape[1], 0, grid.shape[0]])
    figure.colorbar(mesh, ax=axes)
    return figure, axes, mesh


# The figure/axes names are kept as module-level variables because the
# following cell refers to some of them.
fig, ax, c = _plotRawHeatmap(accColorMap, 'Train_size vs neighbors')
fig2, ax2, c2 = _plotRawHeatmap(precColorMap, 'pcolormesh')
fig3, ax3, c3 = _plotRawHeatmap(recColorMap, 'pcolormesh')
fig4, ax4, c4 = _plotRawHeatmap(f1ColorMap, 'pcolormesh')

plt.show()

In [None]:
def _plotMetricHeatmap(grid, metric):
    """Draw one metric grid as a heatmap on its own new figure.

    Args:
        grid: 2-D array of metric values; the colour scale is fixed to [0, 1].
        metric: metric name, appended to the figure title.

    Returns:
        The ``(figure, axes, mesh)`` created for this heatmap.
    """
    figure, axes = plt.subplots()
    mesh = axes.pcolormesh(grid, cmap='hot', vmin=0, vmax=1)
    axes.set_title('Train_size vs neighbors ({})'.format(metric))
    # set the limits of the plot to the limits of the data
    axes.axis([0, grid.shape[1], 0, grid.shape[0]])
    # The grids are filled as [train size index, k index], and pcolormesh
    # draws columns along x and rows along y: k goes on the x axis and the
    # training size on the y axis (the two labels were swapped before).
    axes.set_xlabel("K neighbors (1:50)")
    axes.set_ylabel("Train size (1:125)")
    figure.colorbar(mesh, ax=axes)
    return figure, axes, mesh


fig, ax, c = _plotMetricHeatmap(accColorMap, 'accuracy')
fig2, ax2, c2 = _plotMetricHeatmap(precColorMap, 'precision')
fig3, ax3, c3 = _plotMetricHeatmap(recColorMap, 'recall')
# The F1 heatmap gets its own fresh figure; previously fig3/ax3 were created
# a second time and the F1 data was drawn on a stale ax4.
fig4, ax4, c4 = _plotMetricHeatmap(f1ColorMap, 'f1')

plt.show()