# Text Classification
**Suárez Pérez Juan Pablo**

In [1]:
import numpy as np
import random
import re
import nltk
from tabulate import tabulate
from bs4 import BeautifulSoup as bs
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn import metrics

In [2]:
def extract_lines(corpus):
    X = list()
    y = list()
    for i in range(2, 4381):
        try:
            fname = corpus + str(i) + '.xml'
            with open(fname, 'r', encoding = 'latin-1') as rfile:
                content = rfile.readlines()
                content = "".join(content)
                bs_content = bs(content, "lxml")
                review = bs_content.find("review")
                rank = review.get("rank")
                y.append(int(rank))
            
            fname = corpus + str(i) + '.review.pos'
            with open(fname, 'r', encoding = 'latin-1') as rfile:
                content = rfile.readlines()
                review_pos_aux = list()
                              
                for line in content:
                    if line != '\n':
                        line_ls = line.split()
                        review_pos_aux.append(line_ls[1])
                X.append(' '.join(review_pos_aux))
            
        except IOError:
            #print("Could not read file:", fname)
            pass
    
    return X, y

In [3]:
def tokenize_lines_by_words(lines):
    new_lines = list()
    for line in lines:
        new_line = line.lower()
        new_lines.append(nltk.word_tokenize(new_line))
    
    return new_lines

In [4]:
def clean_alphabetic_text_lines(lines):
    new_lines = list()
    for line in lines:
        new_line = list()
        for word in line:
            token = list()
            for c in word:
                #[a-záéíóúñü+$]
                if re.match(r'^[a-záéíóúñü+$]', c):
                    token.append(c)
            token = ''.join(token)
            if token != '':
                new_line.append(token)
        new_lines.append(new_line)
        
    return new_lines

In [5]:
def remove_stop_words(lines):
    stopwords = nltk.corpus.stopwords.words('spanish')
    clean_lines = list()
    for line in lines:
        clean_line = list()
        for word in line:
            if word not in stopwords:
                clean_line.append(word)
        clean_lines.append(' '.join(clean_line))
    
    return clean_lines

In [6]:
def get_X_y(lines):
    X = list()
    y = list()
    for line in lines:
        n = len(line)
        tag = line.pop(n - 1)
        corpus = line
        X.append(corpus)
        y.append(tag)
    return [X, y]

In [7]:
def transform_tag(y):
    new_y = list()
    for i in y:
        if i == 'spam':
            new_i = 1
        else:
            new_i = 0
        new_y.append(new_i)
    return np.array(new_y)

In [8]:
X, y = extract_lines('./../corpusCriticasCine-20221215T005130Z-001/corpusCriticasCine/corpusCriticasCine/')

In [9]:
tokenized_X = tokenize_lines_by_words(X)

In [10]:
new_X = clean_alphabetic_text_lines(tokenized_X)

In [11]:
clean_X = remove_stop_words(new_X)

In [12]:
data = list(zip(clean_X, y))

In [13]:
random.shuffle(data)

In [14]:
clean_X, y = zip(*data)

In [15]:
count_vect = CountVectorizer()

In [16]:
X_counts = count_vect.fit_transform(np.array(clean_X))

In [17]:
y = np.array(y)

In [18]:
tfidf_transformer = TfidfTransformer()

In [19]:
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.2, random_state = 14)

# LOGISTIC REGRESSION 

In [21]:
classifier = LogisticRegression()

In [22]:
classifier.fit(X_train, y_train)

LogisticRegression()

In [23]:
y_pred = classifier.predict(X_test)

In [24]:
classifier.score(X_test, y_test)

0.4329896907216495

In [25]:
print(metrics.confusion_matrix(y_test, y_pred))

[[  3  41  17   0   0]
 [  0  72  98   4   0]
 [  0  35 200  23   0]
 [  0  12 114  53   5]
 [  0   3  51  37   8]]


In [26]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      0.05      0.09        61
           2       0.44      0.41      0.43       174
           3       0.42      0.78      0.54       258
           4       0.45      0.29      0.35       184
           5       0.62      0.08      0.14        99

    accuracy                           0.43       776
   macro avg       0.59      0.32      0.31       776
weighted avg       0.50      0.43      0.39       776



# MULTINOMIAL NB

In [27]:
classifier = MultinomialNB()

In [28]:
classifier.fit(X_train, y_train)

MultinomialNB()

In [29]:
y_pred = classifier.predict(X_test)

In [30]:
classifier.score(X_test, y_test)

0.3324742268041237

In [31]:
print(metrics.confusion_matrix(y_test, y_pred))

[[  0   0  61   0   0]
 [  0   0 174   0   0]
 [  0   0 258   0   0]
 [  0   0 184   0   0]
 [  0   0  99   0   0]]


In [32]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00        61
           2       0.00      0.00      0.00       174
           3       0.33      1.00      0.50       258
           4       0.00      0.00      0.00       184
           5       0.00      0.00      0.00        99

    accuracy                           0.33       776
   macro avg       0.07      0.20      0.10       776
weighted avg       0.11      0.33      0.17       776



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# K-MEANS

In [34]:
classifier = KMeans(n_clusters = 5)

In [35]:
classifier.fit(X_tfidf)

KMeans(n_clusters=5)

In [36]:
y_pred = classifier.predict(X_test)

In [37]:
y_pred

array([2, 1, 1, 1, 1, 0, 4, 1, 1, 1, 4, 0, 0, 4, 4, 0, 0, 0, 0, 4, 2, 4,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 4, 2, 0, 3, 4, 0, 0, 1, 1, 0,
       4, 1, 3, 1, 0, 4, 1, 0, 0, 0, 0, 0, 2, 1, 1, 1, 0, 1, 4, 2, 0, 1,
       0, 0, 0, 4, 4, 0, 1, 2, 4, 4, 0, 3, 1, 1, 3, 4, 1, 4, 0, 1, 3, 3,
       0, 4, 1, 4, 1, 1, 0, 1, 0, 2, 1, 1, 1, 1, 0, 1, 0, 4, 4, 2, 4, 1,
       0, 0, 1, 0, 1, 0, 4, 1, 0, 3, 0, 0, 0, 1, 0, 4, 0, 2, 4, 0, 0, 1,
       1, 0, 0, 4, 0, 0, 3, 1, 3, 0, 0, 4, 0, 4, 0, 4, 4, 0, 2, 1, 4, 1,
       1, 1, 0, 4, 0, 0, 1, 4, 1, 0, 1, 0, 0, 4, 0, 2, 3, 1, 0, 1, 0, 1,
       4, 4, 1, 0, 1, 0, 0, 4, 0, 3, 0, 1, 0, 0, 0, 4, 1, 2, 1, 0, 3, 4,
       1, 1, 0, 0, 0, 4, 1, 3, 0, 1, 2, 2, 1, 1, 1, 0, 1, 0, 0, 0, 3, 4,
       0, 1, 0, 1, 2, 1, 3, 0, 3, 1, 2, 4, 0, 0, 4, 1, 2, 4, 0, 0, 0, 4,
       0, 1, 0, 1, 0, 4, 1, 4, 0, 1, 3, 0, 3, 0, 3, 0, 1, 0, 4, 1, 4, 0,
       0, 1, 1, 4, 3, 0, 1, 4, 1, 1, 1, 4, 1, 0, 4, 0, 0, 0, 2, 3, 4, 1,
       0, 1, 1, 0, 0, 3, 1, 1, 4, 0, 0, 1, 1, 0, 1,

In [42]:
classifier.score(X_test)

-716.4901633242737

In [39]:
print(metrics.confusion_matrix(y_test, y_pred))

[[  0   0   0   0   0   0]
 [ 15  15   3   5  23   0]
 [ 64  61   7  11  31   0]
 [114  75  14  18  37   0]
 [ 83  54  14   6  27   0]
 [ 47  31   3   5  13   0]]


In [40]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.06      0.25      0.10        61
           2       0.17      0.04      0.07       174
           3       0.40      0.07      0.12       258
           4       0.21      0.15      0.17       184
           5       0.00      0.00      0.00        99

    accuracy                           0.09       776
   macro avg       0.14      0.08      0.08       776
weighted avg       0.23      0.09      0.10       776



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
