<a href="https://colab.research.google.com/github/flaviowu/btc-c14-g4/blob/main/notebooks/%5BBTC%5DNLP_%26_Gera%C3%A7%C3%A3o_de_vetores_RN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Geraçao de vetor para rede neural

## Configuração de Ambiente

In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)
base_folder = 'drive/MyDrive/BTC-Dados-G4'

Mounted at /content/drive/


In [2]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import scipy.sparse

from sklearn.model_selection import ShuffleSplit, cross_validate, train_test_split

from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

from sklearn.metrics import mean_squared_error

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

## Caregamento dos dados

In [4]:
df = pd.read_csv(f'{base_folder}/train_clean_rev2.csv')
df.head()

Unnamed: 0,name,item_condition_id,brand_name,price,shipping,item_description,date,stock,main_cat,sub_cat_1,sub_cat_2
0,MLB Cincinnati Reds T Shirt Size XL,3,,10.0,1,No description yet,2018-6-18,27,Men,Tops,T-shirts
1,Razer BlackWidow Chroma Keyboard,3,Razer,52.0,0,This keyboard is in great condition and works ...,2018-3-18,15,Electronics,Computers & Tablets,Components & Parts
2,AVA-VIV Blouse,1,Target,10.0,1,Adorable top with a hint of lace and a key hol...,2018-10-25,14,Women,Tops & Blouses,Blouse
3,Leather Horse Statues,1,,35.0,1,New with tags. Leather horses. Retail for [rm]...,2018-2-20,1,Home,Home Décor,Home Décor Accents
4,24K GOLD plated rose,1,,44.0,0,Complete with certificate of authenticity,2018-4-16,13,Women,Jewelry,Necklaces


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1476204 entries, 0 to 1476203
Data columns (total 11 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   name               1476204 non-null  object 
 1   item_condition_id  1476204 non-null  int64  
 2   brand_name         846982 non-null   object 
 3   price              1476204 non-null  float64
 4   shipping           1476204 non-null  int64  
 5   item_description   1476204 non-null  object 
 6   date               1476204 non-null  object 
 7   stock              1476204 non-null  int64  
 8   main_cat           1476204 non-null  object 
 9   sub_cat_1          1476204 non-null  object 
 10  sub_cat_2          1476204 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 123.9+ MB


In [6]:
df.drop(['brand_name'], axis=1, inplace=True)
df.dropna(subset=['item_description'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1476204 entries, 0 to 1476203
Data columns (total 10 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   name               1476204 non-null  object 
 1   item_condition_id  1476204 non-null  int64  
 2   price              1476204 non-null  float64
 3   shipping           1476204 non-null  int64  
 4   item_description   1476204 non-null  object 
 5   date               1476204 non-null  object 
 6   stock              1476204 non-null  int64  
 7   main_cat           1476204 non-null  object 
 8   sub_cat_1          1476204 non-null  object 
 9   sub_cat_2          1476204 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 123.9+ MB


In [7]:
df['name'] = df['name'].apply(lambda x: x.lower().strip())
df['main_cat'] = df['main_cat'].apply(lambda x: x.lower().strip())
df['sub_cat_1'] = df['sub_cat_1'].apply(lambda x: x.lower().strip())
df['sub_cat_2'] = df['sub_cat_2'].apply(lambda x: x.lower().strip())

In [None]:
# df['category_name'] = df['category_name'].apply(lambda x: x.lower().strip())
# df['category_name'] = df['category_name'].str.split('/')

In [8]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
mlb = MultiLabelBinarizer()
# vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
# tfidf_transformer = TfidfTransformer()
tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=50000)

In [35]:
def token_processor(tokens):
    processed_token = []
    for token in tokens:
        token = token.lower()
        token = lemmatizer.lemmatize(token)
        
        if token not in stop_words:
            token = stemmer.stem(token)
            processed_token.append(token)
        
    return processed_token

In [36]:
def tokenizer(texts):
    documents = []
    for t in texts:
        t = re.sub(r'[^\w\s]','', t)
        tokens = token_processor(word_tokenize(t))
        documents.append(' '.join(tokens))
    return documents

In [37]:
def get_vector(text):
    # X_counts = vectorizer.fit_transform(tokenizer(text))
    X_tfidf = tfidf.fit_transform(tokenizer(text))
    return X_tfidf

In [38]:
name = get_vector(df.name)
item_description = get_vector(df.item_description)

In [None]:
# scipy.sparse.save_npz(f'{base_folder}/names.npz', name)
# scipy.sparse.save_npz(f'{base_folder}/item_description.npz', item_description)
# sparse_matrix = scipy.sparse.load_npz('/caminho/nome_arquivo.npz')

In [None]:
# name = scipy.sparse.load_npz(f'{base_folder}/names.npz')
# item_description = scipy.sparse.load_npz(f'{base_folder}/item_description.npz')

In [39]:
ohe = OneHotEncoder()

In [40]:
main_cat = ohe.fit_transform(df.main_cat.values.reshape(1, -1))
sub_cat_1 = ohe.fit_transform(df.sub_cat_1.values.reshape(1, -1))
sub_cat_2 = ohe.fit_transform(df.sub_cat_2.values.reshape(1, -1))

In [72]:
item_condition = np.array(df.item_condition_id.values.reshape(1, -1))

In [73]:
item_condition

array([[3, 3, 1, ..., 2, 3, 1]])

In [69]:
vector = np.hstack((name,
                   item_description,
                   main_cat.reshape(-1,1),
                   sub_cat_1.reshape(-1,1),
                   sub_cat_1.reshape(-1,1),
                   item_condition))

In [110]:
X = pd.DataFrame([name.data, item_description.data, main_cat.reshape(-1,1).data, sub_cat_1.reshape(-1,1).data, sub_cat_1.reshape(-1,1).data])

KeyboardInterrupt: ignored

In [97]:
X.shape

(5,)

In [58]:
1476204*7

10333428

In [56]:
df.item_condition_id.shape

(1476204,)

In [55]:
vector

array([<1476204x50000 sparse matrix of type '<class 'numpy.float64'>'
       	with 8118551 stored elements in Compressed Sparse Row format>,
       <1476204x50000 sparse matrix of type '<class 'numpy.float64'>'
       	with 33466180 stored elements in Compressed Sparse Row format>,
       <1476204x1 sparse matrix of type '<class 'numpy.float64'>'
       	with 1476204 stored elements in COOrdinate format>       , ..., 0,
       1, 0], dtype=object)

In [None]:
# np.save(f'{base_folder}/train_vector_cnn', vector)

In [80]:
from sklearn.model_selection import train_test_split

In [81]:
y = df['price']

In [101]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

ValueError: ignored

In [None]:
from sklearn import svm

In [None]:
regr = svm.SVR()
regr.fit(X_train, y_train)

In [None]:
y_pred = regr.predict(X_val)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
rmse = mean_squared_error()
rmse(y_val, y_pred)