In [1]:
# Importing libraries 
import nltk
import pandas as pd
from googletrans import Translator
import contractions
from unidecode import unidecode
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
df1 = pd.read_csv("dataset_es_train.csv", on_bad_lines="skip" , engine="python")
df1.head(3)

Unnamed: 0,review_id,product_id,reviewer_id,stars,review_body,review_title,language,product_category
0,es_0491108,product_es_0296024,reviewer_es_0999081,1,Nada bueno se me fue ka pantalla en menos de 8...,television Nevir,es,electronics
1,es_0869872,product_es_0922286,reviewer_es_0216771,1,"Horrible, nos tuvimos que comprar otro porque ...",Dinero tirado a la basura con esta compra,es,electronics
2,es_0811721,product_es_0474543,reviewer_es_0929213,1,Te obligan a comprar dos unidades y te llega s...,solo llega una unidad cuando te obligan a comp...,es,drugstore


In [3]:
sample = df1.sample(1000)
sample.reset_index(drop=True,inplace=True)


In [4]:
data = sample[["review_body","stars"]]


In [5]:
def lang_trans(data):
    translator = Translator()
    translated_text = translator.translate(data)
    return translated_text.text


In [6]:
data["translated_reviews"] = data["review_body"].apply(lang_trans)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["translated_reviews"] = data["review_body"].apply(lang_trans)


In [7]:
data

Unnamed: 0,review_body,stars,translated_reviews
0,No merece la pena es preferible gastar algo má...,3,"It is not worth it, it is preferable to spend ..."
1,"Calidad inadecuada, se ajusta perfectamente al...",1,"Inadequate quality, it fits the phone perfectl..."
2,le doy 4 estrellas porque aunque venia bien em...,4,I give it 4 stars because although it came wel...
3,No vale para todos los volantes pero buena mat...,3,It is not suitable for all steering wheels but...
4,Muy buenos para tiradas cortas...caben en cual...,4,Very good for short runs...they fit on any run...
...,...,...,...
995,A mi hija de 6 años le ha encantado por las lu...,2,My 6-year-old daughter loved it because of the...
996,"Modelo original, pero calidad bastante mala. T...",2,"Original model, but quite poor quality. After ..."
997,Más grande de mi que ponía en la web,3,Biggest of me that I put on the web
998,"Son cómodas, pero no se puede estira mucho sin...",4,"They are comfortable, but you can't stretch th..."


In [8]:
data = data[["review_body","stars"]]

In [9]:
data["stars"].unique()

array([3, 1, 4, 5, 2], dtype=int64)

In [10]:
data["stars"].value_counts()

stars
5    213
2    210
3    201
4    197
1    179
Name: count, dtype: int64

In [11]:
# Remove Newlines 
def remove_newline(data):
    clean_text = data.replace("\\n"," ").replace("\n"," ").replace("\\t"," ")
    return clean_text

# Contraction mapping 
def expand_text(data):
    expanded_doc = contractions.fix(data)
    return expanded_doc

# handle accented characters
def accented_char(data):
    fixed_text = unidecode(data)
    return fixed_text

# Clean data 
stopwords_list = stopwords.words("english")
stopwords_list.remove("no")
stopwords_list.remove("nor")
stopwords_list.remove("not")

def clean_data(data):
    tokens = word_tokenize(data)
    clean_text = [word.lower() for word in tokens if (word not in punctuation) and (word.lower() not in stopwords_list) and (len(word)>2) and (word.isalpha())]
    return clean_text

# Lemmatization 
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    final_text = []
    for word in data:
        lemmatized_word = lemmatizer.lemmatize(word)
        final_text.append(lemmatized_word)
    return final_text

def join_list(data):
    return " ".join(data)

In [12]:
def target_column(data):
    if (data == 1) or (data == 2):
        return 0 
    if (data == 3):
        return 1
    if (data == 4) or (data == 5):
        return 2 
    
data["target"] = data["stars"].apply(target_column)


In [13]:
data["target"]

0      1
1      0
2      2
3      1
4      2
      ..
995    0
996    0
997    1
998    2
999    0
Name: target, Length: 1000, dtype: int64

In [14]:
data

Unnamed: 0,review_body,stars,target
0,No merece la pena es preferible gastar algo má...,3,1
1,"Calidad inadecuada, se ajusta perfectamente al...",1,0
2,le doy 4 estrellas porque aunque venia bien em...,4,2
3,No vale para todos los volantes pero buena mat...,3,1
4,Muy buenos para tiradas cortas...caben en cual...,4,2
...,...,...,...
995,A mi hija de 6 años le ha encantado por las lu...,2,0
996,"Modelo original, pero calidad bastante mala. T...",2,0
997,Más grande de mi que ponía en la web,3,1
998,"Son cómodas, pero no se puede estira mucho sin...",4,2


In [16]:
data["translated_reviews"] = data["review_body"].apply(lang_trans)

In [17]:
x_train , x_test , y_train , y_test = train_test_split(data["translated_reviews"],data["target"],test_size=0.25,random_state=42)

clean_text_train = x_train.apply(remove_newline)

clean_text_train = clean_text_train.apply(expand_text)

clean_text_train = clean_text_train.apply(accented_char)

clean_text_train = clean_text_train.apply(clean_data)

clean_text_train = clean_text_train.apply(lemmatization)

clean_text_train = clean_text_train.apply(join_list)


In [18]:
count = CountVectorizer(max_df=0.95 , max_features=1000)
count_train_val = count.fit_transform(clean_text_train)


In [19]:
mnb_count = MultinomialNB()
mnb_count.fit(count_train_val.A , y_train)


In [20]:
import pickle
pickle.dump(count , open("count2.pkl" , "wb"))


In [22]:
import pickle 
pickle.dump(mnb_count , open("model2.pkl", "wb"))
