In [5]:
import os
import pandas as pd
import numpy as np
from scipy.stats import randint
import seaborn as sns # used for plot interactive graph. 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import re
import string
#import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

## Load Data and preprocess

In [6]:
df = pd.read_csv('homework_train.csv')

In [7]:
def text_cleaning(text:str)->str:
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    # text = re.sub('\w*\d\w*', '', text)
    return text

def price_to_int(text:str):
    text = text.replace('Rp','').replace('.','')
    return int(text)

In [8]:
df['text_cleaned'] = df['text'].apply(lambda x:text_cleaning(x))
df['price_int'] = df['price'].apply(lambda x:price_to_int(x))

In [9]:
df_cleaned = df[['text_cleaned','price_int','type_']]

## use ML model

In [10]:
X = df_cleaned[['text_cleaned','price_int']] # Collection of documents
y = df_cleaned['type_'] # Target or the labels we want to predict (i.e., the 13 different complaints of products)
X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    random_state = 0)


In [11]:
tfidf = TfidfVectorizer(sublinear_tf=True, 
                        min_df=5,
                        max_features=1000)

fitted_vectorizer = tfidf.fit(X_train['text_cleaned'])
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train['text_cleaned']).toarray()

In [12]:
model = LogisticRegression().fit(tfidf_vectorizer_vectors, y_train)

## Check model

In [None]:
from sklearn.metrics import classification_report

tfidf_vectorizer_vectors_test = fitted_vectorizer.transform(X_val['text_cleaned']).toarray()
y_pred = model.predict(tfidf_vectorizer_vectors_test)
print(classification_report(y_val, y_pred, target_names=model.classes_))

                              precision    recall  f1-score   support

      aksesoris-game-console       0.83      0.74      0.78      1229
         aksesoris-handphone       0.59      0.71      0.64       981
            aksesoris-kamera       0.74      0.67      0.70      1273
   aksesoris-komputer-laptop       0.72      0.69      0.70      1200
     aksesoris-mobile-gaming       0.81      0.78      0.80      1262
         aksesoris-pc-gaming       0.84      0.82      0.83      1188
            aksesoris-tablet       0.74      0.63      0.68      1052
      alat-pendingin-ruangan       0.89      0.89      0.89      1345
                       audio       0.73      0.74      0.74      1243
      baterai-charger-kamera       0.85      0.88      0.86      1204
                     cd-game       0.80      0.86      0.83      1269
       cleaning-tools-kamera       0.92      0.85      0.89      1264
             desktop-mini-pc       0.83      0.79      0.81      1279
            elektro

## Predict on test data

In [None]:
test_data = pd.read_csv('homework_test.csv')

In [None]:
def text_cleaning(text:str)->str:
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    # text = re.sub('\w*\d\w*', '', text)
    return text

def price_to_int(text:str):
    text = text.replace('Rp','').replace('.','')
    return int(text)

In [None]:
test_data['text_cleaned'] = test_data['text'].apply(lambda x:text_cleaning(x))
test_data['price_int'] = test_data['price'].apply(lambda x:price_to_int(x))

In [None]:
test_data = test_data[['index','text_cleaned']] 

In [None]:
tfidf_vectorizer_vectors_test = fitted_vectorizer.transform(test_data['text_cleaned']).toarray()
y_pred = model.predict(tfidf_vectorizer_vectors_test)

In [None]:
test_data['pred'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['pred'] = y_pred


In [None]:
test_data[['index','pred']].to_csv('result_submission.csv',index=False)