In [8]:
import re
import numpy as np
import pandas as pd
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import TweetTokenizer

# cleaning + lowercase
def cleanlower(ulasan):
  ulasan = ulasan.str.replace(r"[^a-zA-Z]", ' ', regex=True)
  ulasan = ulasan.str.lower()
  return ulasan

# stemming
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer() 
def stemming(ulasan):
  ulasan = ulasan.apply(lambda x: stemmer.stem(x))
  return ulasan

#tokenizing
def tokenizing(ulasan):
  tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
  ulasan = ulasan.apply(lambda x: tokenizer.tokenize(x))
  return ulasan

#stopword
# string_khusus = ['id','ulasan']
url = 'https://raw.githubusercontent.com/appermana1/data/master/stopword.csv'
stopword_import = pd.read_csv(url)
stopword_custom = stopword_import['0']
stopword_custom = stopword_custom.to_numpy()
# print(stopword_custom)

def stopwords(ulasan):
    output= " ".join([i for i in ulasan if i not in stopword_custom])
    return output

def preprocessing(ulasan):
  b = cleanlower(ulasan)
  c = stemming(b)
  d = tokenizing(c)
  e = d.apply(lambda x:stopwords(x))
  return e

#Split
def split(ulasan):
  ulasan = ulasan.str.split()
  return ulasan

def unique_term(word):
  world = ''
  for a in word:
    world = np.union1d(a, world)
  return world

def bag_of_word(ulasan):

  from sklearn.feature_extraction.text import CountVectorizer
  coun_vect = CountVectorizer()
  count_matrix = coun_vect.fit_transform(ulasan)
  count_array = count_matrix.toarray()
  df_bow = pd.DataFrame(data=count_array,columns = coun_vect.get_feature_names())
  max_value = np.max(count_array)

  bow = dict()
  bow['max'] = max_value
  bow['bow'] = df_bow
  
  return bow

def fix_preprocessing(df_ulasan):
  prp = preprocessing(df_ulasan)
  df_bow = bag_of_word(prp)
  cek_term = unique_term(tokenizing(prp))
  return df_bow


def normalize(bow, max_value_count):
  for b in range(1, max_value_count+1):
    for a in range(0, len(bow.columns)):
      bow.iloc[:, [a]] = bow.iloc[:, [a]].replace(to_replace=b,value=1)
  return bow



def chi_label(df_bow, df_bow_label_column, var_label, term):
  data_bow = df_bow[df_bow_label_column.isin([var_label])]
  try:
      #print(term,',',var_label)
      jml_a = data_bow[term].value_counts()[1.0]
  except KeyError:
      jml_a = 0
  try:
      jml_c = data_bow[term].value_counts()[0.0]
  except KeyError:
      jml_c = 0
  data_bow2 = df_bow[df_bow_label_column.ne(var_label)]
  try:
      jml_b = data_bow2[term].value_counts()[1.0]
  except KeyError:
      jml_b = 0
  try:
      jml_d = data_bow2[term].value_counts()[0.0]
  except KeyError:
      jml_d = 0
  #rumus 3.1
  chi_label = (jml_a*jml_d - jml_c*jml_b)**2/((jml_a + jml_c)*(jml_b + jml_d)*(jml_a + jml_b)*(jml_c + jml_d))
  return chi_label



def fix_chi_square(bow, max_value_count, df_label):
  df_normal = normalize(bow,max_value_count)
  df_process = pd.concat([df_label,df_normal], axis=1)
  count_label = df_label.value_counts() #no index here 
  term = df_normal.columns
  df_chi_square = pd.DataFrame({'term': []})
  for x in term:
    chi_total = 0 #inisiasi variabel harus disini
    for y in range(0,len(count_label)):
      label = count_label.index.tolist()[y]
      chi_label_value = chi_label(df_process,df_process.iloc[:,0],label, x)     
      #rumus 3.2
      chi_total += chi_label_value
    df_chi_square =df_chi_square.append({'term':x, 'Xtot':chi_total}, ignore_index=True)
  df_chi_square['rank'] = df_chi_square['Xtot'].rank(ascending=False, method="first")
  df_chi_square_rank = df_chi_square.sort_values(by='rank', ignore_index=True)
  return df_chi_square_rank



def prob_prior(df_label):
  total_ulasan = df_label.count()
  count_label = df_label.value_counts() #no index here
  df_prior = pd.DataFrame({'label': []})
  for x in range(0,len(count_label)):
    label = count_label.index.tolist()[x]
    jml_p = count_label[label]
    #rumus 3.3
    prior = jml_p/total_ulasan
    df_prior =df_prior.append({'label':label, 'prior':prior}, ignore_index=True)
  return df_prior



def transform_freq_table(df_label, bow):
  df_process = pd.concat([df_label, bow], axis=1)
  count_label = df_process.iloc[:,0].value_counts() #no index here
  term = df_process.columns
  df_prob_term = pd.DataFrame({'term': []})
  for x in bow:
    dict_prob = dict() #inisiasi dict harus disini
    for y in range(0,len(count_label)):
      label = count_label.index.tolist()[y]
      data_bow = df_process[df_process.iloc[:,0].isin([label])]
      jml = data_bow[x].sum()
      dict_prob[label] = jml
    dict_prob['term']=x #input dict harus disini
    df_prob_term = df_prob_term.append(dict_prob, ignore_index=True) #append dict harus disini
  return df_prob_term



def prob_likelihood(df_label_prior,df_term,df_freq_table):
  jml_all_term = 0
  for p in df_label_prior:
    jml = df_freq_table[p].agg('sum')
    jml_all_term += jml
  df_pct = pd.DataFrame({'term': []})
  for x in df_term:
    dict_pct = dict()
    # term = df_freq_table.iloc[x, 0] #there is an index here
    for y in df_label_prior:
      freq_c = df_freq_table[y].agg('sum')
      search_term = df_freq_table.loc[(df_freq_table['term']==x)]
      for z in search_term[y]:
        freq_tc = z
      # print(freq_tc)
      #rumus 3.4
      pct_value = (freq_tc+1)/(freq_c+jml_all_term)
      dict_pct[y] = pct_value
    dict_pct['term']=x
    df_pct = df_pct.append(dict_pct, ignore_index=True)
  return df_pct



def validate_test_bow(test_bow,df_likelihood_term):
  df_validate= pd.DataFrame()  
  for a in test_bow:
    for term in df_likelihood_term:
      if a == term:
        df_validate[a]=test_bow[a]
  return df_validate


def prob_posterior_doc(df_prior,prior_label,max_values_test_bow,validate_test_bow,df_likelihood,df_likelihood_term):
  doc_prob = pd.DataFrame()
  for label in prior_label:
    dict_replace_value = dict()
    dict_replace_value = validate_test_bow.copy()
    for term in validate_test_bow:
      get_term = df_likelihood[df_likelihood_term==term]
      get_value = get_term[label]
      for c in get_value: 
        dict_replace_value[term] = dict_replace_value[term]*c
    likelihood = dict_replace_value.sum(axis=1)
    get_prior_value = df_prior[prior_label==label]
    for prior in get_prior_value['prior']:

      #rumus 3.7
      doc_prob[label] = prior*likelihood
      
  return doc_prob


def predict_doc(prob_doc,df_test_ulasan,df_test, validate_bow):
  df_pred = pd.DataFrame()
  # print(prob_doc)
  max_values = prob_doc.max(axis = 1)
  df_pred = pd.concat([df_test,prob_doc], axis=1)
  for i in range(0,len(df_test_ulasan)):
    df_pred['pred']=0.0
  
  for i in range(0,len(df_test_ulasan)):
    if max_values[i]!=0:
      for x in prob_doc:
        if prob_doc[x][i]==max_values[i]:
          df_pred['pred'][i] = x #kolom pred harus inisiasi jml baris
          # print(x)
  return df_pred




def train_dataset(df_train,used_feature_selection, chi_square):

  v_limit = chi_square['rank'].quantile(used_feature_selection)

  selection = chi_square[chi_square['rank'] <= v_limit]

  bow = fix_preprocessing(df_train['ulasan'])

  df_prob_prior = prob_prior(df_train['GT'])

  pivot_table = transform_freq_table(df_train['GT'],bow['bow'])

  df_prob_likelihood = prob_likelihood(df_prob_prior['label'],selection['term'],pivot_table)

  train = dict()
  train['df_prior'] = df_prob_prior
  train['df_likelihood'] = df_prob_likelihood
  train['df_rank'] = selection

  return train


def predict_dataset(df_test,df_prior,df_likelihood):
  test_bow = fix_preprocessing(df_test['ulasan'])

  test_validate = validate_test_bow(test_bow['bow'],df_likelihood['term'])

  prob_doc = prob_posterior_doc(df_prior,df_prior['label'],test_bow['max'],test_validate,df_likelihood,df_likelihood['term'])

  pred = predict_doc(prob_doc,df_test['ulasan'],df_test,test_validate)

  test = dict()
  test['pred'] = pred
  test['validate'] = test_validate

  return test



def performance_test(y_test, y_pred):
  #importing confusion matrix
  from sklearn.metrics import confusion_matrix
  confusion = confusion_matrix(y_test, y_pred)
  print('Confusion Matrix\n')
  print(confusion)

  #importing accuracy_score, precision_score, recall_score, f1_score
  from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
  print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

  print('Micro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='micro')))
  print('Micro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='micro')))
  print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='micro')))

  print('Macro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='macro')))
  print('Macro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='macro')))
  print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='macro')))

  print('Weighted Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='weighted')))
  print('Weighted Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='weighted')))
  print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, y_pred, average='weighted')))

  from sklearn.metrics import classification_report
  print('\nClassification Report\n')
  print(classification_report(y_test, y_pred))

In [9]:
from sklearn.model_selection import train_test_split

def get_min_required_rows(test_size):
    return 1 / test_size

def make_stratified_splits(df, y_col, test_size):
    """
        for any class with rows less than min_required_rows corresponding to the input test_size,
        all the rows associated with the specific class will have a copy in both the train and test splits.
        
        example: if test_size is 0.2 (20% otherwise),
        min_required_rows = 5 (which is obtained from 1 / test_size i.e., 1 / 0.2)
        where the resulting splits will have 4 train rows (80%), 1 test row (20%)..
    """
    
    id_col = "no"
    temp_col = "same-class-rows"
    
    class_to_counts = df[y_col].value_counts()
    df[temp_col] = df[y_col].apply(lambda y: class_to_counts[y])
    
    min_required_rows = get_min_required_rows(test_size)
    copy_rows = df[df[temp_col] < min_required_rows].copy(deep=True)
    valid_rows = df[df[temp_col] >= min_required_rows].copy(deep=True)
    
    X = valid_rows[id_col].tolist()
    y = valid_rows[y_col].tolist()
    
    # notice, this train_test_split is a stratified split
    X_train, X_test, _, _ = train_test_split(X, y, test_size=test_size, random_state=1, stratify=y)
    
    # X_test = X_test + copy_rows[id_col].tolist()
    X_train = X_train + copy_rows[id_col].tolist()
    
    df.drop([temp_col], axis=1, inplace=True)
    
    test_df = df[df[id_col].isin(X_test)].copy(deep=True)
    train_df = df[df[id_col].isin(X_train)].copy(deep=True)
    
    print (f"number of rows in the original dataset: {len(df)}")
    
    test_prop = round(len(test_df) / len(df) * 100, 2)
    train_prop = round(len(train_df) / len(df) * 100, 2)
    print (f"number of rows in the splits: {len(train_df)} ({train_prop}%), {len(test_df)} ({test_prop}%)")

    dfy = dict()
    dfy['train'] = train_df
    dfy['test'] = test_df
    
    return dfy

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('data/dataset.csv')
dataset['split'] = dataset['entity'].astype(str) + "_" + dataset['GT'].astype(str)

coba = make_stratified_splits(dataset, y_col="split", test_size=0.2)

number of rows in the original dataset: 3000
number of rows in the splits: 2415 (80.5%), 585 (19.5%)


In [11]:
df_train = coba['train']
df_test = coba['test'][['no','entity','ulasan','GT']]
df_test = df_test.reset_index(drop=True)

In [12]:
new_bow = fix_preprocessing(dataset['ulasan'])
chi_square = fix_chi_square(new_bow['bow'],new_bow['max'], df_train['GT'])

train_data = train_dataset(dataset,1,chi_square)
df_prior = train_data['df_prior']
df_likelihood = train_data['df_likelihood']
pd.to_pickle(train_data,r'train_model.pickle')

In [None]:
df_prior

In [6]:
prediksi = predict_dataset(df_test,df_prior,df_likelihood)
df_prediksi = prediksi['pred']
df_prediksi = df_prediksi[df_prediksi['pred']!=0]
hasil = performance_test(df_prediksi['GT'],df_prediksi['pred'])

Confusion Matrix

[[446   0   0]
 [ 66   0   0]
 [ 63   0   0]]

Accuracy: 0.78

Micro Precision: 0.78
Micro Recall: 0.78
Micro F1-score: 0.78

Macro Precision: 0.26
Macro Recall: 0.33
Macro F1-score: 0.29

Weighted Precision: 0.60
Weighted Recall: 0.78
Weighted F1-score: 0.68

Classification Report

              precision    recall  f1-score   support

           D       0.78      1.00      0.87       446
           N       0.00      0.00      0.00        66
          TD       0.00      0.00      0.00        63

    accuracy                           0.78       575
   macro avg       0.26      0.33      0.29       575
weighted avg       0.60      0.78      0.68       575



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pred['pred'][i] = x #kolom pred harus inisiasi jml baris
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
www3 = pd.read_pickle(r'train_model.pickle')

In [8]:
www3['df_likelihood']

Unnamed: 0,label,prior
0,D,0.747412
1,N,0.132505
2,TD,0.120083
