# **Library**

In [1]:
import pandas as pd
import re
import numpy as np
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, precision_score, precision_recall_curve, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pickle

# **Train Fungsi 3 (Classify Song Lyric)**

In [2]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

idn_stopwords = set(stopwords.words('indonesian'))
eng_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
filtering = set(idn_stopwords)
filtering.update(eng_stopwords)

In [4]:
filtering

{'a',
 'about',
 'above',
 'ada',
 'adalah',
 'adanya',
 'adapun',
 'after',
 'again',
 'against',
 'agak',
 'agaknya',
 'agar',
 'ain',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'all',
 'am',
 'amat',
 'amatlah',
 'an',
 'and',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'any',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'are',
 'aren',
 "aren't",
 'artinya',
 'as',
 'asal',
 'asalkan',
 'at',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'be',
 'beberapa',
 'because',
 'been',
 'before',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'being',
 'bekerja',
 'belakang',
 'belakangan',
 'below',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 

In [5]:
len(filtering)

936

In [6]:
df1 = pd.read_csv("dataset/songs lyric.csv")
df1 = df1[['artist','song','lirik','Label']]
df1 = df1.loc[df1['Label'] != 'no match']
#remove'\n' from the lyrics
re_drop = re.compile(r'\n')        
df1[['lirik']] = df1[['lirik']].applymap(lambda x:re_drop.sub(' ',x))

df1

Unnamed: 0,artist,song,lirik,Label
0,Yura Yunita,Cinta dan Rahasia,Terakhir kutatap mata indahmu Di bawah bintang...,False
1,Kaleb J,Now I know,Aku tak menyadari kau t'lah menaruh hati Kepad...,False
2,Azmi,Pernah,Ada apa kau bertemu dia Mungkinkah kau ingin b...,False
3,Tulus,Pamit,Tubuh saling bersandar Ke arah mata angin berb...,False
4,Sheila on 7,Anugerah Terindah,Melihat tawamu Mendengar senandungmu Terlihat ...,False
...,...,...,...,...
127,Young Lex,Plastik,Alah paling kontroversi lagi ni Pansos lagi sa...,True
128,Achmad Sawadi,Lelaki Kardus,Bapakku kawin lagi Aku ditinggalin Aku sakit h...,True
129,The Panas Dalam,Seperti Seekor Babi,Rambutnya tipis jadi gitaris Seperti seekor ba...,True
130,Anjar Ox's,Ngacca Dulu,"Pembenci menghina, gua lawan tertawa Lu mau ka...",True


In [7]:
df2 = pd.read_csv("dataset/subsongdata_57650.csv")
df2 = df2[['artist','song','text','explicit_label']]
df2 = df2.loc[df2['explicit_label'] != 'no match']
#remove'\n' from the lyrics
re_drop = re.compile(r'\n')        
df2[['text']] = df2[['text']].applymap(lambda x:re_drop.sub(' ',x))
df2.rename(columns = {"text": "lirik", "explicit_label": "Label"}, inplace=True)

df2

Unnamed: 0,artist,song,lirik,Label
1,ABBA,"Andante, Andante","Take it easy with me, please Touch me gently...",False
2,ABBA,As Good As New,I'll never know why I had to go Why I had to...,False
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,False
7,ABBA,Chiquitita,"Chiquitita, tell me what's wrong You're ench...",False
11,ABBA,Dancing Queen,"You can dance, you can jive, having the time o...",False
...,...,...,...,...
57593,Zao,To Think Of You Is To Treasure An Absent Memory,When you shut your eyes and fell asleep Dark...,False
57605,Zebra,As I Said Before,And I said before I don't want no more And...,False
57608,Zebra,Hard Living Without You,Nothing to say no place to hide I can't find...,False
57609,Zebra,When You Get There,You wake up in the morning And you're not fe...,False


In [8]:
song_df = pd.merge(df1,df2,how="outer")

In [9]:
song_df

Unnamed: 0,artist,song,lirik,Label
0,Yura Yunita,Cinta dan Rahasia,Terakhir kutatap mata indahmu Di bawah bintang...,False
1,Kaleb J,Now I know,Aku tak menyadari kau t'lah menaruh hati Kepad...,False
2,Azmi,Pernah,Ada apa kau bertemu dia Mungkinkah kau ingin b...,False
3,Tulus,Pamit,Tubuh saling bersandar Ke arah mata angin berb...,False
4,Sheila on 7,Anugerah Terindah,Melihat tawamu Mendengar senandungmu Terlihat ...,False
...,...,...,...,...
24803,Zao,To Think Of You Is To Treasure An Absent Memory,When you shut your eyes and fell asleep Dark...,False
24804,Zebra,As I Said Before,And I said before I don't want no more And...,False
24805,Zebra,Hard Living Without You,Nothing to say no place to hide I can't find...,False
24806,Zebra,When You Get There,You wake up in the morning And you're not fe...,False


In [10]:
#fungsi untuk menghapus semua karakter non-alfabet
def clean(text):
  text = re.sub('[^A-Za-z]+', ' ', text)
  return text
#lowercase
def casefolding(tweet):
  tweet = tweet.lower()
  tweet = tweet.strip(" ")
  tweet = re.sub(r'[?|$|.|!²_:")(-+.]','',tweet)
  return tweet

song_df['lirik'] = song_df['lirik'].apply(clean)
song_df['lirik'] = song_df['lirik'].apply(casefolding)
song_df

Unnamed: 0,artist,song,lirik,Label
0,Yura Yunita,Cinta dan Rahasia,terakhir kutatap mata indahmu di bawah bintang...,False
1,Kaleb J,Now I know,aku tak menyadari kau t lah menaruh hati kepad...,False
2,Azmi,Pernah,ada apa kau bertemu dia mungkinkah kau ingin b...,False
3,Tulus,Pamit,tubuh saling bersandar ke arah mata angin berb...,False
4,Sheila on 7,Anugerah Terindah,melihat tawamu mendengar senandungmu terlihat ...,False
...,...,...,...,...
24803,Zao,To Think Of You Is To Treasure An Absent Memory,when you shut your eyes and fell asleep dark c...,False
24804,Zebra,As I Said Before,and i said before i don t want no more and i c...,False
24805,Zebra,Hard Living Without You,nothing to say no place to hide i can t find a...,False
24806,Zebra,When You Get There,you wake up in the morning and you re not feel...,False


In [11]:
for i in range(song_df.shape[0]):
    l = song_df['Label'][i]
    if l==False:
      l = 'False'
    elif l==True :
      l = 'True'
    song_df['Label'][i] = l

In [12]:
song_df['Label'].values

array(['False', 'False', 'False', ..., 'False', 'False', 'False'],
      dtype=object)

In [13]:
song_df[(song_df['Label']=='False')].shape

(23418, 4)

In [14]:
song_df[(song_df['Label']=='True')].shape

(1390, 4)

In [15]:
song_df_1 = song_df.loc[song_df['Label'] == 'True']
song_df_0 = song_df.loc[song_df['Label'] == 'False']
song_df_0 = song_df_0.sample(n=23418, replace=False, random_state=100)

x = song_df_0[['artist','song','lirik']].append(song_df_1[['artist','song','lirik']])
y = song_df_0[['Label']].append(song_df_1[['Label']])

#train : test = 8 : 2
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=4961, random_state=100)

In [16]:
x_train

Unnamed: 0,artist,song,lirik
2990,George Strait,If You Ain't Lovin' (You Ain't Livin'),if you got a cadillac boy and a room shack boy...
17846,Little Mix,Secret Love,when you hold me in the street and you kiss me...
4076,John Martyn,Hole In The Rain,between the drizzle and the drop between the d...
13574,Eric Clapton,Knockin' On Heaven's Door,ma take this badge off of me i can t use it an...
15196,Hanson,Tearing It Down,i am taking a chance walking with my laces loo...
...,...,...,...
14149,Freddie Aguilar,Anak Pawis,anak pawis ang tawag sa akin ako raw ay basaha...
18919,Misfits,Spinal Remains,this isn t really death this isn t really life...
16749,Judas Priest,Living After Midnight,living after midnight rockin to the dawn lovin...
11557,Carpenters,Piano Picker,everybody always asks me how i got to play so ...


In [17]:
y_train

Unnamed: 0,Label
2990,False
17846,False
4076,False
13574,False
15196,False
...,...
14149,False
18919,True
16749,False
11557,False


In [18]:
x_test

Unnamed: 0,artist,song,lirik
21708,Roy Orbison,Indian Wedding,there once was an indian brave by the name of ...
11008,Blur,Young And Lovely,friday s child is planning to out for the firs...
17529,Kris Kristofferson,Shipwrecked In The Eighties,well you fight like the devil to just keep you...
7995,Steve Miller Band,Lovin' Cup,my mama she done told me soon you be a man and...
24798,Zao,All Else Failed,a throne in heaven sat empty for years why for...
...,...,...,...
18627,Metallica,Am I Evil?,my mother was a witch she was burned alive tha...
14498,George Strait,Good News Bad News,i ve got some good news can t wait to tell you...
1865,Dolly Parton,Home For Pete's Sake,i became a woman of the world cause i was fed ...
13780,Faith Hill,When The Lights Go Down,when the lights go down he ll be fillin a pan ...


In [19]:
y_test

Unnamed: 0,Label
21708,False
11008,False
17529,False
7995,False
24798,False
...,...
18627,False
14498,False
1865,False
13780,False


In [20]:
train_label = []
for i in range(len(y_train)):
    l = y_train.iloc[i,0]
    if l=='False':
      l = 0
    else :
      l = 1
    train_label.append(l)

test_label = []
for i in range(len(y_test)):
    l = y_test.iloc[i,0]
    if l=='False':
      l = 0
    else:
      l = 1
    test_label.append(l)
    
train_data = []
for i in range(len(x_train)):
    text = x_train.iloc[i,2]
    train_data.append(text)
    
test_data = []
for i in range(len(x_test)):
    text = x_test.iloc[i,2]
    test_data.append(text)

In [21]:
type(test_data)

list

**Custom Feature**

In [22]:
file1 = open('dataset/indonesian.csv','r')
file2 = open('dataset/badwords.txt','r')
file1 = list(file1)
file2 = list(file2)

In [23]:
bad_words= []
for w in file1:
    bad_words.append(re.sub(r'\n','',w))
for w in file2:
    bad_words.append(re.sub(r'\n','',w))

In [24]:
bad_words

['adult',
 'akouka',
 'alkohol',
 'anak haram',
 'anak yatim',
 'analex',
 'anjing',
 'anjink',
 'anjir',
 'arsundal',
 'asu',
 'autis',
 'azizay',
 'babi',
 'babi lu',
 'bacot',
 'bajingan',
 'bajingan tengik',
 'bakka',
 'banci',
 'bandar',
 'bangke',
 'bangsat',
 'bawel',
 'bebon',
 'bedebah',
 'bedon',
 'beer',
 'bego',
 'begok',
 'bencong',
 'berak',
 'bercinta',
 'berengsek',
 'bersetubuh',
 'bestiality',
 'betting',
 'biadab',
 'bispak',
 'bitch',
 "blo'on",
 'blowjob',
 'boâ€™ol',
 'bodo',
 'bodoh',
 'bodooohhh',
 'bokep',
 'boker',
 'bokong',
 'borok',
 'bot',
 'breast',
 'brengsek',
 'brengsex',
 'brengsexxx',
 'buah dada',
 'buah zakar',
 'buaya',
 'buaya darat',
 'budeg',
 'bugil',
 'bujang inam',
 'Burung',
 'cabe lo',
 'cabo',
 'cacat',
 'cacat lu',
 'cangcut',
 'caper',
 'ccookkk',
 'cebong',
 'celaka',
 'celeng',
 'celsit',
 'centil',
 'chealeng',
 'chealshit',
 'cheleng',
 'cheleng tikus',
 'chellenk',
 'chelshit',
 'chleng',
 'cilaka',
 'cina',
 'citikus',
 'citykus',

In [25]:
len(bad_words)

814

In [26]:
def get_bad_words(review):
    target_word = bad_words
    count = 0
    threshold = 0
    for t in target_word:
        if review.find(t) != -1:
            count += 1
    return count > threshold

def get_num_words(review):
    threshold = 0
    words = review.split(' ')
    count = len(list(words))
    return count > threshold

def find_bad_words(review,finded):
    target_word = bad_words
    count = 0
    finded = []
    for t in target_word:
        if review.find(t) != -1:
            finded.append(t)
    return finded

In [27]:
class CustomFeats(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.feat_names = set()

    def fit(self, x, y=None):
        return self

    @staticmethod
    def features(review):
        return {
          'num_word': get_num_words(review),
          'bad_word': get_bad_words(review)
        }

    def get_feature_names(self):
        return list(self.feat_names)
      
    def transform(self, reviews):
        feats = []
        for review in reviews:
            f = self.features(review)
            [self.feat_names.add(k) for k in f] 
            feats.append(f)
        return feats

#feats = make_pipeline(CustomFeats(), DictVectorizer())
feats = FeatureUnion([
     ('custom', make_pipeline(CustomFeats(), DictVectorizer())),
     ('bag_of_words', TfidfVectorizer(stop_words=filtering))
 ])

**Klasifikasi**

In [28]:
train_vecs = feats.fit_transform(train_data)
test_vecs = feats.transform(test_data)



**Algoritma Random Forest**

In [29]:
# import random forest model
rf = RandomForestClassifier()
# train model
rf.fit(train_vecs, train_label)
# predict test dataset
test_preds = rf.predict(test_vecs)

acc = accuracy_score(test_label, test_preds)
print("Accuracy : ", acc, "\n")
cm = confusion_matrix(test_label, test_preds)
print("Confusion Matrix : \n", cm, " \n")
report = classification_report(test_label, test_preds)
print(report)

Accuracy :  0.9647248538601089 

Confusion Matrix : 
 [[4678    9]
 [ 166  108]]  

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      4687
           1       0.92      0.39      0.55       274

    accuracy                           0.96      4961
   macro avg       0.94      0.70      0.77      4961
weighted avg       0.96      0.96      0.96      4961



In [30]:
data = rf
with open('rf.pkl', 'wb') as file:
    pickle.dump(data, file)

**Algoritma Klasifikasi KNN**

In [31]:
# import KNeighborsClassifier model
knn = KNeighborsClassifier(n_neighbors=10)
# train model
knn.fit(train_vecs, train_label)
# predict test dataset
test_preds = knn.predict(test_vecs)

acc = accuracy_score(test_label, test_preds)
print("Accuracy : ", acc, "\n")
cm = confusion_matrix(test_label, test_preds)
print("Confusion Matrix : \n", cm, " \n")
report = classification_report(test_label, test_preds)
print(report)

Accuracy :  0.9578713968957872 

Confusion Matrix : 
 [[4674   13]
 [ 196   78]]  

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      4687
           1       0.86      0.28      0.43       274

    accuracy                           0.96      4961
   macro avg       0.91      0.64      0.70      4961
weighted avg       0.95      0.96      0.95      4961



In [32]:
data = knn
with open('knn.pkl', 'wb') as file:
    pickle.dump(data, file)

**Algoritma Klasifikasi Decision Tree**

In [33]:
# import decision tree model
dt = DecisionTreeClassifier(min_samples_split=0.4, max_depth=77)
# train model
dt.fit(train_vecs, train_label)
# predict test dataset
test_preds = dt.predict(test_vecs)

acc = accuracy_score(test_label, test_preds)
print("Accuracy : ", acc, "\n")
cm = confusion_matrix(test_label, test_preds)
print("Confusion Matrix : \n", cm, " \n")
report = classification_report(test_label, test_preds)
print(report)

Accuracy :  0.9653295706510784 

Confusion Matrix : 
 [[4599   88]
 [  84  190]]  

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      4687
           1       0.68      0.69      0.69       274

    accuracy                           0.97      4961
   macro avg       0.83      0.84      0.84      4961
weighted avg       0.97      0.97      0.97      4961



In [34]:
data = dt
with open('dt.pkl', 'wb') as file:
    pickle.dump(data, file)

**Algoritma Klasifikasi SVM**

In [35]:
# import svm model
svm = SVC(C = 10000, kernel = 'rbf')
# train model
svm.fit(train_vecs, train_label)
# predict test dataset
test_preds = svm.predict(test_vecs)

acc = accuracy_score(test_label, test_preds)
print("Accuracy : ", acc, "\n")
cm = confusion_matrix(test_label, test_preds)
print("Confusion Matrix : \n", cm, " \n")
report = classification_report(test_label, test_preds)
print(report)

Accuracy :  0.9661358597057045 

Confusion Matrix : 
 [[4661   26]
 [ 142  132]]  

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      4687
           1       0.84      0.48      0.61       274

    accuracy                           0.97      4961
   macro avg       0.90      0.74      0.80      4961
weighted avg       0.96      0.97      0.96      4961



In [36]:
data = svm
with open('svm.pkl', 'wb') as file:
    pickle.dump(data, file)

# **Try to Classify**

In [37]:
# load model
with open('svm.pkl', 'rb') as file:
    data = pickle.load(file)

model = data

In [41]:
title = str(input('Judul Lagu : '))
lirik = str(input('Lirik Lagu : '))

finded = []
lirik = clean(lirik)
lirik = casefolding(lirik)
find = find_bad_words(lirik,finded)
lirik = [lirik]
lirik = feats.transform(lirik)
result = model.predict(lirik)

print(result)

if result == 0 :
    print("This song doesn't contain any badwords")
else :
    print("This song contains any badwords")
    print(find)

Judul Lagu : in
Lirik Lagu : jaya selalu


ValueError: Iterable over raw text documents expected, string object received.

In [40]:
title = str(input('Judul Lagu : '))
lirik = str(input('Lirik Lagu : '))

finded = []
lirik = clean(lirik)
lirik = casefolding(lirik)
find = find_bad_words(lirik,finded)
lirik = [lirik]
lirik2 = feats.transform(lirik)
result = model.predict(lirik2)

print(result)

if result == 0 :
    print("This song doesn't contain any badwords")
else :
    print("This song contains any badwords")
    print(find)

Judul Lagu : indonesia
Lirik Lagu : jaya selali
[0]
This song doesn't contain any badwords
