In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
df = pd.read_csv('data_tweet_clean.csv')
df.tail(30)

Unnamed: 0,Tweet,Label,clean_tweet
683,otak wc otak babi yg menerima ahok kafir & kam...,HS,otak wc otak babi yg terima ahok kafir kamu ah...
684,DALAM PUTARAN DUA NANTI JIKA SAMPAI AHOK KAFIR...,HS,dalam putar dua nanti jika sampai ahok kafir c...
685,#IKLANAHOKJAHAT AHOK MENCIPTAKAN BARA API ANTA...,HS,iklanahokjahat ahok cipta bara api antara raky...
686,jangan heran pemerintahan jokowi sekarang ini ...,HS,jangan heran perintah jokowi sekarang ini buda...
687,Ya itu namanya cawgub goblok gak ngerti peratu...,HS,ya itu nama cawgub goblok gak ngei atur debat ...
688,BODOH jika para pendukung ahok yg Muslim tidak...,HS,bodoh jika para dukung ahok yg muslim tidak si...
689,Gubraaaaak!!!! Malu liatnya... Untung semalem ...,HS,gubraaaaak malu liat untung semalem debat ga k...
690,YG JELAS OTAK AHOK-DJAROT SUDAH SANGAT TUMPUL ...,HS,yg jelas otak ahok djarot sudah sangat tumpul ...
691,AHOK KAFIR BUKAN HANYA MALING TAPI JUGA GARONG...,HS,ahok kafir bukan hanya maling tapi juga garong...
692,Sumpah bego bgt sih kesel gue. Kenapa ada org ...,HS,sumpah bego bgt sih kesel gue kenapa ada org y...


In [3]:
df.shape

(713, 3)

In [4]:
df['Label'].value_counts()

Non_HS    453
HS        260
Name: Label, dtype: int64

# Vectorization

In [5]:
X = df['clean_tweet']
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X)
X_train_counts.shape

(713, 2299)

In [6]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(713, 2299)

In [8]:
le = LabelEncoder()
y = le.fit_transform(df['Label'])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y, test_size=0.2, random_state=0)

# Train model

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

mlp = MLPClassifier(random_state=1, max_iter=300)
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
accs = []
print("Report : \n", classification_report(y_test, y_pred))
print("Accuracy : ",accuracy_score(y_test,y_pred))
accs.append(accuracy_score(y_test, y_pred))

Report : 
               precision    recall  f1-score   support

           0       0.79      0.65      0.71        57
           1       0.79      0.88      0.84        86

    accuracy                           0.79       143
   macro avg       0.79      0.77      0.77       143
weighted avg       0.79      0.79      0.79       143

Accuracy :  0.7902097902097902


In [12]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
print("Report : \n", classification_report(y_test, y_pred))
print("Accuracy : ",accuracy_score(y_test,y_pred))
accs.append(accuracy_score(y_test, y_pred))

Report : 
               precision    recall  f1-score   support

           0       0.79      0.65      0.71        57
           1       0.79      0.88      0.84        86

    accuracy                           0.79       143
   macro avg       0.79      0.77      0.77       143
weighted avg       0.79      0.79      0.79       143

Accuracy :  0.7902097902097902


In [13]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=10, random_state=1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Report : \n", classification_report(y_test, y_pred))
print("Accuracy : ",accuracy_score(y_test,y_pred))
accs.append(accuracy_score(y_test, y_pred))

Report : 
               precision    recall  f1-score   support

           0       0.96      0.46      0.62        57
           1       0.73      0.99      0.84        86

    accuracy                           0.78       143
   macro avg       0.85      0.72      0.73       143
weighted avg       0.82      0.78      0.75       143

Accuracy :  0.7762237762237763


In [14]:
from sklearn.svm import SVC
svm = SVC(kernel='rbf')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print("Report : \n", classification_report(y_test, y_pred))
print("Accuracy : ",accuracy_score(y_test,y_pred))
accs.append(accuracy_score(y_test, y_pred))

Report : 
               precision    recall  f1-score   support

           0       1.00      0.54      0.70        57
           1       0.77      1.00      0.87        86

    accuracy                           0.82       143
   macro avg       0.88      0.77      0.79       143
weighted avg       0.86      0.82      0.80       143

Accuracy :  0.8181818181818182


In [15]:
# Perbandingan Akurasi
models = ['Multi Layer Perceptron','K-Nearest Neighbor','Random Forest', 'Support Vector Machine']
result_df = pd.DataFrame(list(zip(models, accs)), columns =['Model', 'Accuracy']) 
result_df

Unnamed: 0,Model,Accuracy
0,Multi Layer Perceptron,0.79021
1,K-Nearest Neighbor,0.79021
2,Random Forest,0.776224
3,Support Vector Machine,0.818182


# Predict  new data

In [16]:
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

def stopwords(tweet) :
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    tweet = tweet.translate(str.maketrans('','',string.punctuation)).lower()
    return stopword.remove(tweet)

def stem(tweet) :
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    hasil = stemmer.stem(tweet)
    return hasil

def is_hate_speech(list_tweets) :
    new_tweet_clean = []
    for tweet in list_tweets :
        tweet = tweet.lower()
        tweet = re.sub(r"(?:\@|https?\://)\S+", "", tweet)
        tweet = re.sub(r"http\S+", "", tweet)
        tweet = re.sub('\n', '', tweet)
        tweet = re.sub('rt', '', tweet)
        tweet = re.sub("[^a-zA-Z^']", " ", tweet)
        tweet = re.sub(" {2,}", " ", tweet)
        tweet = tweet.strip()
        tweet = stem(tweet)
        tweet = stopwords(tweet)
        new_tweet_clean.append(tweet)
        
    X_new_counts = count_vect.transform(new_tweet_clean)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)

    predicted = svm.predict(X_new_tfidf)
    hasil = []
    for i in range(len(predicted)) :
        if (predicted[i] == 0) :
            hasil.append('Hate Speech')
        else :
            hasil.append('Non Hate Speech')

    return hasil

In [17]:
new_tweet = [
    'RT @spardaxyz: Fadli Zon Minta Mendagri Segera Menonaktifkan Ahok Jadi Gubernur DKI https:\\/\\/t.co\\/KH5vIRwPdO',
    'Kaga masuk diakal kite kalo suara seorang penista agama yg dah dikutuk ame umat islam suara nye bisa tinggi, emang sengaja ditinggiin tuh',
    'Siang ini saya buka jasa baca telapak tangan buat 20 slot saya yaa, biaya? Tetep seikhlasnyaa saja. Yang dibaca adalah karakteristik yang disadari atau tidak, keuangan, karir dan love life. Silakan dm buat book slotnyaa. Terima kasih 🙏',
    'Tuhan sudah menutup aibmu sedemikian rupa. Tapi kau malah membuka twitter dan memamerkannya. ;)',
    'Sekarang ada jeda istirahat dulu woi bangsat babi'
]

is_hate_speech(new_tweet)

['Non Hate Speech',
 'Hate Speech',
 'Non Hate Speech',
 'Non Hate Speech',
 'Hate Speech']