In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import csv
import re
import string

In [21]:
with open('IDHSD_RIO_unbalanced_713_2017.txt') as f:
    reader = csv.reader(f, delimiter='\t')
    data = [(col1, col4)
                for col1, col4 in reader]


In [22]:
label = []
tweet = []
for i in range (1,len(data)) :
    label.append(data[i][0])
    tweet.append(data[i][1])
    
df = pd.DataFrame(list(zip(tweet, label)), 
               columns =['Tweet', 'Label']) 
df.head()

Unnamed: 0,Tweet,Label
0,RT @spardaxyz: Fadli Zon Minta Mendagri Segera...,Non_HS
1,RT @baguscondromowo: Mereka terus melukai aksi...,Non_HS
2,Sylvi: bagaimana gurbernur melakukan kekerasan...,Non_HS
3,"Ahmad Dhani Tak Puas Debat Pilkada, Masalah Ja...",Non_HS
4,RT @lisdaulay28: Waspada KTP palsu.....kawal P...,Non_HS


In [23]:
df['Label'].value_counts() 

Non_HS    453
HS        260
Name: Label, dtype: int64

# Data Cleaning

In [24]:
tweets = []
for tweet in df['Tweet'] :
    tweet = tweet.lower()
    tweet = re.sub(r"(?:\@|https?\://)\S+", "", tweet)
    tweet = re.sub(r"http\S+", "", tweet)
    tweet = re.sub('\n', '', tweet)
    tweet = re.sub('rt', '', tweet)
    tweet = re.sub("[^a-zA-Z^']", " ", tweet)
    tweet = re.sub(" {2,}", " ", tweet)
    tweet = tweet.strip()
    tweets.append(tweet)
df['clean_tweet'] = tweets

In [25]:
df.head()

Unnamed: 0,Tweet,Label,clean_tweet
0,RT @spardaxyz: Fadli Zon Minta Mendagri Segera...,Non_HS,fadli zon minta mendagri segera menonaktifkan ...
1,RT @baguscondromowo: Mereka terus melukai aksi...,Non_HS,mereka terus melukai aksi dalam rangka memenja...
2,Sylvi: bagaimana gurbernur melakukan kekerasan...,Non_HS,sylvi bagaimana gurbernur melakukan kekerasan ...
3,"Ahmad Dhani Tak Puas Debat Pilkada, Masalah Ja...",Non_HS,ahmad dhani tak puas debat pilkada masalah jal...
4,RT @lisdaulay28: Waspada KTP palsu.....kawal P...,Non_HS,waspada ktp palsu kawal pilkada


# Stemming

In [26]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem(tweet) :
    hasil = stemmer.stem(tweet)
    return hasil

df['clean_tweet'] = df.apply(lambda row : stem(row['clean_tweet']), axis = 1)

# Stopwords Removal

In [27]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# from nltk.tokenize import word_tokenize

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

def stopwords(tweet) :
    tweet = tweet.translate(str.maketrans('','',string.punctuation)).lower()
    return stopword.remove(tweet)

df['clean_tweet'] = df.apply(lambda row : stem(row['clean_tweet']), axis = 1)

In [28]:
df.head()

Unnamed: 0,Tweet,Label,clean_tweet
0,RT @spardaxyz: Fadli Zon Minta Mendagri Segera...,Non_HS,fadli zon minta mendagri segera nonaktif ahok ...
1,RT @baguscondromowo: Mereka terus melukai aksi...,Non_HS,mereka terus luka aksi dalam rangka penjara ah...
2,Sylvi: bagaimana gurbernur melakukan kekerasan...,Non_HS,sylvi bagaimana gurbernur laku keras perempuan...
3,"Ahmad Dhani Tak Puas Debat Pilkada, Masalah Ja...",Non_HS,ahmad dhani tak puas debat pilkada masalah jal...
4,RT @lisdaulay28: Waspada KTP palsu.....kawal P...,Non_HS,waspada ktp palsu kawal pilkada


In [31]:
df.to_csv('data_tweet_clean.csv', index=False) 